lib/Target/AArch64/AArch64ISelLowering.cpp

   1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation  ----===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file implements the AArch64TargetLowering class.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "AArch64ISelLowering.h"
  15 #include "AArch64CallingConvention.h"
  16 #include "AArch64MachineFunctionInfo.h"
  17 #include "AArch64PerfectShuffle.h"
  18 #include "AArch64RegisterInfo.h"
  19 #include "AArch64Subtarget.h"
  20 #include "MCTargetDesc/AArch64AddressingModes.h"
  21 #include "Utils/AArch64BaseInfo.h"
  22 #include "llvm/ADT/APFloat.h"
  23 #include "llvm/ADT/APInt.h"
  24 #include "llvm/ADT/ArrayRef.h"
  25 #include "llvm/ADT/STLExtras.h"
  26 #include "llvm/ADT/SmallVector.h"
  27 #include "llvm/ADT/Statistic.h"
  28 #include "llvm/ADT/StringRef.h"
  29 #include "llvm/ADT/StringSwitch.h"
  30 #include "llvm/ADT/Triple.h"
  31 #include "llvm/ADT/Twine.h"
  32 #include "llvm/Analysis/VectorUtils.h"
  33 #include "llvm/CodeGen/CallingConvLower.h"
  34 #include "llvm/CodeGen/MachineBasicBlock.h"
  35 #include "llvm/CodeGen/MachineFrameInfo.h"
  36 #include "llvm/CodeGen/MachineFunction.h"
  37 #include "llvm/CodeGen/MachineInstr.h"
  38 #include "llvm/CodeGen/MachineInstrBuilder.h"
  39 #include "llvm/CodeGen/MachineMemOperand.h"
  40 #include "llvm/CodeGen/MachineRegisterInfo.h"
  41 #include "llvm/CodeGen/MachineValueType.h"
  42 #include "llvm/CodeGen/RuntimeLibcalls.h"
  43 #include "llvm/CodeGen/SelectionDAG.h"
  44 #include "llvm/CodeGen/SelectionDAGNodes.h"
  45 #include "llvm/CodeGen/TargetCallingConv.h"
  46 #include "llvm/CodeGen/TargetInstrInfo.h"
  47 #include "llvm/CodeGen/ValueTypes.h"
  48 #include "llvm/IR/Attributes.h"
  49 #include "llvm/IR/Constants.h"
  50 #include "llvm/IR/DataLayout.h"
  51 #include "llvm/IR/DebugLoc.h"
  52 #include "llvm/IR/DerivedTypes.h"
  53 #include "llvm/IR/Function.h"
  54 #include "llvm/IR/GetElementPtrTypeIterator.h"
  55 #include "llvm/IR/GlobalValue.h"
  56 #include "llvm/IR/IRBuilder.h"
  57 #include "llvm/IR/Instruction.h"
  58 #include "llvm/IR/Instructions.h"
  59 #include "llvm/IR/Intrinsics.h"
  60 #include "llvm/IR/Module.h"
  61 #include "llvm/IR/OperandTraits.h"
  62 #include "llvm/IR/Type.h"
  63 #include "llvm/IR/Use.h"
  64 #include "llvm/IR/Value.h"
  65 #include "llvm/MC/MCRegisterInfo.h"
  66 #include "llvm/Support/Casting.h"
  67 #include "llvm/Support/CodeGen.h"
  68 #include "llvm/Support/CommandLine.h"
  69 #include "llvm/Support/Compiler.h"
  70 #include "llvm/Support/Debug.h"
  71 #include "llvm/Support/ErrorHandling.h"
  72 #include "llvm/Support/KnownBits.h"
  73 #include "llvm/Support/MathExtras.h"
  74 #include "llvm/Support/raw_ostream.h"
  75 #include "llvm/Target/TargetMachine.h"
  76 #include "llvm/Target/TargetOptions.h"
  77 #include <algorithm>
  78 #include <bitset>
  79 #include <cassert>
  80 #include <cctype>
  81 #include <cstdint>
  82 #include <cstdlib>
  83 #include <iterator>
  84 #include <limits>
  85 #include <tuple>
  86 #include <utility>
  87 #include <vector>
  88
  89 using namespace llvm;
  90
  91 #define DEBUG_TYPE "aarch64-lower"
  92
  93 STATISTIC(NumTailCalls, "Number of tail calls");
  94 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
  95 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
  96
  97 static cl::opt<bool>
  98 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
  99                            cl::desc("Allow AArch64 SLI/SRI formation"),
 100                            cl::init(false));
 101
 102 // FIXME: The necessary dtprel relocations don't seem to be supported
 103 // well in the GNU bfd and gold linkers at the moment. Therefore, by
 104 // default, for now, fall back to GeneralDynamic code generation.
 105 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
 106     "aarch64-elf-ldtls-generation", cl::Hidden,
 107     cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
 108     cl::init(false));
 109
 110 static cl::opt<bool>
 111 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
 112                          cl::desc("Enable AArch64 logical imm instruction "
 113                                   "optimization"),
 114                          cl::init(true));
 115
 116 /// Value type used for condition codes.
 117 static const MVT MVT_CC = MVT::i32;
 118
 119 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 120                                              const AArch64Subtarget &STI)
 121     : TargetLowering(TM), Subtarget(&STI) {
 122   // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
 123   // we have to make something up. Arbitrarily, choose ZeroOrOne.
 124   setBooleanContents(ZeroOrOneBooleanContent);
 125   // When comparing vectors the result sets the different elements in the
 126   // vector to all-one or all-zero.
 127   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 128
 129   // Set up the register classes.
 130   addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
 131   addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
 132
 133   if (Subtarget->hasFPARMv8()) {
 134     addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
 135     addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
 136     addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
 137     addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
 138   }
 139
 140   if (Subtarget->hasNEON()) {
 141     addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
 142     addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
 143     // Someone set us up the NEON.
 144     addDRTypeForNEON(MVT::v2f32);
 145     addDRTypeForNEON(MVT::v8i8);
 146     addDRTypeForNEON(MVT::v4i16);
 147     addDRTypeForNEON(MVT::v2i32);
 148     addDRTypeForNEON(MVT::v1i64);
 149     addDRTypeForNEON(MVT::v1f64);
 150     addDRTypeForNEON(MVT::v4f16);
 151
 152     addQRTypeForNEON(MVT::v4f32);
 153     addQRTypeForNEON(MVT::v2f64);
 154     addQRTypeForNEON(MVT::v16i8);
 155     addQRTypeForNEON(MVT::v8i16);
 156     addQRTypeForNEON(MVT::v4i32);
 157     addQRTypeForNEON(MVT::v2i64);
 158     addQRTypeForNEON(MVT::v8f16);
 159   }
 160
 161   // Compute derived properties from the register classes
 162   computeRegisterProperties(Subtarget->getRegisterInfo());
 163
 164   // Provide all sorts of operation actions
 165   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
 166   setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
 167   setOperationAction(ISD::SETCC, MVT::i32, Custom);
 168   setOperationAction(ISD::SETCC, MVT::i64, Custom);
 169   setOperationAction(ISD::SETCC, MVT::f16, Custom);
 170   setOperationAction(ISD::SETCC, MVT::f32, Custom);
 171   setOperationAction(ISD::SETCC, MVT::f64, Custom);
 172   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
 173   setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
 174   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
 175   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
 176   setOperationAction(ISD::BR_CC, MVT::i64, Custom);
 177   setOperationAction(ISD::BR_CC, MVT::f16, Custom);
 178   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
 179   setOperationAction(ISD::BR_CC, MVT::f64, Custom);
 180   setOperationAction(ISD::SELECT, MVT::i32, Custom);
 181   setOperationAction(ISD::SELECT, MVT::i64, Custom);
 182   setOperationAction(ISD::SELECT, MVT::f16, Custom);
 183   setOperationAction(ISD::SELECT, MVT::f32, Custom);
 184   setOperationAction(ISD::SELECT, MVT::f64, Custom);
 185   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
 186   setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
 187   setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
 188   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
 189   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
 190   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
 191   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
 192
 193   setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
 194   setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
 195   setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
 196
 197   setOperationAction(ISD::FREM, MVT::f32, Expand);
 198   setOperationAction(ISD::FREM, MVT::f64, Expand);
 199   setOperationAction(ISD::FREM, MVT::f80, Expand);
 200
 201   // Custom lowering hooks are needed for XOR
 202   // to fold it into CSINC/CSINV.
 203   setOperationAction(ISD::XOR, MVT::i32, Custom);
 204   setOperationAction(ISD::XOR, MVT::i64, Custom);
 205
 206   // Virtually no operation on f128 is legal, but LLVM can't expand them when
 207   // there's a valid register class, so we need custom operations in most cases.
 208   setOperationAction(ISD::FABS, MVT::f128, Expand);
 209   setOperationAction(ISD::FADD, MVT::f128, Custom);
 210   setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
 211   setOperationAction(ISD::FCOS, MVT::f128, Expand);
 212   setOperationAction(ISD::FDIV, MVT::f128, Custom);
 213   setOperationAction(ISD::FMA, MVT::f128, Expand);
 214   setOperationAction(ISD::FMUL, MVT::f128, Custom);
 215   setOperationAction(ISD::FNEG, MVT::f128, Expand);
 216   setOperationAction(ISD::FPOW, MVT::f128, Expand);
 217   setOperationAction(ISD::FREM, MVT::f128, Expand);
 218   setOperationAction(ISD::FRINT, MVT::f128, Expand);
 219   setOperationAction(ISD::FSIN, MVT::f128, Expand);
 220   setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
 221   setOperationAction(ISD::FSQRT, MVT::f128, Expand);
 222   setOperationAction(ISD::FSUB, MVT::f128, Custom);
 223   setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
 224   setOperationAction(ISD::SETCC, MVT::f128, Custom);
 225   setOperationAction(ISD::BR_CC, MVT::f128, Custom);
 226   setOperationAction(ISD::SELECT, MVT::f128, Custom);
 227   setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
 228   setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
 229
 230   // Lowering for many of the conversions is actually specified by the non-f128
 231   // type. The LowerXXX function will be trivial when f128 isn't involved.
 232   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
 233   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
 234   setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
 235   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
 236   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
 237   setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
 238   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 239   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 240   setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
 241   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
 242   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
 243   setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
 244   setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
 245   setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
 246
 247   // Variable arguments.
 248   setOperationAction(ISD::VASTART, MVT::Other, Custom);
 249   setOperationAction(ISD::VAARG, MVT::Other, Custom);
 250   setOperationAction(ISD::VACOPY, MVT::Other, Custom);
 251   setOperationAction(ISD::VAEND, MVT::Other, Expand);
 252
 253   // Variable-sized objects.
 254   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
 255   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
 256
 257   if (Subtarget->isTargetWindows())
 258     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
 259   else
 260     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
 261
 262   // Constant pool entries
 263   setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
 264
 265   // BlockAddress
 266   setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
 267
 268   // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
 269   setOperationAction(ISD::ADDC, MVT::i32, Custom);
 270   setOperationAction(ISD::ADDE, MVT::i32, Custom);
 271   setOperationAction(ISD::SUBC, MVT::i32, Custom);
 272   setOperationAction(ISD::SUBE, MVT::i32, Custom);
 273   setOperationAction(ISD::ADDC, MVT::i64, Custom);
 274   setOperationAction(ISD::ADDE, MVT::i64, Custom);
 275   setOperationAction(ISD::SUBC, MVT::i64, Custom);
 276   setOperationAction(ISD::SUBE, MVT::i64, Custom);
 277
 278   // AArch64 lacks both left-rotate and popcount instructions.
 279   setOperationAction(ISD::ROTL, MVT::i32, Expand);
 280   setOperationAction(ISD::ROTL, MVT::i64, Expand);
 281   for (MVT VT : MVT::vector_valuetypes()) {
 282     setOperationAction(ISD::ROTL, VT, Expand);
 283     setOperationAction(ISD::ROTR, VT, Expand);
 284   }
 285
 286   // AArch64 doesn't have {U|S}MUL_LOHI.
 287   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
 288   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
 289
 290   setOperationAction(ISD::CTPOP, MVT::i32, Custom);
 291   setOperationAction(ISD::CTPOP, MVT::i64, Custom);
 292
 293   setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
 294   setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
 295   for (MVT VT : MVT::vector_valuetypes()) {
 296     setOperationAction(ISD::SDIVREM, VT, Expand);
 297     setOperationAction(ISD::UDIVREM, VT, Expand);
 298   }
 299   setOperationAction(ISD::SREM, MVT::i32, Expand);
 300   setOperationAction(ISD::SREM, MVT::i64, Expand);
 301   setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
 302   setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
 303   setOperationAction(ISD::UREM, MVT::i32, Expand);
 304   setOperationAction(ISD::UREM, MVT::i64, Expand);
 305
 306   // Custom lower Add/Sub/Mul with overflow.
 307   setOperationAction(ISD::SADDO, MVT::i32, Custom);
 308   setOperationAction(ISD::SADDO, MVT::i64, Custom);
 309   setOperationAction(ISD::UADDO, MVT::i32, Custom);
 310   setOperationAction(ISD::UADDO, MVT::i64, Custom);
 311   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
 312   setOperationAction(ISD::SSUBO, MVT::i64, Custom);
 313   setOperationAction(ISD::USUBO, MVT::i32, Custom);
 314   setOperationAction(ISD::USUBO, MVT::i64, Custom);
 315   setOperationAction(ISD::SMULO, MVT::i32, Custom);
 316   setOperationAction(ISD::SMULO, MVT::i64, Custom);
 317   setOperationAction(ISD::UMULO, MVT::i32, Custom);
 318   setOperationAction(ISD::UMULO, MVT::i64, Custom);
 319
 320   setOperationAction(ISD::FSIN, MVT::f32, Expand);
 321   setOperationAction(ISD::FSIN, MVT::f64, Expand);
 322   setOperationAction(ISD::FCOS, MVT::f32, Expand);
 323   setOperationAction(ISD::FCOS, MVT::f64, Expand);
 324   setOperationAction(ISD::FPOW, MVT::f32, Expand);
 325   setOperationAction(ISD::FPOW, MVT::f64, Expand);
 326   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
 327   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 328   if (Subtarget->hasFullFP16())
 329     setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
 330   else
 331     setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
 332
 333   setOperationAction(ISD::FREM,    MVT::f16,   Promote);
 334   setOperationAction(ISD::FREM,    MVT::v4f16, Promote);
 335   setOperationAction(ISD::FREM,    MVT::v8f16, Promote);
 336   setOperationAction(ISD::FPOW,    MVT::f16,   Promote);
 337   setOperationAction(ISD::FPOW,    MVT::v4f16, Promote);
 338   setOperationAction(ISD::FPOW,    MVT::v8f16, Promote);
 339   setOperationAction(ISD::FPOWI,   MVT::f16,   Promote);
 340   setOperationAction(ISD::FCOS,    MVT::f16,   Promote);
 341   setOperationAction(ISD::FCOS,    MVT::v4f16, Promote);
 342   setOperationAction(ISD::FCOS,    MVT::v8f16, Promote);
 343   setOperationAction(ISD::FSIN,    MVT::f16,   Promote);
 344   setOperationAction(ISD::FSIN,    MVT::v4f16, Promote);
 345   setOperationAction(ISD::FSIN,    MVT::v8f16, Promote);
 346   setOperationAction(ISD::FSINCOS, MVT::f16,   Promote);
 347   setOperationAction(ISD::FSINCOS, MVT::v4f16, Promote);
 348   setOperationAction(ISD::FSINCOS, MVT::v8f16, Promote);
 349   setOperationAction(ISD::FEXP,    MVT::f16,   Promote);
 350   setOperationAction(ISD::FEXP,    MVT::v4f16, Promote);
 351   setOperationAction(ISD::FEXP,    MVT::v8f16, Promote);
 352   setOperationAction(ISD::FEXP2,   MVT::f16,   Promote);
 353   setOperationAction(ISD::FEXP2,   MVT::v4f16, Promote);
 354   setOperationAction(ISD::FEXP2,   MVT::v8f16, Promote);
 355   setOperationAction(ISD::FLOG,    MVT::f16,   Promote);
 356   setOperationAction(ISD::FLOG,    MVT::v4f16, Promote);
 357   setOperationAction(ISD::FLOG,    MVT::v8f16, Promote);
 358   setOperationAction(ISD::FLOG2,   MVT::f16,   Promote);
 359   setOperationAction(ISD::FLOG2,   MVT::v4f16, Promote);
 360   setOperationAction(ISD::FLOG2,   MVT::v8f16, Promote);
 361   setOperationAction(ISD::FLOG10,  MVT::f16,   Promote);
 362   setOperationAction(ISD::FLOG10,  MVT::v4f16, Promote);
 363   setOperationAction(ISD::FLOG10,  MVT::v8f16, Promote);
 364
 365   if (!Subtarget->hasFullFP16()) {
 366     setOperationAction(ISD::SELECT,      MVT::f16,  Promote);
 367     setOperationAction(ISD::SELECT_CC,   MVT::f16,  Promote);
 368     setOperationAction(ISD::SETCC,       MVT::f16,  Promote);
 369     setOperationAction(ISD::BR_CC,       MVT::f16,  Promote);
 370     setOperationAction(ISD::FADD,        MVT::f16,  Promote);
 371     setOperationAction(ISD::FSUB,        MVT::f16,  Promote);
 372     setOperationAction(ISD::FMUL,        MVT::f16,  Promote);
 373     setOperationAction(ISD::FDIV,        MVT::f16,  Promote);
 374     setOperationAction(ISD::FMA,         MVT::f16,  Promote);
 375     setOperationAction(ISD::FNEG,        MVT::f16,  Promote);
 376     setOperationAction(ISD::FABS,        MVT::f16,  Promote);
 377     setOperationAction(ISD::FCEIL,       MVT::f16,  Promote);
 378     setOperationAction(ISD::FSQRT,       MVT::f16,  Promote);
 379     setOperationAction(ISD::FFLOOR,      MVT::f16,  Promote);
 380     setOperationAction(ISD::FNEARBYINT,  MVT::f16,  Promote);
 381     setOperationAction(ISD::FRINT,       MVT::f16,  Promote);
 382     setOperationAction(ISD::FROUND,      MVT::f16,  Promote);
 383     setOperationAction(ISD::FTRUNC,      MVT::f16,  Promote);
 384     setOperationAction(ISD::FMINNUM,     MVT::f16,  Promote);
 385     setOperationAction(ISD::FMAXNUM,     MVT::f16,  Promote);
 386     setOperationAction(ISD::FMINNAN,     MVT::f16,  Promote);
 387     setOperationAction(ISD::FMAXNAN,     MVT::f16,  Promote);
 388
 389     // promote v4f16 to v4f32 when that is known to be safe.
 390     setOperationAction(ISD::FADD,        MVT::v4f16, Promote);
 391     setOperationAction(ISD::FSUB,        MVT::v4f16, Promote);
 392     setOperationAction(ISD::FMUL,        MVT::v4f16, Promote);
 393     setOperationAction(ISD::FDIV,        MVT::v4f16, Promote);
 394     setOperationAction(ISD::FP_EXTEND,   MVT::v4f16, Promote);
 395     setOperationAction(ISD::FP_ROUND,    MVT::v4f16, Promote);
 396     AddPromotedToType(ISD::FADD,         MVT::v4f16, MVT::v4f32);
 397     AddPromotedToType(ISD::FSUB,         MVT::v4f16, MVT::v4f32);
 398     AddPromotedToType(ISD::FMUL,         MVT::v4f16, MVT::v4f32);
 399     AddPromotedToType(ISD::FDIV,         MVT::v4f16, MVT::v4f32);
 400     AddPromotedToType(ISD::FP_EXTEND,    MVT::v4f16, MVT::v4f32);
 401     AddPromotedToType(ISD::FP_ROUND,     MVT::v4f16, MVT::v4f32);
 402
 403     setOperationAction(ISD::FABS,        MVT::v4f16, Expand);
 404     setOperationAction(ISD::FNEG,        MVT::v4f16, Expand);
 405     setOperationAction(ISD::FROUND,      MVT::v4f16, Expand);
 406     setOperationAction(ISD::FMA,         MVT::v4f16, Expand);
 407     setOperationAction(ISD::SETCC,       MVT::v4f16, Expand);
 408     setOperationAction(ISD::BR_CC,       MVT::v4f16, Expand);
 409     setOperationAction(ISD::SELECT,      MVT::v4f16, Expand);
 410     setOperationAction(ISD::SELECT_CC,   MVT::v4f16, Expand);
 411     setOperationAction(ISD::FTRUNC,      MVT::v4f16, Expand);
 412     setOperationAction(ISD::FCOPYSIGN,   MVT::v4f16, Expand);
 413     setOperationAction(ISD::FFLOOR,      MVT::v4f16, Expand);
 414     setOperationAction(ISD::FCEIL,       MVT::v4f16, Expand);
 415     setOperationAction(ISD::FRINT,       MVT::v4f16, Expand);
 416     setOperationAction(ISD::FNEARBYINT,  MVT::v4f16, Expand);
 417     setOperationAction(ISD::FSQRT,       MVT::v4f16, Expand);
 418
 419     setOperationAction(ISD::FABS,        MVT::v8f16, Expand);
 420     setOperationAction(ISD::FADD,        MVT::v8f16, Expand);
 421     setOperationAction(ISD::FCEIL,       MVT::v8f16, Expand);
 422     setOperationAction(ISD::FCOPYSIGN,   MVT::v8f16, Expand);
 423     setOperationAction(ISD::FDIV,        MVT::v8f16, Expand);
 424     setOperationAction(ISD::FFLOOR,      MVT::v8f16, Expand);
 425     setOperationAction(ISD::FMA,         MVT::v8f16, Expand);
 426     setOperationAction(ISD::FMUL,        MVT::v8f16, Expand);
 427     setOperationAction(ISD::FNEARBYINT,  MVT::v8f16, Expand);
 428     setOperationAction(ISD::FNEG,        MVT::v8f16, Expand);
 429     setOperationAction(ISD::FROUND,      MVT::v8f16, Expand);
 430     setOperationAction(ISD::FRINT,       MVT::v8f16, Expand);
 431     setOperationAction(ISD::FSQRT,       MVT::v8f16, Expand);
 432     setOperationAction(ISD::FSUB,        MVT::v8f16, Expand);
 433     setOperationAction(ISD::FTRUNC,      MVT::v8f16, Expand);
 434     setOperationAction(ISD::SETCC,       MVT::v8f16, Expand);
 435     setOperationAction(ISD::BR_CC,       MVT::v8f16, Expand);
 436     setOperationAction(ISD::SELECT,      MVT::v8f16, Expand);
 437     setOperationAction(ISD::SELECT_CC,   MVT::v8f16, Expand);
 438     setOperationAction(ISD::FP_EXTEND,   MVT::v8f16, Expand);
 439   }
 440
 441   // AArch64 has implementations of a lot of rounding-like FP operations.
 442   for (MVT Ty : {MVT::f32, MVT::f64}) {
 443     setOperationAction(ISD::FFLOOR, Ty, Legal);
 444     setOperationAction(ISD::FNEARBYINT, Ty, Legal);
 445     setOperationAction(ISD::FCEIL, Ty, Legal);
 446     setOperationAction(ISD::FRINT, Ty, Legal);
 447     setOperationAction(ISD::FTRUNC, Ty, Legal);
 448     setOperationAction(ISD::FROUND, Ty, Legal);
 449     setOperationAction(ISD::FMINNUM, Ty, Legal);
 450     setOperationAction(ISD::FMAXNUM, Ty, Legal);
 451     setOperationAction(ISD::FMINNAN, Ty, Legal);
 452     setOperationAction(ISD::FMAXNAN, Ty, Legal);
 453   }
 454
 455   if (Subtarget->hasFullFP16()) {
 456     setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
 457     setOperationAction(ISD::FFLOOR,  MVT::f16, Legal);
 458     setOperationAction(ISD::FCEIL,   MVT::f16, Legal);
 459     setOperationAction(ISD::FRINT,   MVT::f16, Legal);
 460     setOperationAction(ISD::FTRUNC,  MVT::f16, Legal);
 461     setOperationAction(ISD::FROUND,  MVT::f16, Legal);
 462     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
 463     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
 464     setOperationAction(ISD::FMINNAN, MVT::f16, Legal);
 465     setOperationAction(ISD::FMAXNAN, MVT::f16, Legal);
 466   }
 467
 468   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 469
 470   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
 471   setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
 472   setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
 473   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
 474   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
 475
 476   // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
 477   // This requires the Performance Monitors extension.
 478   if (Subtarget->hasPerfMon())
 479     setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
 480
 481   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
 482       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
 483     // Issue __sincos_stret if available.
 484     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
 485     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
 486   } else {
 487     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 488     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 489   }
 490
 491   // Make floating-point constants legal for the large code model, so they don't
 492   // become loads from the constant pool.
 493   if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
 494     setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
 495     setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
 496   }
 497
 498   // AArch64 does not have floating-point extending loads, i1 sign-extending
 499   // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
 500   for (MVT VT : MVT::fp_valuetypes()) {
 501     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
 502     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
 503     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
 504     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
 505   }
 506   for (MVT VT : MVT::integer_valuetypes())
 507     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
 508
 509   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 510   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 511   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 512   setTruncStoreAction(MVT::f128, MVT::f80, Expand);
 513   setTruncStoreAction(MVT::f128, MVT::f64, Expand);
 514   setTruncStoreAction(MVT::f128, MVT::f32, Expand);
 515   setTruncStoreAction(MVT::f128, MVT::f16, Expand);
 516
 517   setOperationAction(ISD::BITCAST, MVT::i16, Custom);
 518   setOperationAction(ISD::BITCAST, MVT::f16, Custom);
 519
 520   // Indexed loads and stores are supported.
 521   for (unsigned im = (unsigned)ISD::PRE_INC;
 522        im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
 523     setIndexedLoadAction(im, MVT::i8, Legal);
 524     setIndexedLoadAction(im, MVT::i16, Legal);
 525     setIndexedLoadAction(im, MVT::i32, Legal);
 526     setIndexedLoadAction(im, MVT::i64, Legal);
 527     setIndexedLoadAction(im, MVT::f64, Legal);
 528     setIndexedLoadAction(im, MVT::f32, Legal);
 529     setIndexedLoadAction(im, MVT::f16, Legal);
 530     setIndexedStoreAction(im, MVT::i8, Legal);
 531     setIndexedStoreAction(im, MVT::i16, Legal);
 532     setIndexedStoreAction(im, MVT::i32, Legal);
 533     setIndexedStoreAction(im, MVT::i64, Legal);
 534     setIndexedStoreAction(im, MVT::f64, Legal);
 535     setIndexedStoreAction(im, MVT::f32, Legal);
 536     setIndexedStoreAction(im, MVT::f16, Legal);
 537   }
 538
 539   // Trap.
 540   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 541
 542   // We combine OR nodes for bitfield operations.
 543   setTargetDAGCombine(ISD::OR);
 544
 545   // Vector add and sub nodes may conceal a high-half opportunity.
 546   // Also, try to fold ADD into CSINC/CSINV..
 547   setTargetDAGCombine(ISD::ADD);
 548   setTargetDAGCombine(ISD::SUB);
 549   setTargetDAGCombine(ISD::SRL);
 550   setTargetDAGCombine(ISD::XOR);
 551   setTargetDAGCombine(ISD::SINT_TO_FP);
 552   setTargetDAGCombine(ISD::UINT_TO_FP);
 553
 554   setTargetDAGCombine(ISD::FP_TO_SINT);
 555   setTargetDAGCombine(ISD::FP_TO_UINT);
 556   setTargetDAGCombine(ISD::FDIV);
 557
 558   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
 559
 560   setTargetDAGCombine(ISD::ANY_EXTEND);
 561   setTargetDAGCombine(ISD::ZERO_EXTEND);
 562   setTargetDAGCombine(ISD::SIGN_EXTEND);
 563   setTargetDAGCombine(ISD::BITCAST);
 564   setTargetDAGCombine(ISD::CONCAT_VECTORS);
 565   setTargetDAGCombine(ISD::STORE);
 566   if (Subtarget->supportsAddressTopByteIgnored())
 567     setTargetDAGCombine(ISD::LOAD);
 568
 569   setTargetDAGCombine(ISD::MUL);
 570
 571   setTargetDAGCombine(ISD::SELECT);
 572   setTargetDAGCombine(ISD::VSELECT);
 573
 574   setTargetDAGCombine(ISD::INTRINSIC_VOID);
 575   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
 576   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 577
 578   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
 579   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
 580   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
 581
 582   setStackPointerRegisterToSaveRestore(AArch64::SP);
 583
 584   setSchedulingPreference(Sched::Hybrid);
 585
 586   EnableExtLdPromotion = true;
 587
 588   // Set required alignment.
 589   setMinFunctionAlignment(2);
 590   // Set preferred alignments.
 591   setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
 592   setPrefLoopAlignment(STI.getPrefLoopAlignment());
 593
 594   // Only change the limit for entries in a jump table if specified by
 595   // the subtarget, but not at the command line.
 596   unsigned MaxJT = STI.getMaximumJumpTableSize();
 597   if (MaxJT && getMaximumJumpTableSize() == 0)
 598     setMaximumJumpTableSize(MaxJT);
 599
 600   setHasExtractBitsInsn(true);
 601
 602   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
 603
 604   if (Subtarget->hasNEON()) {
 605     // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
 606     // silliness like this:
 607     setOperationAction(ISD::FABS, MVT::v1f64, Expand);
 608     setOperationAction(ISD::FADD, MVT::v1f64, Expand);
 609     setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
 610     setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
 611     setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
 612     setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
 613     setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
 614     setOperationAction(ISD::FMA, MVT::v1f64, Expand);
 615     setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
 616     setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
 617     setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
 618     setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
 619     setOperationAction(ISD::FREM, MVT::v1f64, Expand);
 620     setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
 621     setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
 622     setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
 623     setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
 624     setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
 625     setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
 626     setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
 627     setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
 628     setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
 629     setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
 630     setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
 631     setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
 632
 633     setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
 634     setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
 635     setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
 636     setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
 637     setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
 638
 639     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
 640
 641     // AArch64 doesn't have a direct vector ->f32 conversion instructions for
 642     // elements smaller than i32, so promote the input to i32 first.
 643     setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
 644     setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
 645     setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
 646     setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
 647     // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
 648     // -> v8f16 conversions.
 649     setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
 650     setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
 651     setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
 652     setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
 653     // Similarly, there is no direct i32 -> f64 vector conversion instruction.
 654     setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
 655     setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
 656     setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
 657     setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
 658     // Or, direct i32 -> f16 vector conversion.  Set it so custom, so the
 659     // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
 660     setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
 661     setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
 662
 663     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
 664     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
 665
 666     setOperationAction(ISD::CTTZ,       MVT::v2i8,  Expand);
 667     setOperationAction(ISD::CTTZ,       MVT::v4i16, Expand);
 668     setOperationAction(ISD::CTTZ,       MVT::v2i32, Expand);
 669     setOperationAction(ISD::CTTZ,       MVT::v1i64, Expand);
 670     setOperationAction(ISD::CTTZ,       MVT::v16i8, Expand);
 671     setOperationAction(ISD::CTTZ,       MVT::v8i16, Expand);
 672     setOperationAction(ISD::CTTZ,       MVT::v4i32, Expand);
 673     setOperationAction(ISD::CTTZ,       MVT::v2i64, Expand);
 674
 675     // AArch64 doesn't have MUL.2d:
 676     setOperationAction(ISD::MUL, MVT::v2i64, Expand);
 677     // Custom handling for some quad-vector types to detect MULL.
 678     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
 679     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
 680     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
 681
 682     // Vector reductions
 683     for (MVT VT : MVT::integer_valuetypes()) {
 684       setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
 685       setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
 686       setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
 687       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
 688       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
 689     }
 690     for (MVT VT : MVT::fp_valuetypes()) {
 691       setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
 692       setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
 693     }
 694
 695     setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
 696     setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
 697     // Likewise, narrowing and extending vector loads/stores aren't handled
 698     // directly.
 699     for (MVT VT : MVT::vector_valuetypes()) {
 700       setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
 701
 702       setOperationAction(ISD::MULHS, VT, Expand);
 703       setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 704       setOperationAction(ISD::MULHU, VT, Expand);
 705       setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 706
 707       setOperationAction(ISD::BSWAP, VT, Expand);
 708
 709       for (MVT InnerVT : MVT::vector_valuetypes()) {
 710         setTruncStoreAction(VT, InnerVT, Expand);
 711         setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
 712         setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
 713         setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
 714       }
 715     }
 716
 717     // AArch64 has implementations of a lot of rounding-like FP operations.
 718     for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
 719       setOperationAction(ISD::FFLOOR, Ty, Legal);
 720       setOperationAction(ISD::FNEARBYINT, Ty, Legal);
 721       setOperationAction(ISD::FCEIL, Ty, Legal);
 722       setOperationAction(ISD::FRINT, Ty, Legal);
 723       setOperationAction(ISD::FTRUNC, Ty, Legal);
 724       setOperationAction(ISD::FROUND, Ty, Legal);
 725     }
 726   }
 727
 728   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
 729 }
 730
 731 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
 732   assert(VT.isVector() && "VT should be a vector type");
 733
 734   if (VT.isFloatingPoint()) {
 735     MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
 736     setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
 737     setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
 738   }
 739
 740   // Mark vector float intrinsics as expand.
 741   if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
 742     setOperationAction(ISD::FSIN, VT, Expand);
 743     setOperationAction(ISD::FCOS, VT, Expand);
 744     setOperationAction(ISD::FPOW, VT, Expand);
 745     setOperationAction(ISD::FLOG, VT, Expand);
 746     setOperationAction(ISD::FLOG2, VT, Expand);
 747     setOperationAction(ISD::FLOG10, VT, Expand);
 748     setOperationAction(ISD::FEXP, VT, Expand);
 749     setOperationAction(ISD::FEXP2, VT, Expand);
 750
 751     // But we do support custom-lowering for FCOPYSIGN.
 752     setOperationAction(ISD::FCOPYSIGN, VT, Custom);
 753   }
 754
 755   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 756   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 757   setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
 758   setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
 759   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
 760   setOperationAction(ISD::SRA, VT, Custom);
 761   setOperationAction(ISD::SRL, VT, Custom);
 762   setOperationAction(ISD::SHL, VT, Custom);
 763   setOperationAction(ISD::AND, VT, Custom);
 764   setOperationAction(ISD::OR, VT, Custom);
 765   setOperationAction(ISD::SETCC, VT, Custom);
 766   setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
 767
 768   setOperationAction(ISD::SELECT, VT, Expand);
 769   setOperationAction(ISD::SELECT_CC, VT, Expand);
 770   setOperationAction(ISD::VSELECT, VT, Expand);
 771   for (MVT InnerVT : MVT::all_valuetypes())
 772     setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 773
 774   // CNT supports only B element sizes.
 775   if (VT != MVT::v8i8 && VT != MVT::v16i8)
 776     setOperationAction(ISD::CTPOP, VT, Expand);
 777
 778   setOperationAction(ISD::UDIV, VT, Expand);
 779   setOperationAction(ISD::SDIV, VT, Expand);
 780   setOperationAction(ISD::UREM, VT, Expand);
 781   setOperationAction(ISD::SREM, VT, Expand);
 782   setOperationAction(ISD::FREM, VT, Expand);
 783
 784   setOperationAction(ISD::FP_TO_SINT, VT, Custom);
 785   setOperationAction(ISD::FP_TO_UINT, VT, Custom);
 786
 787   if (!VT.isFloatingPoint())
 788     setOperationAction(ISD::ABS, VT, Legal);
 789
 790   // [SU][MIN|MAX] are available for all NEON types apart from i64.
 791   if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
 792     for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
 793       setOperationAction(Opcode, VT, Legal);
 794
 795   // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
 796   if (VT.isFloatingPoint() &&
 797       (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
 798     for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
 799                             ISD::FMINNUM, ISD::FMAXNUM})
 800       setOperationAction(Opcode, VT, Legal);
 801
 802   if (Subtarget->isLittleEndian()) {
 803     for (unsigned im = (unsigned)ISD::PRE_INC;
 804          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
 805       setIndexedLoadAction(im, VT, Legal);
 806       setIndexedStoreAction(im, VT, Legal);
 807     }
 808   }
 809 }
 810
 811 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
 812   addRegisterClass(VT, &AArch64::FPR64RegClass);
 813   addTypeForNEON(VT, MVT::v2i32);
 814 }
 815
 816 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
 817   addRegisterClass(VT, &AArch64::FPR128RegClass);
 818   addTypeForNEON(VT, MVT::v4i32);
 819 }
 820
 821 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
 822                                               EVT VT) const {
 823   if (!VT.isVector())
 824     return MVT::i32;
 825   return VT.changeVectorElementTypeToInteger();
 826 }
 827
 828 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
 829                                const APInt &Demanded,
 830                                TargetLowering::TargetLoweringOpt &TLO,
 831                                unsigned NewOpc) {
 832   uint64_t OldImm = Imm, NewImm, Enc;
 833   uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
 834
 835   // Return if the immediate is already all zeros, all ones, a bimm32 or a
 836   // bimm64.
 837   if (Imm == 0 || Imm == Mask ||
 838       AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
 839     return false;
 840
 841   unsigned EltSize = Size;
 842   uint64_t DemandedBits = Demanded.getZExtValue();
 843
 844   // Clear bits that are not demanded.
 845   Imm &= DemandedBits;
 846
 847   while (true) {
 848     // The goal here is to set the non-demanded bits in a way that minimizes
 849     // the number of switching between 0 and 1. In order to achieve this goal,
 850     // we set the non-demanded bits to the value of the preceding demanded bits.
 851     // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
 852     // non-demanded bit), we copy bit0 (1) to the least significant 'x',
 853     // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
 854     // The final result is 0b11000011.
 855     uint64_t NonDemandedBits = ~DemandedBits;
 856     uint64_t InvertedImm = ~Imm & DemandedBits;
 857     uint64_t RotatedImm =
 858         ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
 859         NonDemandedBits;
 860     uint64_t Sum = RotatedImm + NonDemandedBits;
 861     bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
 862     uint64_t Ones = (Sum + Carry) & NonDemandedBits;
 863     NewImm = (Imm | Ones) & Mask;
 864
 865     // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
 866     // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
 867     // we halve the element size and continue the search.
 868     if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
 869       break;
 870
 871     // We cannot shrink the element size any further if it is 2-bits.
 872     if (EltSize == 2)
 873       return false;
 874
 875     EltSize /= 2;
 876     Mask >>= EltSize;
 877     uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
 878
 879     // Return if there is mismatch in any of the demanded bits of Imm and Hi.
 880     if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
 881       return false;
 882
 883     // Merge the upper and lower halves of Imm and DemandedBits.
 884     Imm |= Hi;
 885     DemandedBits |= DemandedBitsHi;
 886   }
 887
 888   ++NumOptimizedImms;
 889
 890   // Replicate the element across the register width.
 891   while (EltSize < Size) {
 892     NewImm |= NewImm << EltSize;
 893     EltSize *= 2;
 894   }
 895
 896   (void)OldImm;
 897   assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
 898          "demanded bits should never be altered");
 899   assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
 900
 901   // Create the new constant immediate node.
 902   EVT VT = Op.getValueType();
 903   SDLoc DL(Op);
 904   SDValue New;
 905
 906   // If the new constant immediate is all-zeros or all-ones, let the target
 907   // independent DAG combine optimize this node.
 908   if (NewImm == 0 || NewImm == OrigMask) {
 909     New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
 910                           TLO.DAG.getConstant(NewImm, DL, VT));
 911   // Otherwise, create a machine node so that target independent DAG combine
 912   // doesn't undo this optimization.
 913   } else {
 914     Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
 915     SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
 916     New = SDValue(
 917         TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
 918   }
 919
 920   return TLO.CombineTo(Op, New);
 921 }
 922
 923 bool AArch64TargetLowering::targetShrinkDemandedConstant(
 924     SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
 925   // Delay this optimization to as late as possible.
 926   if (!TLO.LegalOps)
 927     return false;
 928
 929   if (!EnableOptimizeLogicalImm)
 930     return false;
 931
 932   EVT VT = Op.getValueType();
 933   if (VT.isVector())
 934     return false;
 935
 936   unsigned Size = VT.getSizeInBits();
 937   assert((Size == 32 || Size == 64) &&
 938          "i32 or i64 is expected after legalization.");
 939
 940   // Exit early if we demand all bits.
 941   if (Demanded.countPopulation() == Size)
 942     return false;
 943
 944   unsigned NewOpc;
 945   switch (Op.getOpcode()) {
 946   default:
 947     return false;
 948   case ISD::AND:
 949     NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
 950     break;
 951   case ISD::OR:
 952     NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
 953     break;
 954   case ISD::XOR:
 955     NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
 956     break;
 957   }
 958   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
 959   if (!C)
 960     return false;
 961   uint64_t Imm = C->getZExtValue();
 962   return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
 963 }
 964
 965 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
 966 /// Mask are known to be either zero or one and return them Known.
 967 void AArch64TargetLowering::computeKnownBitsForTargetNode(
 968     const SDValue Op, KnownBits &Known,
 969     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
 970   switch (Op.getOpcode()) {
 971   default:
 972     break;
 973   case AArch64ISD::CSEL: {
 974     KnownBits Known2;
 975     DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1);
 976     DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1);
 977     Known.Zero &= Known2.Zero;
 978     Known.One &= Known2.One;
 979     break;
 980   }
 981   case ISD::INTRINSIC_W_CHAIN: {
 982     ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
 983     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
 984     switch (IntID) {
 985     default: return;
 986     case Intrinsic::aarch64_ldaxr:
 987     case Intrinsic::aarch64_ldxr: {
 988       unsigned BitWidth = Known.getBitWidth();
 989       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
 990       unsigned MemBits = VT.getScalarSizeInBits();
 991       Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
 992       return;
 993     }
 994     }
 995     break;
 996   }
 997   case ISD::INTRINSIC_WO_CHAIN:
 998   case ISD::INTRINSIC_VOID: {
 999     unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1000     switch (IntNo) {
1001     default:
1002       break;
1003     case Intrinsic::aarch64_neon_umaxv:
1004     case Intrinsic::aarch64_neon_uminv: {
1005       // Figure out the datatype of the vector operand. The UMINV instruction
1006       // will zero extend the result, so we can mark as known zero all the
1007       // bits larger than the element datatype. 32-bit or larget doesn't need
1008       // this as those are legal types and will be handled by isel directly.
1009       MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1010       unsigned BitWidth = Known.getBitWidth();
1011       if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1012         assert(BitWidth >= 8 && "Unexpected width!");
1013         APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1014         Known.Zero |= Mask;
1015       } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1016         assert(BitWidth >= 16 && "Unexpected width!");
1017         APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1018         Known.Zero |= Mask;
1019       }
1020       break;
1021     } break;
1022     }
1023   }
1024   }
1025 }
1026
1027 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1028                                                   EVT) const {
1029   return MVT::i64;
1030 }
1031
1032 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1033                                                            unsigned AddrSpace,
1034                                                            unsigned Align,
1035                                                            bool *Fast) const {
1036   if (Subtarget->requiresStrictAlign())
1037     return false;
1038
1039   if (Fast) {
1040     // Some CPUs are fine with unaligned stores except for 128-bit ones.
1041     *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1042             // See comments in performSTORECombine() for more details about
1043             // these conditions.
1044
1045             // Code that uses clang vector extensions can mark that it
1046             // wants unaligned accesses to be treated as fast by
1047             // underspecifying alignment to be 1 or 2.
1048             Align <= 2 ||
1049
1050             // Disregard v2i64. Memcpy lowering produces those and splitting
1051             // them regresses performance on micro-benchmarks and olden/bh.
1052             VT == MVT::v2i64;
1053   }
1054   return true;
1055 }
1056
1057 FastISel *
1058 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1059                                       const TargetLibraryInfo *libInfo) const {
1060   return AArch64::createFastISel(funcInfo, libInfo);
1061 }
1062
1063 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1064   switch ((AArch64ISD::NodeType)Opcode) {
1065   case AArch64ISD::FIRST_NUMBER:      break;
1066   case AArch64ISD::CALL:              return "AArch64ISD::CALL";
1067   case AArch64ISD::ADRP:              return "AArch64ISD::ADRP";
1068   case AArch64ISD::ADDlow:            return "AArch64ISD::ADDlow";
1069   case AArch64ISD::LOADgot:           return "AArch64ISD::LOADgot";
1070   case AArch64ISD::RET_FLAG:          return "AArch64ISD::RET_FLAG";
1071   case AArch64ISD::BRCOND:            return "AArch64ISD::BRCOND";
1072   case AArch64ISD::CSEL:              return "AArch64ISD::CSEL";
1073   case AArch64ISD::FCSEL:             return "AArch64ISD::FCSEL";
1074   case AArch64ISD::CSINV:             return "AArch64ISD::CSINV";
1075   case AArch64ISD::CSNEG:             return "AArch64ISD::CSNEG";
1076   case AArch64ISD::CSINC:             return "AArch64ISD::CSINC";
1077   case AArch64ISD::THREAD_POINTER:    return "AArch64ISD::THREAD_POINTER";
1078   case AArch64ISD::TLSDESC_CALLSEQ:   return "AArch64ISD::TLSDESC_CALLSEQ";
1079   case AArch64ISD::ADC:               return "AArch64ISD::ADC";
1080   case AArch64ISD::SBC:               return "AArch64ISD::SBC";
1081   case AArch64ISD::ADDS:              return "AArch64ISD::ADDS";
1082   case AArch64ISD::SUBS:              return "AArch64ISD::SUBS";
1083   case AArch64ISD::ADCS:              return "AArch64ISD::ADCS";
1084   case AArch64ISD::SBCS:              return "AArch64ISD::SBCS";
1085   case AArch64ISD::ANDS:              return "AArch64ISD::ANDS";
1086   case AArch64ISD::CCMP:              return "AArch64ISD::CCMP";
1087   case AArch64ISD::CCMN:              return "AArch64ISD::CCMN";
1088   case AArch64ISD::FCCMP:             return "AArch64ISD::FCCMP";
1089   case AArch64ISD::FCMP:              return "AArch64ISD::FCMP";
1090   case AArch64ISD::DUP:               return "AArch64ISD::DUP";
1091   case AArch64ISD::DUPLANE8:          return "AArch64ISD::DUPLANE8";
1092   case AArch64ISD::DUPLANE16:         return "AArch64ISD::DUPLANE16";
1093   case AArch64ISD::DUPLANE32:         return "AArch64ISD::DUPLANE32";
1094   case AArch64ISD::DUPLANE64:         return "AArch64ISD::DUPLANE64";
1095   case AArch64ISD::MOVI:              return "AArch64ISD::MOVI";
1096   case AArch64ISD::MOVIshift:         return "AArch64ISD::MOVIshift";
1097   case AArch64ISD::MOVIedit:          return "AArch64ISD::MOVIedit";
1098   case AArch64ISD::MOVImsl:           return "AArch64ISD::MOVImsl";
1099   case AArch64ISD::FMOV:              return "AArch64ISD::FMOV";
1100   case AArch64ISD::MVNIshift:         return "AArch64ISD::MVNIshift";
1101   case AArch64ISD::MVNImsl:           return "AArch64ISD::MVNImsl";
1102   case AArch64ISD::BICi:              return "AArch64ISD::BICi";
1103   case AArch64ISD::ORRi:              return "AArch64ISD::ORRi";
1104   case AArch64ISD::BSL:               return "AArch64ISD::BSL";
1105   case AArch64ISD::NEG:               return "AArch64ISD::NEG";
1106   case AArch64ISD::EXTR:              return "AArch64ISD::EXTR";
1107   case AArch64ISD::ZIP1:              return "AArch64ISD::ZIP1";
1108   case AArch64ISD::ZIP2:              return "AArch64ISD::ZIP2";
1109   case AArch64ISD::UZP1:              return "AArch64ISD::UZP1";
1110   case AArch64ISD::UZP2:              return "AArch64ISD::UZP2";
1111   case AArch64ISD::TRN1:              return "AArch64ISD::TRN1";
1112   case AArch64ISD::TRN2:              return "AArch64ISD::TRN2";
1113   case AArch64ISD::REV16:             return "AArch64ISD::REV16";
1114   case AArch64ISD::REV32:             return "AArch64ISD::REV32";
1115   case AArch64ISD::REV64:             return "AArch64ISD::REV64";
1116   case AArch64ISD::EXT:               return "AArch64ISD::EXT";
1117   case AArch64ISD::VSHL:              return "AArch64ISD::VSHL";
1118   case AArch64ISD::VLSHR:             return "AArch64ISD::VLSHR";
1119   case AArch64ISD::VASHR:             return "AArch64ISD::VASHR";
1120   case AArch64ISD::CMEQ:              return "AArch64ISD::CMEQ";
1121   case AArch64ISD::CMGE:              return "AArch64ISD::CMGE";
1122   case AArch64ISD::CMGT:              return "AArch64ISD::CMGT";
1123   case AArch64ISD::CMHI:              return "AArch64ISD::CMHI";
1124   case AArch64ISD::CMHS:              return "AArch64ISD::CMHS";
1125   case AArch64ISD::FCMEQ:             return "AArch64ISD::FCMEQ";
1126   case AArch64ISD::FCMGE:             return "AArch64ISD::FCMGE";
1127   case AArch64ISD::FCMGT:             return "AArch64ISD::FCMGT";
1128   case AArch64ISD::CMEQz:             return "AArch64ISD::CMEQz";
1129   case AArch64ISD::CMGEz:             return "AArch64ISD::CMGEz";
1130   case AArch64ISD::CMGTz:             return "AArch64ISD::CMGTz";
1131   case AArch64ISD::CMLEz:             return "AArch64ISD::CMLEz";
1132   case AArch64ISD::CMLTz:             return "AArch64ISD::CMLTz";
1133   case AArch64ISD::FCMEQz:            return "AArch64ISD::FCMEQz";
1134   case AArch64ISD::FCMGEz:            return "AArch64ISD::FCMGEz";
1135   case AArch64ISD::FCMGTz:            return "AArch64ISD::FCMGTz";
1136   case AArch64ISD::FCMLEz:            return "AArch64ISD::FCMLEz";
1137   case AArch64ISD::FCMLTz:            return "AArch64ISD::FCMLTz";
1138   case AArch64ISD::SADDV:             return "AArch64ISD::SADDV";
1139   case AArch64ISD::UADDV:             return "AArch64ISD::UADDV";
1140   case AArch64ISD::SMINV:             return "AArch64ISD::SMINV";
1141   case AArch64ISD::UMINV:             return "AArch64ISD::UMINV";
1142   case AArch64ISD::SMAXV:             return "AArch64ISD::SMAXV";
1143   case AArch64ISD::UMAXV:             return "AArch64ISD::UMAXV";
1144   case AArch64ISD::NOT:               return "AArch64ISD::NOT";
1145   case AArch64ISD::BIT:               return "AArch64ISD::BIT";
1146   case AArch64ISD::CBZ:               return "AArch64ISD::CBZ";
1147   case AArch64ISD::CBNZ:              return "AArch64ISD::CBNZ";
1148   case AArch64ISD::TBZ:               return "AArch64ISD::TBZ";
1149   case AArch64ISD::TBNZ:              return "AArch64ISD::TBNZ";
1150   case AArch64ISD::TC_RETURN:         return "AArch64ISD::TC_RETURN";
1151   case AArch64ISD::PREFETCH:          return "AArch64ISD::PREFETCH";
1152   case AArch64ISD::SITOF:             return "AArch64ISD::SITOF";
1153   case AArch64ISD::UITOF:             return "AArch64ISD::UITOF";
1154   case AArch64ISD::NVCAST:            return "AArch64ISD::NVCAST";
1155   case AArch64ISD::SQSHL_I:           return "AArch64ISD::SQSHL_I";
1156   case AArch64ISD::UQSHL_I:           return "AArch64ISD::UQSHL_I";
1157   case AArch64ISD::SRSHR_I:           return "AArch64ISD::SRSHR_I";
1158   case AArch64ISD::URSHR_I:           return "AArch64ISD::URSHR_I";
1159   case AArch64ISD::SQSHLU_I:          return "AArch64ISD::SQSHLU_I";
1160   case AArch64ISD::WrapperLarge:      return "AArch64ISD::WrapperLarge";
1161   case AArch64ISD::LD2post:           return "AArch64ISD::LD2post";
1162   case AArch64ISD::LD3post:           return "AArch64ISD::LD3post";
1163   case AArch64ISD::LD4post:           return "AArch64ISD::LD4post";
1164   case AArch64ISD::ST2post:           return "AArch64ISD::ST2post";
1165   case AArch64ISD::ST3post:           return "AArch64ISD::ST3post";
1166   case AArch64ISD::ST4post:           return "AArch64ISD::ST4post";
1167   case AArch64ISD::LD1x2post:         return "AArch64ISD::LD1x2post";
1168   case AArch64ISD::LD1x3post:         return "AArch64ISD::LD1x3post";
1169   case AArch64ISD::LD1x4post:         return "AArch64ISD::LD1x4post";
1170   case AArch64ISD::ST1x2post:         return "AArch64ISD::ST1x2post";
1171   case AArch64ISD::ST1x3post:         return "AArch64ISD::ST1x3post";
1172   case AArch64ISD::ST1x4post:         return "AArch64ISD::ST1x4post";
1173   case AArch64ISD::LD1DUPpost:        return "AArch64ISD::LD1DUPpost";
1174   case AArch64ISD::LD2DUPpost:        return "AArch64ISD::LD2DUPpost";
1175   case AArch64ISD::LD3DUPpost:        return "AArch64ISD::LD3DUPpost";
1176   case AArch64ISD::LD4DUPpost:        return "AArch64ISD::LD4DUPpost";
1177   case AArch64ISD::LD1LANEpost:       return "AArch64ISD::LD1LANEpost";
1178   case AArch64ISD::LD2LANEpost:       return "AArch64ISD::LD2LANEpost";
1179   case AArch64ISD::LD3LANEpost:       return "AArch64ISD::LD3LANEpost";
1180   case AArch64ISD::LD4LANEpost:       return "AArch64ISD::LD4LANEpost";
1181   case AArch64ISD::ST2LANEpost:       return "AArch64ISD::ST2LANEpost";
1182   case AArch64ISD::ST3LANEpost:       return "AArch64ISD::ST3LANEpost";
1183   case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
1184   case AArch64ISD::SMULL:             return "AArch64ISD::SMULL";
1185   case AArch64ISD::UMULL:             return "AArch64ISD::UMULL";
1186   case AArch64ISD::FRECPE:            return "AArch64ISD::FRECPE";
1187   case AArch64ISD::FRECPS:            return "AArch64ISD::FRECPS";
1188   case AArch64ISD::FRSQRTE:           return "AArch64ISD::FRSQRTE";
1189   case AArch64ISD::FRSQRTS:           return "AArch64ISD::FRSQRTS";
1190   }
1191   return nullptr;
1192 }
1193
1194 MachineBasicBlock *
1195 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
1196                                     MachineBasicBlock *MBB) const {
1197   // We materialise the F128CSEL pseudo-instruction as some control flow and a
1198   // phi node:
1199
1200   // OrigBB:
1201   //     [... previous instrs leading to comparison ...]
1202   //     b.ne TrueBB
1203   //     b EndBB
1204   // TrueBB:
1205   //     ; Fallthrough
1206   // EndBB:
1207   //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
1208
1209   MachineFunction *MF = MBB->getParent();
1210   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1211   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
1212   DebugLoc DL = MI.getDebugLoc();
1213   MachineFunction::iterator It = ++MBB->getIterator();
1214
1215   unsigned DestReg = MI.getOperand(0).getReg();
1216   unsigned IfTrueReg = MI.getOperand(1).getReg();
1217   unsigned IfFalseReg = MI.getOperand(2).getReg();
1218   unsigned CondCode = MI.getOperand(3).getImm();
1219   bool NZCVKilled = MI.getOperand(4).isKill();
1220
1221   MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
1222   MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
1223   MF->insert(It, TrueBB);
1224   MF->insert(It, EndBB);
1225
1226   // Transfer rest of current basic-block to EndBB
1227   EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
1228                 MBB->end());
1229   EndBB->transferSuccessorsAndUpdatePHIs(MBB);
1230
1231   BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
1232   BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
1233   MBB->addSuccessor(TrueBB);
1234   MBB->addSuccessor(EndBB);
1235
1236   // TrueBB falls through to the end.
1237   TrueBB->addSuccessor(EndBB);
1238
1239   if (!NZCVKilled) {
1240     TrueBB->addLiveIn(AArch64::NZCV);
1241     EndBB->addLiveIn(AArch64::NZCV);
1242   }
1243
1244   BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
1245       .addReg(IfTrueReg)
1246       .addMBB(TrueBB)
1247       .addReg(IfFalseReg)
1248       .addMBB(MBB);
1249
1250   MI.eraseFromParent();
1251   return EndBB;
1252 }
1253
1254 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
1255     MachineInstr &MI, MachineBasicBlock *BB) const {
1256   switch (MI.getOpcode()) {
1257   default:
1258 #ifndef NDEBUG
1259     MI.dump();
1260 #endif
1261     llvm_unreachable("Unexpected instruction for custom inserter!");
1262
1263   case AArch64::F128CSEL:
1264     return EmitF128CSEL(MI, BB);
1265
1266   case TargetOpcode::STACKMAP:
1267   case TargetOpcode::PATCHPOINT:
1268     return emitPatchPoint(MI, BB);
1269   }
1270 }
1271
1272 //===----------------------------------------------------------------------===//
1273 // AArch64 Lowering private implementation.
1274 //===----------------------------------------------------------------------===//
1275
1276 //===----------------------------------------------------------------------===//
1277 // Lowering Code
1278 //===----------------------------------------------------------------------===//
1279
1280 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
1281 /// CC
1282 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
1283   switch (CC) {
1284   default:
1285     llvm_unreachable("Unknown condition code!");
1286   case ISD::SETNE:
1287     return AArch64CC::NE;
1288   case ISD::SETEQ:
1289     return AArch64CC::EQ;
1290   case ISD::SETGT:
1291     return AArch64CC::GT;
1292   case ISD::SETGE:
1293     return AArch64CC::GE;
1294   case ISD::SETLT:
1295     return AArch64CC::LT;
1296   case ISD::SETLE:
1297     return AArch64CC::LE;
1298   case ISD::SETUGT:
1299     return AArch64CC::HI;
1300   case ISD::SETUGE:
1301     return AArch64CC::HS;
1302   case ISD::SETULT:
1303     return AArch64CC::LO;
1304   case ISD::SETULE:
1305     return AArch64CC::LS;
1306   }
1307 }
1308
1309 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
1310 static void changeFPCCToAArch64CC(ISD::CondCode CC,
1311                                   AArch64CC::CondCode &CondCode,
1312                                   AArch64CC::CondCode &CondCode2) {
1313   CondCode2 = AArch64CC::AL;
1314   switch (CC) {
1315   default:
1316     llvm_unreachable("Unknown FP condition!");
1317   case ISD::SETEQ:
1318   case ISD::SETOEQ:
1319     CondCode = AArch64CC::EQ;
1320     break;
1321   case ISD::SETGT:
1322   case ISD::SETOGT:
1323     CondCode = AArch64CC::GT;
1324     break;
1325   case ISD::SETGE:
1326   case ISD::SETOGE:
1327     CondCode = AArch64CC::GE;
1328     break;
1329   case ISD::SETOLT:
1330     CondCode = AArch64CC::MI;
1331     break;
1332   case ISD::SETOLE:
1333     CondCode = AArch64CC::LS;
1334     break;
1335   case ISD::SETONE:
1336     CondCode = AArch64CC::MI;
1337     CondCode2 = AArch64CC::GT;
1338     break;
1339   case ISD::SETO:
1340     CondCode = AArch64CC::VC;
1341     break;
1342   case ISD::SETUO:
1343     CondCode = AArch64CC::VS;
1344     break;
1345   case ISD::SETUEQ:
1346     CondCode = AArch64CC::EQ;
1347     CondCode2 = AArch64CC::VS;
1348     break;
1349   case ISD::SETUGT:
1350     CondCode = AArch64CC::HI;
1351     break;
1352   case ISD::SETUGE:
1353     CondCode = AArch64CC::PL;
1354     break;
1355   case ISD::SETLT:
1356   case ISD::SETULT:
1357     CondCode = AArch64CC::LT;
1358     break;
1359   case ISD::SETLE:
1360   case ISD::SETULE:
1361     CondCode = AArch64CC::LE;
1362     break;
1363   case ISD::SETNE:
1364   case ISD::SETUNE:
1365     CondCode = AArch64CC::NE;
1366     break;
1367   }
1368 }
1369
1370 /// Convert a DAG fp condition code to an AArch64 CC.
1371 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1372 /// should be AND'ed instead of OR'ed.
1373 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
1374                                      AArch64CC::CondCode &CondCode,
1375                                      AArch64CC::CondCode &CondCode2) {
1376   CondCode2 = AArch64CC::AL;
1377   switch (CC) {
1378   default:
1379     changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1380     assert(CondCode2 == AArch64CC::AL);
1381     break;
1382   case ISD::SETONE:
1383     // (a one b)
1384     // == ((a olt b) || (a ogt b))
1385     // == ((a ord b) && (a une b))
1386     CondCode = AArch64CC::VC;
1387     CondCode2 = AArch64CC::NE;
1388     break;
1389   case ISD::SETUEQ:
1390     // (a ueq b)
1391     // == ((a uno b) || (a oeq b))
1392     // == ((a ule b) && (a uge b))
1393     CondCode = AArch64CC::PL;
1394     CondCode2 = AArch64CC::LE;
1395     break;
1396   }
1397 }
1398
1399 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
1400 /// CC usable with the vector instructions. Fewer operations are available
1401 /// without a real NZCV register, so we have to use less efficient combinations
1402 /// to get the same effect.
1403 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
1404                                         AArch64CC::CondCode &CondCode,
1405                                         AArch64CC::CondCode &CondCode2,
1406                                         bool &Invert) {
1407   Invert = false;
1408   switch (CC) {
1409   default:
1410     // Mostly the scalar mappings work fine.
1411     changeFPCCToAArch64CC(CC, CondCode, CondCode2);
1412     break;
1413   case ISD::SETUO:
1414     Invert = true;
1415     LLVM_FALLTHROUGH;
1416   case ISD::SETO:
1417     CondCode = AArch64CC::MI;
1418     CondCode2 = AArch64CC::GE;
1419     break;
1420   case ISD::SETUEQ:
1421   case ISD::SETULT:
1422   case ISD::SETULE:
1423   case ISD::SETUGT:
1424   case ISD::SETUGE:
1425     // All of the compare-mask comparisons are ordered, but we can switch
1426     // between the two by a double inversion. E.g. ULE == !OGT.
1427     Invert = true;
1428     changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
1429     break;
1430   }
1431 }
1432
1433 static bool isLegalArithImmed(uint64_t C) {
1434   // Matches AArch64DAGToDAGISel::SelectArithImmed().
1435   bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
1436   DEBUG(dbgs() << "Is imm " << C << " legal: " << (IsLegal ? "yes\n" : "no\n"));
1437   return IsLegal;
1438 }
1439
1440 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1441                               const SDLoc &dl, SelectionDAG &DAG) {
1442   EVT VT = LHS.getValueType();
1443   const bool FullFP16 =
1444     static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1445
1446   if (VT.isFloatingPoint()) {
1447     assert(VT != MVT::f128);
1448     if (VT == MVT::f16 && !FullFP16) {
1449       LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
1450       RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
1451       VT = MVT::f32;
1452     }
1453     return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
1454   }
1455
1456   // The CMP instruction is just an alias for SUBS, and representing it as
1457   // SUBS means that it's possible to get CSE with subtract operations.
1458   // A later phase can perform the optimization of setting the destination
1459   // register to WZR/XZR if it ends up being unused.
1460   unsigned Opcode = AArch64ISD::SUBS;
1461
1462   if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
1463       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1464     // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
1465     // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
1466     // can be set differently by this operation. It comes down to whether
1467     // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
1468     // everything is fine. If not then the optimization is wrong. Thus general
1469     // comparisons are only valid if op2 != 0.
1470
1471     // So, finally, the only LLVM-native comparisons that don't mention C and V
1472     // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
1473     // the absence of information about op2.
1474     Opcode = AArch64ISD::ADDS;
1475     RHS = RHS.getOperand(1);
1476   } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
1477              !isUnsignedIntSetCC(CC)) {
1478     // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
1479     // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
1480     // of the signed comparisons.
1481     Opcode = AArch64ISD::ANDS;
1482     RHS = LHS.getOperand(1);
1483     LHS = LHS.getOperand(0);
1484   }
1485
1486   return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
1487       .getValue(1);
1488 }
1489
1490 /// \defgroup AArch64CCMP CMP;CCMP matching
1491 ///
1492 /// These functions deal with the formation of CMP;CCMP;... sequences.
1493 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
1494 /// a comparison. They set the NZCV flags to a predefined value if their
1495 /// predicate is false. This allows to express arbitrary conjunctions, for
1496 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B))))"
1497 /// expressed as:
1498 ///   cmp A
1499 ///   ccmp B, inv(CB), CA
1500 ///   check for CB flags
1501 ///
1502 /// In general we can create code for arbitrary "... (and (and A B) C)"
1503 /// sequences. We can also implement some "or" expressions, because "(or A B)"
1504 /// is equivalent to "not (and (not A) (not B))" and we can implement some
1505 /// negation operations:
1506 /// We can negate the results of a single comparison by inverting the flags
1507 /// used when the predicate fails and inverting the flags tested in the next
1508 /// instruction; We can also negate the results of the whole previous
1509 /// conditional compare sequence by inverting the flags tested in the next
1510 /// instruction. However there is no way to negate the result of a partial
1511 /// sequence.
1512 ///
1513 /// Therefore on encountering an "or" expression we can negate the subtree on
1514 /// one side and have to be able to push the negate to the leafs of the subtree
1515 /// on the other side (see also the comments in code). As complete example:
1516 /// "or (or (setCA (cmp A)) (setCB (cmp B)))
1517 ///     (and (setCC (cmp C)) (setCD (cmp D)))"
1518 /// is transformed to
1519 /// "not (and (not (and (setCC (cmp C)) (setCC (cmp D))))
1520 ///           (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
1521 /// and implemented as:
1522 ///   cmp C
1523 ///   ccmp D, inv(CD), CC
1524 ///   ccmp A, CA, inv(CD)
1525 ///   ccmp B, CB, inv(CA)
1526 ///   check for CB flags
1527 /// A counterexample is "or (and A B) (and C D)" which cannot be implemented
1528 /// by conditional compare sequences.
1529 /// @{
1530
1531 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
1532 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
1533                                          ISD::CondCode CC, SDValue CCOp,
1534                                          AArch64CC::CondCode Predicate,
1535                                          AArch64CC::CondCode OutCC,
1536                                          const SDLoc &DL, SelectionDAG &DAG) {
1537   unsigned Opcode = 0;
1538   const bool FullFP16 =
1539     static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
1540
1541   if (LHS.getValueType().isFloatingPoint()) {
1542     assert(LHS.getValueType() != MVT::f128);
1543     if (LHS.getValueType() == MVT::f16 && !FullFP16) {
1544       LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
1545       RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
1546     }
1547     Opcode = AArch64ISD::FCCMP;
1548   } else if (RHS.getOpcode() == ISD::SUB) {
1549     SDValue SubOp0 = RHS.getOperand(0);
1550     if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
1551       // See emitComparison() on why we can only do this for SETEQ and SETNE.
1552       Opcode = AArch64ISD::CCMN;
1553       RHS = RHS.getOperand(1);
1554     }
1555   }
1556   if (Opcode == 0)
1557     Opcode = AArch64ISD::CCMP;
1558
1559   SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
1560   AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
1561   unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
1562   SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
1563   return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
1564 }
1565
1566 /// Returns true if @p Val is a tree of AND/OR/SETCC operations.
1567 /// CanPushNegate is set to true if we can push a negate operation through
1568 /// the tree in a was that we are left with AND operations and negate operations
1569 /// at the leafs only. i.e. "not (or (or x y) z)" can be changed to
1570 /// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
1571 /// brought into such a form.
1572 static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
1573                                          unsigned Depth = 0) {
1574   if (!Val.hasOneUse())
1575     return false;
1576   unsigned Opcode = Val->getOpcode();
1577   if (Opcode == ISD::SETCC) {
1578     if (Val->getOperand(0).getValueType() == MVT::f128)
1579       return false;
1580     CanNegate = true;
1581     return true;
1582   }
1583   // Protect against exponential runtime and stack overflow.
1584   if (Depth > 6)
1585     return false;
1586   if (Opcode == ISD::AND || Opcode == ISD::OR) {
1587     SDValue O0 = Val->getOperand(0);
1588     SDValue O1 = Val->getOperand(1);
1589     bool CanNegateL;
1590     if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
1591       return false;
1592     bool CanNegateR;
1593     if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
1594       return false;
1595
1596     if (Opcode == ISD::OR) {
1597       // For an OR expression we need to be able to negate at least one side or
1598       // we cannot do the transformation at all.
1599       if (!CanNegateL && !CanNegateR)
1600         return false;
1601       // We can however change a (not (or x y)) to (and (not x) (not y)) if we
1602       // can negate the x and y subtrees.
1603       CanNegate = CanNegateL && CanNegateR;
1604     } else {
1605       // If the operands are OR expressions then we finally need to negate their
1606       // outputs, we can only do that for the operand with emitted last by
1607       // negating OutCC, not for both operands.
1608       bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
1609       bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
1610       if (NeedsNegOutL && NeedsNegOutR)
1611         return false;
1612       // We cannot negate an AND operation (it would become an OR),
1613       CanNegate = false;
1614     }
1615     return true;
1616   }
1617   return false;
1618 }
1619
1620 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
1621 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
1622 /// Tries to transform the given i1 producing node @p Val to a series compare
1623 /// and conditional compare operations. @returns an NZCV flags producing node
1624 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
1625 /// transformation was not possible.
1626 /// On recursive invocations @p PushNegate may be set to true to have negation
1627 /// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate
1628 /// for the comparisons in the current subtree; @p Depth limits the search
1629 /// depth to avoid stack overflow.
1630 static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
1631     AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
1632     AArch64CC::CondCode Predicate) {
1633   // We're at a tree leaf, produce a conditional comparison operation.
1634   unsigned Opcode = Val->getOpcode();
1635   if (Opcode == ISD::SETCC) {
1636     SDValue LHS = Val->getOperand(0);
1637     SDValue RHS = Val->getOperand(1);
1638     ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
1639     bool isInteger = LHS.getValueType().isInteger();
1640     if (Negate)
1641       CC = getSetCCInverse(CC, isInteger);
1642     SDLoc DL(Val);
1643     // Determine OutCC and handle FP special case.
1644     if (isInteger) {
1645       OutCC = changeIntCCToAArch64CC(CC);
1646     } else {
1647       assert(LHS.getValueType().isFloatingPoint());
1648       AArch64CC::CondCode ExtraCC;
1649       changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
1650       // Some floating point conditions can't be tested with a single condition
1651       // code. Construct an additional comparison in this case.
1652       if (ExtraCC != AArch64CC::AL) {
1653         SDValue ExtraCmp;
1654         if (!CCOp.getNode())
1655           ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
1656         else
1657           ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
1658                                                ExtraCC, DL, DAG);
1659         CCOp = ExtraCmp;
1660         Predicate = ExtraCC;
1661       }
1662     }
1663
1664     // Produce a normal comparison if we are first in the chain
1665     if (!CCOp)
1666       return emitComparison(LHS, RHS, CC, DL, DAG);
1667     // Otherwise produce a ccmp.
1668     return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
1669                                      DAG);
1670   }
1671   assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) &&
1672          "Valid conjunction/disjunction tree");
1673
1674   // Check if both sides can be transformed.
1675   SDValue LHS = Val->getOperand(0);
1676   SDValue RHS = Val->getOperand(1);
1677
1678   // In case of an OR we need to negate our operands and the result.
1679   // (A v B) <=> not(not(A) ^ not(B))
1680   bool NegateOpsAndResult = Opcode == ISD::OR;
1681   // We can negate the results of all previous operations by inverting the
1682   // predicate flags giving us a free negation for one side. The other side
1683   // must be negatable by itself.
1684   if (NegateOpsAndResult) {
1685     // See which side we can negate.
1686     bool CanNegateL;
1687     bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
1688     assert(isValidL && "Valid conjunction/disjunction tree");
1689     (void)isValidL;
1690
1691 #ifndef NDEBUG
1692     bool CanNegateR;
1693     bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
1694     assert(isValidR && "Valid conjunction/disjunction tree");
1695     assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
1696 #endif
1697
1698     // Order the side which we cannot negate to RHS so we can emit it first.
1699     if (!CanNegateL)
1700       std::swap(LHS, RHS);
1701   } else {
1702     bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
1703     assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) &&
1704            "Valid conjunction/disjunction tree");
1705     // Order the side where we need to negate the output flags to RHS so it
1706     // gets emitted first.
1707     if (NeedsNegOutL)
1708       std::swap(LHS, RHS);
1709   }
1710
1711   // Emit RHS. If we want to negate the tree we only need to push a negate
1712   // through if we are already in a PushNegate case, otherwise we can negate
1713   // the "flags to test" afterwards.
1714   AArch64CC::CondCode RHSCC;
1715   SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
1716                                                    CCOp, Predicate);
1717   if (NegateOpsAndResult && !Negate)
1718     RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
1719   // Emit LHS. We may need to negate it.
1720   SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
1721                                                    NegateOpsAndResult, CmpR,
1722                                                    RHSCC);
1723   // If we transformed an OR to and AND then we have to negate the result
1724   // (or absorb the Negate parameter).
1725   if (NegateOpsAndResult && !Negate)
1726     OutCC = AArch64CC::getInvertedCondCode(OutCC);
1727   return CmpL;
1728 }
1729
1730 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
1731 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
1732 /// \see emitConjunctionDisjunctionTreeRec().
1733 static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
1734                                               AArch64CC::CondCode &OutCC) {
1735   bool CanNegate;
1736   if (!isConjunctionDisjunctionTree(Val, CanNegate))
1737     return SDValue();
1738
1739   return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(),
1740                                            AArch64CC::AL);
1741 }
1742
1743 /// @}
1744
1745 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
1746                              SDValue &AArch64cc, SelectionDAG &DAG,
1747                              const SDLoc &dl) {
1748   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
1749     EVT VT = RHS.getValueType();
1750     uint64_t C = RHSC->getZExtValue();
1751     if (!isLegalArithImmed(C)) {
1752       // Constant does not fit, try adjusting it by one?
1753       switch (CC) {
1754       default:
1755         break;
1756       case ISD::SETLT:
1757       case ISD::SETGE:
1758         if ((VT == MVT::i32 && C != 0x80000000 &&
1759              isLegalArithImmed((uint32_t)(C - 1))) ||
1760             (VT == MVT::i64 && C != 0x80000000ULL &&
1761              isLegalArithImmed(C - 1ULL))) {
1762           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
1763           C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1764           RHS = DAG.getConstant(C, dl, VT);
1765         }
1766         break;
1767       case ISD::SETULT:
1768       case ISD::SETUGE:
1769         if ((VT == MVT::i32 && C != 0 &&
1770              isLegalArithImmed((uint32_t)(C - 1))) ||
1771             (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
1772           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
1773           C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
1774           RHS = DAG.getConstant(C, dl, VT);
1775         }
1776         break;
1777       case ISD::SETLE:
1778       case ISD::SETGT:
1779         if ((VT == MVT::i32 && C != INT32_MAX &&
1780              isLegalArithImmed((uint32_t)(C + 1))) ||
1781             (VT == MVT::i64 && C != INT64_MAX &&
1782              isLegalArithImmed(C + 1ULL))) {
1783           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
1784           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1785           RHS = DAG.getConstant(C, dl, VT);
1786         }
1787         break;
1788       case ISD::SETULE:
1789       case ISD::SETUGT:
1790         if ((VT == MVT::i32 && C != UINT32_MAX &&
1791              isLegalArithImmed((uint32_t)(C + 1))) ||
1792             (VT == MVT::i64 && C != UINT64_MAX &&
1793              isLegalArithImmed(C + 1ULL))) {
1794           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
1795           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
1796           RHS = DAG.getConstant(C, dl, VT);
1797         }
1798         break;
1799       }
1800     }
1801   }
1802   SDValue Cmp;
1803   AArch64CC::CondCode AArch64CC;
1804   if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
1805     const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
1806
1807     // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
1808     // For the i8 operand, the largest immediate is 255, so this can be easily
1809     // encoded in the compare instruction. For the i16 operand, however, the
1810     // largest immediate cannot be encoded in the compare.
1811     // Therefore, use a sign extending load and cmn to avoid materializing the
1812     // -1 constant. For example,
1813     // movz w1, #65535
1814     // ldrh w0, [x0, #0]
1815     // cmp w0, w1
1816     // >
1817     // ldrsh w0, [x0, #0]
1818     // cmn w0, #1
1819     // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
1820     // if and only if (sext LHS) == (sext RHS). The checks are in place to
1821     // ensure both the LHS and RHS are truly zero extended and to make sure the
1822     // transformation is profitable.
1823     if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
1824         cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
1825         cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
1826         LHS.getNode()->hasNUsesOfValue(1, 0)) {
1827       int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
1828       if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
1829         SDValue SExt =
1830             DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
1831                         DAG.getValueType(MVT::i16));
1832         Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
1833                                                    RHS.getValueType()),
1834                              CC, dl, DAG);
1835         AArch64CC = changeIntCCToAArch64CC(CC);
1836       }
1837     }
1838
1839     if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
1840       if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) {
1841         if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
1842           AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
1843       }
1844     }
1845   }
1846
1847   if (!Cmp) {
1848     Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
1849     AArch64CC = changeIntCCToAArch64CC(CC);
1850   }
1851   AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
1852   return Cmp;
1853 }
1854
1855 static std::pair<SDValue, SDValue>
1856 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
1857   assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
1858          "Unsupported value type");
1859   SDValue Value, Overflow;
1860   SDLoc DL(Op);
1861   SDValue LHS = Op.getOperand(0);
1862   SDValue RHS = Op.getOperand(1);
1863   unsigned Opc = 0;
1864   switch (Op.getOpcode()) {
1865   default:
1866     llvm_unreachable("Unknown overflow instruction!");
1867   case ISD::SADDO:
1868     Opc = AArch64ISD::ADDS;
1869     CC = AArch64CC::VS;
1870     break;
1871   case ISD::UADDO:
1872     Opc = AArch64ISD::ADDS;
1873     CC = AArch64CC::HS;
1874     break;
1875   case ISD::SSUBO:
1876     Opc = AArch64ISD::SUBS;
1877     CC = AArch64CC::VS;
1878     break;
1879   case ISD::USUBO:
1880     Opc = AArch64ISD::SUBS;
1881     CC = AArch64CC::LO;
1882     break;
1883   // Multiply needs a little bit extra work.
1884   case ISD::SMULO:
1885   case ISD::UMULO: {
1886     CC = AArch64CC::NE;
1887     bool IsSigned = Op.getOpcode() == ISD::SMULO;
1888     if (Op.getValueType() == MVT::i32) {
1889       unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1890       // For a 32 bit multiply with overflow check we want the instruction
1891       // selector to generate a widening multiply (SMADDL/UMADDL). For that we
1892       // need to generate the following pattern:
1893       // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
1894       LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
1895       RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
1896       SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
1897       SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
1898                                 DAG.getConstant(0, DL, MVT::i64));
1899       // On AArch64 the upper 32 bits are always zero extended for a 32 bit
1900       // operation. We need to clear out the upper 32 bits, because we used a
1901       // widening multiply that wrote all 64 bits. In the end this should be a
1902       // noop.
1903       Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
1904       if (IsSigned) {
1905         // The signed overflow check requires more than just a simple check for
1906         // any bit set in the upper 32 bits of the result. These bits could be
1907         // just the sign bits of a negative number. To perform the overflow
1908         // check we have to arithmetic shift right the 32nd bit of the result by
1909         // 31 bits. Then we compare the result to the upper 32 bits.
1910         SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
1911                                         DAG.getConstant(32, DL, MVT::i64));
1912         UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
1913         SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
1914                                         DAG.getConstant(31, DL, MVT::i64));
1915         // It is important that LowerBits is last, otherwise the arithmetic
1916         // shift will not be folded into the compare (SUBS).
1917         SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
1918         Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
1919                        .getValue(1);
1920       } else {
1921         // The overflow check for unsigned multiply is easy. We only need to
1922         // check if any of the upper 32 bits are set. This can be done with a
1923         // CMP (shifted register). For that we need to generate the following
1924         // pattern:
1925         // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
1926         SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
1927                                         DAG.getConstant(32, DL, MVT::i64));
1928         SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1929         Overflow =
1930             DAG.getNode(AArch64ISD::SUBS, DL, VTs,
1931                         DAG.getConstant(0, DL, MVT::i64),
1932                         UpperBits).getValue(1);
1933       }
1934       break;
1935     }
1936     assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
1937     // For the 64 bit multiply
1938     Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
1939     if (IsSigned) {
1940       SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
1941       SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
1942                                       DAG.getConstant(63, DL, MVT::i64));
1943       // It is important that LowerBits is last, otherwise the arithmetic
1944       // shift will not be folded into the compare (SUBS).
1945       SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1946       Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
1947                      .getValue(1);
1948     } else {
1949       SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
1950       SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
1951       Overflow =
1952           DAG.getNode(AArch64ISD::SUBS, DL, VTs,
1953                       DAG.getConstant(0, DL, MVT::i64),
1954                       UpperBits).getValue(1);
1955     }
1956     break;
1957   }
1958   } // switch (...)
1959
1960   if (Opc) {
1961     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
1962
1963     // Emit the AArch64 operation with overflow check.
1964     Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
1965     Overflow = Value.getValue(1);
1966   }
1967   return std::make_pair(Value, Overflow);
1968 }
1969
1970 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
1971                                              RTLIB::Libcall Call) const {
1972   SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
1973   return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
1974 }
1975
1976 // Returns true if the given Op is the overflow flag result of an overflow
1977 // intrinsic operation.
1978 static bool isOverflowIntrOpRes(SDValue Op) {
1979   unsigned Opc = Op.getOpcode();
1980   return (Op.getResNo() == 1 &&
1981           (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
1982            Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
1983 }
1984
1985 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
1986   SDValue Sel = Op.getOperand(0);
1987   SDValue Other = Op.getOperand(1);
1988   SDLoc dl(Sel);
1989
1990   // If the operand is an overflow checking operation, invert the condition
1991   // code and kill the Not operation. I.e., transform:
1992   // (xor (overflow_op_bool, 1))
1993   //   -->
1994   // (csel 1, 0, invert(cc), overflow_op_bool)
1995   // ... which later gets transformed to just a cset instruction with an
1996   // inverted condition code, rather than a cset + eor sequence.
1997   if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
1998     // Only lower legal XALUO ops.
1999     if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
2000       return SDValue();
2001
2002     SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2003     SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2004     AArch64CC::CondCode CC;
2005     SDValue Value, Overflow;
2006     std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
2007     SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2008     return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
2009                        CCVal, Overflow);
2010   }
2011   // If neither operand is a SELECT_CC, give up.
2012   if (Sel.getOpcode() != ISD::SELECT_CC)
2013     std::swap(Sel, Other);
2014   if (Sel.getOpcode() != ISD::SELECT_CC)
2015     return Op;
2016
2017   // The folding we want to perform is:
2018   // (xor x, (select_cc a, b, cc, 0, -1) )
2019   //   -->
2020   // (csel x, (xor x, -1), cc ...)
2021   //
2022   // The latter will get matched to a CSINV instruction.
2023
2024   ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
2025   SDValue LHS = Sel.getOperand(0);
2026   SDValue RHS = Sel.getOperand(1);
2027   SDValue TVal = Sel.getOperand(2);
2028   SDValue FVal = Sel.getOperand(3);
2029
2030   // FIXME: This could be generalized to non-integer comparisons.
2031   if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
2032     return Op;
2033
2034   ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
2035   ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
2036
2037   // The values aren't constants, this isn't the pattern we're looking for.
2038   if (!CFVal || !CTVal)
2039     return Op;
2040
2041   // We can commute the SELECT_CC by inverting the condition.  This
2042   // might be needed to make this fit into a CSINV pattern.
2043   if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
2044     std::swap(TVal, FVal);
2045     std::swap(CTVal, CFVal);
2046     CC = ISD::getSetCCInverse(CC, true);
2047   }
2048
2049   // If the constants line up, perform the transform!
2050   if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
2051     SDValue CCVal;
2052     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
2053
2054     FVal = Other;
2055     TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
2056                        DAG.getConstant(-1ULL, dl, Other.getValueType()));
2057
2058     return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
2059                        CCVal, Cmp);
2060   }
2061
2062   return Op;
2063 }
2064
2065 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
2066   EVT VT = Op.getValueType();
2067
2068   // Let legalize expand this if it isn't a legal type yet.
2069   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
2070     return SDValue();
2071
2072   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
2073
2074   unsigned Opc;
2075   bool ExtraOp = false;
2076   switch (Op.getOpcode()) {
2077   default:
2078     llvm_unreachable("Invalid code");
2079   case ISD::ADDC:
2080     Opc = AArch64ISD::ADDS;
2081     break;
2082   case ISD::SUBC:
2083     Opc = AArch64ISD::SUBS;
2084     break;
2085   case ISD::ADDE:
2086     Opc = AArch64ISD::ADCS;
2087     ExtraOp = true;
2088     break;
2089   case ISD::SUBE:
2090     Opc = AArch64ISD::SBCS;
2091     ExtraOp = true;
2092     break;
2093   }
2094
2095   if (!ExtraOp)
2096     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
2097   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
2098                      Op.getOperand(2));
2099 }
2100
2101 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
2102   // Let legalize expand this if it isn't a legal type yet.
2103   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
2104     return SDValue();
2105
2106   SDLoc dl(Op);
2107   AArch64CC::CondCode CC;
2108   // The actual operation that sets the overflow or carry flag.
2109   SDValue Value, Overflow;
2110   std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
2111
2112   // We use 0 and 1 as false and true values.
2113   SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
2114   SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
2115
2116   // We use an inverted condition, because the conditional select is inverted
2117   // too. This will allow it to be selected to a single instruction:
2118   // CSINC Wd, WZR, WZR, invert(cond).
2119   SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
2120   Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
2121                          CCVal, Overflow);
2122
2123   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
2124   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
2125 }
2126
2127 // Prefetch operands are:
2128 // 1: Address to prefetch
2129 // 2: bool isWrite
2130 // 3: int locality (0 = no locality ... 3 = extreme locality)
2131 // 4: bool isDataCache
2132 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
2133   SDLoc DL(Op);
2134   unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2135   unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
2136   unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
2137
2138   bool IsStream = !Locality;
2139   // When the locality number is set
2140   if (Locality) {
2141     // The front-end should have filtered out the out-of-range values
2142     assert(Locality <= 3 && "Prefetch locality out-of-range");
2143     // The locality degree is the opposite of the cache speed.
2144     // Put the number the other way around.
2145     // The encoding starts at 0 for level 1
2146     Locality = 3 - Locality;
2147   }
2148
2149   // built the mask value encoding the expected behavior.
2150   unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
2151                    (!IsData << 3) |     // IsDataCache bit
2152                    (Locality << 1) |    // Cache level bits
2153                    (unsigned)IsStream;  // Stream bit
2154   return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
2155                      DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
2156 }
2157
2158 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
2159                                               SelectionDAG &DAG) const {
2160   assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
2161
2162   RTLIB::Libcall LC;
2163   LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
2164
2165   return LowerF128Call(Op, DAG, LC);
2166 }
2167
2168 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
2169                                              SelectionDAG &DAG) const {
2170   if (Op.getOperand(0).getValueType() != MVT::f128) {
2171     // It's legal except when f128 is involved
2172     return Op;
2173   }
2174
2175   RTLIB::Libcall LC;
2176   LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
2177
2178   // FP_ROUND node has a second operand indicating whether it is known to be
2179   // precise. That doesn't take part in the LibCall so we can't directly use
2180   // LowerF128Call.
2181   SDValue SrcVal = Op.getOperand(0);
2182   return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
2183                      SDLoc(Op)).first;
2184 }
2185
2186 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
2187   // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2188   // Any additional optimization in this function should be recorded
2189   // in the cost tables.
2190   EVT InVT = Op.getOperand(0).getValueType();
2191   EVT VT = Op.getValueType();
2192   unsigned NumElts = InVT.getVectorNumElements();
2193
2194   // f16 vectors are promoted to f32 before a conversion.
2195   if (InVT.getVectorElementType() == MVT::f16) {
2196     MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
2197     SDLoc dl(Op);
2198     return DAG.getNode(
2199         Op.getOpcode(), dl, Op.getValueType(),
2200         DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
2201   }
2202
2203   if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2204     SDLoc dl(Op);
2205     SDValue Cv =
2206         DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
2207                     Op.getOperand(0));
2208     return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
2209   }
2210
2211   if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2212     SDLoc dl(Op);
2213     MVT ExtVT =
2214         MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
2215                          VT.getVectorNumElements());
2216     SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
2217     return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
2218   }
2219
2220   // Type changing conversions are illegal.
2221   return Op;
2222 }
2223
2224 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
2225                                               SelectionDAG &DAG) const {
2226   if (Op.getOperand(0).getValueType().isVector())
2227     return LowerVectorFP_TO_INT(Op, DAG);
2228
2229   // f16 conversions are promoted to f32 when full fp16 is not supported.
2230   if (Op.getOperand(0).getValueType() == MVT::f16 &&
2231       !Subtarget->hasFullFP16()) {
2232     SDLoc dl(Op);
2233     return DAG.getNode(
2234         Op.getOpcode(), dl, Op.getValueType(),
2235         DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
2236   }
2237
2238   if (Op.getOperand(0).getValueType() != MVT::f128) {
2239     // It's legal except when f128 is involved
2240     return Op;
2241   }
2242
2243   RTLIB::Libcall LC;
2244   if (Op.getOpcode() == ISD::FP_TO_SINT)
2245     LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
2246   else
2247     LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
2248
2249   SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
2250   return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
2251 }
2252
2253 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
2254   // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
2255   // Any additional optimization in this function should be recorded
2256   // in the cost tables.
2257   EVT VT = Op.getValueType();
2258   SDLoc dl(Op);
2259   SDValue In = Op.getOperand(0);
2260   EVT InVT = In.getValueType();
2261
2262   if (VT.getSizeInBits() < InVT.getSizeInBits()) {
2263     MVT CastVT =
2264         MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
2265                          InVT.getVectorNumElements());
2266     In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
2267     return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
2268   }
2269
2270   if (VT.getSizeInBits() > InVT.getSizeInBits()) {
2271     unsigned CastOpc =
2272         Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2273     EVT CastVT = VT.changeVectorElementTypeToInteger();
2274     In = DAG.getNode(CastOpc, dl, CastVT, In);
2275     return DAG.getNode(Op.getOpcode(), dl, VT, In);
2276   }
2277
2278   return Op;
2279 }
2280
2281 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
2282                                             SelectionDAG &DAG) const {
2283   if (Op.getValueType().isVector())
2284     return LowerVectorINT_TO_FP(Op, DAG);
2285
2286   // f16 conversions are promoted to f32 when full fp16 is not supported.
2287   if (Op.getValueType() == MVT::f16 &&
2288       !Subtarget->hasFullFP16()) {
2289     SDLoc dl(Op);
2290     return DAG.getNode(
2291         ISD::FP_ROUND, dl, MVT::f16,
2292         DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
2293         DAG.getIntPtrConstant(0, dl));
2294   }
2295
2296   // i128 conversions are libcalls.
2297   if (Op.getOperand(0).getValueType() == MVT::i128)
2298     return SDValue();
2299
2300   // Other conversions are legal, unless it's to the completely software-based
2301   // fp128.
2302   if (Op.getValueType() != MVT::f128)
2303     return Op;
2304
2305   RTLIB::Libcall LC;
2306   if (Op.getOpcode() == ISD::SINT_TO_FP)
2307     LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2308   else
2309     LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
2310
2311   return LowerF128Call(Op, DAG, LC);
2312 }
2313
2314 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
2315                                             SelectionDAG &DAG) const {
2316   // For iOS, we want to call an alternative entry point: __sincos_stret,
2317   // which returns the values in two S / D registers.
2318   SDLoc dl(Op);
2319   SDValue Arg = Op.getOperand(0);
2320   EVT ArgVT = Arg.getValueType();
2321   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
2322
2323   ArgListTy Args;
2324   ArgListEntry Entry;
2325
2326   Entry.Node = Arg;
2327   Entry.Ty = ArgTy;
2328   Entry.IsSExt = false;
2329   Entry.IsZExt = false;
2330   Args.push_back(Entry);
2331
2332   RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
2333                                         : RTLIB::SINCOS_STRET_F32;
2334   const char *LibcallName = getLibcallName(LC);
2335   SDValue Callee =
2336       DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
2337
2338   StructType *RetTy = StructType::get(ArgTy, ArgTy);
2339   TargetLowering::CallLoweringInfo CLI(DAG);
2340   CLI.setDebugLoc(dl)
2341       .setChain(DAG.getEntryNode())
2342       .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
2343
2344   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
2345   return CallResult.first;
2346 }
2347
2348 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
2349   if (Op.getValueType() != MVT::f16)
2350     return SDValue();
2351
2352   assert(Op.getOperand(0).getValueType() == MVT::i16);
2353   SDLoc DL(Op);
2354
2355   Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
2356   Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
2357   return SDValue(
2358       DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
2359                          DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
2360       0);
2361 }
2362
2363 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
2364   if (OrigVT.getSizeInBits() >= 64)
2365     return OrigVT;
2366
2367   assert(OrigVT.isSimple() && "Expecting a simple value type");
2368
2369   MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
2370   switch (OrigSimpleTy) {
2371   default: llvm_unreachable("Unexpected Vector Type");
2372   case MVT::v2i8:
2373   case MVT::v2i16:
2374      return MVT::v2i32;
2375   case MVT::v4i8:
2376     return  MVT::v4i16;
2377   }
2378 }
2379
2380 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
2381                                                  const EVT &OrigTy,
2382                                                  const EVT &ExtTy,
2383                                                  unsigned ExtOpcode) {
2384   // The vector originally had a size of OrigTy. It was then extended to ExtTy.
2385   // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
2386   // 64-bits we need to insert a new extension so that it will be 64-bits.
2387   assert(ExtTy.is128BitVector() && "Unexpected extension size");
2388   if (OrigTy.getSizeInBits() >= 64)
2389     return N;
2390
2391   // Must extend size to at least 64 bits to be used as an operand for VMULL.
2392   EVT NewVT = getExtensionTo64Bits(OrigTy);
2393
2394   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
2395 }
2396
2397 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
2398                                    bool isSigned) {
2399   EVT VT = N->getValueType(0);
2400
2401   if (N->getOpcode() != ISD::BUILD_VECTOR)
2402     return false;
2403
2404   for (const SDValue &Elt : N->op_values()) {
2405     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
2406       unsigned EltSize = VT.getScalarSizeInBits();
2407       unsigned HalfSize = EltSize / 2;
2408       if (isSigned) {
2409         if (!isIntN(HalfSize, C->getSExtValue()))
2410           return false;
2411       } else {
2412         if (!isUIntN(HalfSize, C->getZExtValue()))
2413           return false;
2414       }
2415       continue;
2416     }
2417     return false;
2418   }
2419
2420   return true;
2421 }
2422
2423 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
2424   if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
2425     return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
2426                                              N->getOperand(0)->getValueType(0),
2427                                              N->getValueType(0),
2428                                              N->getOpcode());
2429
2430   assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
2431   EVT VT = N->getValueType(0);
2432   SDLoc dl(N);
2433   unsigned EltSize = VT.getScalarSizeInBits() / 2;
2434   unsigned NumElts = VT.getVectorNumElements();
2435   MVT TruncVT = MVT::getIntegerVT(EltSize);
2436   SmallVector<SDValue, 8> Ops;
2437   for (unsigned i = 0; i != NumElts; ++i) {
2438     ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
2439     const APInt &CInt = C->getAPIntValue();
2440     // Element types smaller than 32 bits are not legal, so use i32 elements.
2441     // The values are implicitly truncated so sext vs. zext doesn't matter.
2442     Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
2443   }
2444   return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
2445 }
2446
2447 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
2448   return N->getOpcode() == ISD::SIGN_EXTEND ||
2449          isExtendedBUILD_VECTOR(N, DAG, true);
2450 }
2451
2452 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
2453   return N->getOpcode() == ISD::ZERO_EXTEND ||
2454          isExtendedBUILD_VECTOR(N, DAG, false);
2455 }
2456
2457 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
2458   unsigned Opcode = N->getOpcode();
2459   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2460     SDNode *N0 = N->getOperand(0).getNode();
2461     SDNode *N1 = N->getOperand(1).getNode();
2462     return N0->hasOneUse() && N1->hasOneUse() &&
2463       isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
2464   }
2465   return false;
2466 }
2467
2468 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
2469   unsigned Opcode = N->getOpcode();
2470   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
2471     SDNode *N0 = N->getOperand(0).getNode();
2472     SDNode *N1 = N->getOperand(1).getNode();
2473     return N0->hasOneUse() && N1->hasOneUse() &&
2474       isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
2475   }
2476   return false;
2477 }
2478
2479 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
2480   // Multiplications are only custom-lowered for 128-bit vectors so that
2481   // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
2482   EVT VT = Op.getValueType();
2483   assert(VT.is128BitVector() && VT.isInteger() &&
2484          "unexpected type for custom-lowering ISD::MUL");
2485   SDNode *N0 = Op.getOperand(0).getNode();
2486   SDNode *N1 = Op.getOperand(1).getNode();
2487   unsigned NewOpc = 0;
2488   bool isMLA = false;
2489   bool isN0SExt = isSignExtended(N0, DAG);
2490   bool isN1SExt = isSignExtended(N1, DAG);
2491   if (isN0SExt && isN1SExt)
2492     NewOpc = AArch64ISD::SMULL;
2493   else {
2494     bool isN0ZExt = isZeroExtended(N0, DAG);
2495     bool isN1ZExt = isZeroExtended(N1, DAG);
2496     if (isN0ZExt && isN1ZExt)
2497       NewOpc = AArch64ISD::UMULL;
2498     else if (isN1SExt || isN1ZExt) {
2499       // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
2500       // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
2501       if (isN1SExt && isAddSubSExt(N0, DAG)) {
2502         NewOpc = AArch64ISD::SMULL;
2503         isMLA = true;
2504       } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
2505         NewOpc =  AArch64ISD::UMULL;
2506         isMLA = true;
2507       } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
2508         std::swap(N0, N1);
2509         NewOpc =  AArch64ISD::UMULL;
2510         isMLA = true;
2511       }
2512     }
2513
2514     if (!NewOpc) {
2515       if (VT == MVT::v2i64)
2516         // Fall through to expand this.  It is not legal.
2517         return SDValue();
2518       else
2519         // Other vector multiplications are legal.
2520         return Op;
2521     }
2522   }
2523
2524   // Legalize to a S/UMULL instruction
2525   SDLoc DL(Op);
2526   SDValue Op0;
2527   SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
2528   if (!isMLA) {
2529     Op0 = skipExtensionForVectorMULL(N0, DAG);
2530     assert(Op0.getValueType().is64BitVector() &&
2531            Op1.getValueType().is64BitVector() &&
2532            "unexpected types for extended operands to VMULL");
2533     return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
2534   }
2535   // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
2536   // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
2537   // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
2538   SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
2539   SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
2540   EVT Op1VT = Op1.getValueType();
2541   return DAG.getNode(N0->getOpcode(), DL, VT,
2542                      DAG.getNode(NewOpc, DL, VT,
2543                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
2544                      DAG.getNode(NewOpc, DL, VT,
2545                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
2546 }
2547
2548 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
2549                                                      SelectionDAG &DAG) const {
2550   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2551   SDLoc dl(Op);
2552   switch (IntNo) {
2553   default: return SDValue();    // Don't custom lower most intrinsics.
2554   case Intrinsic::thread_pointer: {
2555     EVT PtrVT = getPointerTy(DAG.getDataLayout());
2556     return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
2557   }
2558   case Intrinsic::aarch64_neon_abs:
2559     return DAG.getNode(ISD::ABS, dl, Op.getValueType(),
2560                        Op.getOperand(1));
2561   case Intrinsic::aarch64_neon_smax:
2562     return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
2563                        Op.getOperand(1), Op.getOperand(2));
2564   case Intrinsic::aarch64_neon_umax:
2565     return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
2566                        Op.getOperand(1), Op.getOperand(2));
2567   case Intrinsic::aarch64_neon_smin:
2568     return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
2569                        Op.getOperand(1), Op.getOperand(2));
2570   case Intrinsic::aarch64_neon_umin:
2571     return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
2572                        Op.getOperand(1), Op.getOperand(2));
2573   }
2574 }
2575
2576 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
2577                                               SelectionDAG &DAG) const {
2578   DEBUG(dbgs() << "Custom lowering: ");
2579   DEBUG(Op.dump());
2580
2581   switch (Op.getOpcode()) {
2582   default:
2583     llvm_unreachable("unimplemented operand");
2584     return SDValue();
2585   case ISD::BITCAST:
2586     return LowerBITCAST(Op, DAG);
2587   case ISD::GlobalAddress:
2588     return LowerGlobalAddress(Op, DAG);
2589   case ISD::GlobalTLSAddress:
2590     return LowerGlobalTLSAddress(Op, DAG);
2591   case ISD::SETCC:
2592     return LowerSETCC(Op, DAG);
2593   case ISD::BR_CC:
2594     return LowerBR_CC(Op, DAG);
2595   case ISD::SELECT:
2596     return LowerSELECT(Op, DAG);
2597   case ISD::SELECT_CC:
2598     return LowerSELECT_CC(Op, DAG);
2599   case ISD::JumpTable:
2600     return LowerJumpTable(Op, DAG);
2601   case ISD::ConstantPool:
2602     return LowerConstantPool(Op, DAG);
2603   case ISD::BlockAddress:
2604     return LowerBlockAddress(Op, DAG);
2605   case ISD::VASTART:
2606     return LowerVASTART(Op, DAG);
2607   case ISD::VACOPY:
2608     return LowerVACOPY(Op, DAG);
2609   case ISD::VAARG:
2610     return LowerVAARG(Op, DAG);
2611   case ISD::ADDC:
2612   case ISD::ADDE:
2613   case ISD::SUBC:
2614   case ISD::SUBE:
2615     return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
2616   case ISD::SADDO:
2617   case ISD::UADDO:
2618   case ISD::SSUBO:
2619   case ISD::USUBO:
2620   case ISD::SMULO:
2621   case ISD::UMULO:
2622     return LowerXALUO(Op, DAG);
2623   case ISD::FADD:
2624     return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
2625   case ISD::FSUB:
2626     return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
2627   case ISD::FMUL:
2628     return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
2629   case ISD::FDIV:
2630     return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
2631   case ISD::FP_ROUND:
2632     return LowerFP_ROUND(Op, DAG);
2633   case ISD::FP_EXTEND:
2634     return LowerFP_EXTEND(Op, DAG);
2635   case ISD::FRAMEADDR:
2636     return LowerFRAMEADDR(Op, DAG);
2637   case ISD::RETURNADDR:
2638     return LowerRETURNADDR(Op, DAG);
2639   case ISD::INSERT_VECTOR_ELT:
2640     return LowerINSERT_VECTOR_ELT(Op, DAG);
2641   case ISD::EXTRACT_VECTOR_ELT:
2642     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2643   case ISD::BUILD_VECTOR:
2644     return LowerBUILD_VECTOR(Op, DAG);
2645   case ISD::VECTOR_SHUFFLE:
2646     return LowerVECTOR_SHUFFLE(Op, DAG);
2647   case ISD::EXTRACT_SUBVECTOR:
2648     return LowerEXTRACT_SUBVECTOR(Op, DAG);
2649   case ISD::SRA:
2650   case ISD::SRL:
2651   case ISD::SHL:
2652     return LowerVectorSRA_SRL_SHL(Op, DAG);
2653   case ISD::SHL_PARTS:
2654     return LowerShiftLeftParts(Op, DAG);
2655   case ISD::SRL_PARTS:
2656   case ISD::SRA_PARTS:
2657     return LowerShiftRightParts(Op, DAG);
2658   case ISD::CTPOP:
2659     return LowerCTPOP(Op, DAG);
2660   case ISD::FCOPYSIGN:
2661     return LowerFCOPYSIGN(Op, DAG);
2662   case ISD::AND:
2663     return LowerVectorAND(Op, DAG);
2664   case ISD::OR:
2665     return LowerVectorOR(Op, DAG);
2666   case ISD::XOR:
2667     return LowerXOR(Op, DAG);
2668   case ISD::PREFETCH:
2669     return LowerPREFETCH(Op, DAG);
2670   case ISD::SINT_TO_FP:
2671   case ISD::UINT_TO_FP:
2672     return LowerINT_TO_FP(Op, DAG);
2673   case ISD::FP_TO_SINT:
2674   case ISD::FP_TO_UINT:
2675     return LowerFP_TO_INT(Op, DAG);
2676   case ISD::FSINCOS:
2677     return LowerFSINCOS(Op, DAG);
2678   case ISD::MUL:
2679     return LowerMUL(Op, DAG);
2680   case ISD::INTRINSIC_WO_CHAIN:
2681     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
2682   case ISD::VECREDUCE_ADD:
2683   case ISD::VECREDUCE_SMAX:
2684   case ISD::VECREDUCE_SMIN:
2685   case ISD::VECREDUCE_UMAX:
2686   case ISD::VECREDUCE_UMIN:
2687   case ISD::VECREDUCE_FMAX:
2688   case ISD::VECREDUCE_FMIN:
2689     return LowerVECREDUCE(Op, DAG);
2690   case ISD::ATOMIC_LOAD_SUB:
2691     return LowerATOMIC_LOAD_SUB(Op, DAG);
2692   case ISD::ATOMIC_LOAD_AND:
2693     return LowerATOMIC_LOAD_AND(Op, DAG);
2694   case ISD::DYNAMIC_STACKALLOC:
2695     return LowerDYNAMIC_STACKALLOC(Op, DAG);
2696   }
2697 }
2698
2699 //===----------------------------------------------------------------------===//
2700 //                      Calling Convention Implementation
2701 //===----------------------------------------------------------------------===//
2702
2703 #include "AArch64GenCallingConv.inc"
2704
2705 /// Selects the correct CCAssignFn for a given CallingConvention value.
2706 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
2707                                                      bool IsVarArg) const {
2708   switch (CC) {
2709   default:
2710     report_fatal_error("Unsupported calling convention.");
2711   case CallingConv::WebKit_JS:
2712     return CC_AArch64_WebKit_JS;
2713   case CallingConv::GHC:
2714     return CC_AArch64_GHC;
2715   case CallingConv::C:
2716   case CallingConv::Fast:
2717   case CallingConv::PreserveMost:
2718   case CallingConv::CXX_FAST_TLS:
2719   case CallingConv::Swift:
2720     if (Subtarget->isTargetWindows() && IsVarArg)
2721       return CC_AArch64_Win64_VarArg;
2722     if (!Subtarget->isTargetDarwin())
2723       return CC_AArch64_AAPCS;
2724     return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
2725   case CallingConv::Win64:
2726     return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
2727   }
2728 }
2729
2730 CCAssignFn *
2731 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
2732   return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
2733                                       : RetCC_AArch64_AAPCS;
2734 }
2735
2736 SDValue AArch64TargetLowering::LowerFormalArguments(
2737     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2738     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2739     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2740   MachineFunction &MF = DAG.getMachineFunction();
2741   MachineFrameInfo &MFI = MF.getFrameInfo();
2742   bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
2743
2744   // Assign locations to all of the incoming arguments.
2745   SmallVector<CCValAssign, 16> ArgLocs;
2746   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2747                  *DAG.getContext());
2748
2749   // At this point, Ins[].VT may already be promoted to i32. To correctly
2750   // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
2751   // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
2752   // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
2753   // we use a special version of AnalyzeFormalArguments to pass in ValVT and
2754   // LocVT.
2755   unsigned NumArgs = Ins.size();
2756   Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
2757   unsigned CurArgIdx = 0;
2758   for (unsigned i = 0; i != NumArgs; ++i) {
2759     MVT ValVT = Ins[i].VT;
2760     if (Ins[i].isOrigArg()) {
2761       std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
2762       CurArgIdx = Ins[i].getOrigArgIndex();
2763
2764       // Get type of the original argument.
2765       EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
2766                                   /*AllowUnknown*/ true);
2767       MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
2768       // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
2769       if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
2770         ValVT = MVT::i8;
2771       else if (ActualMVT == MVT::i16)
2772         ValVT = MVT::i16;
2773     }
2774     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
2775     bool Res =
2776         AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
2777     assert(!Res && "Call operand has unhandled type");
2778     (void)Res;
2779   }
2780   assert(ArgLocs.size() == Ins.size());
2781   SmallVector<SDValue, 16> ArgValues;
2782   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2783     CCValAssign &VA = ArgLocs[i];
2784
2785     if (Ins[i].Flags.isByVal()) {
2786       // Byval is used for HFAs in the PCS, but the system should work in a
2787       // non-compliant manner for larger structs.
2788       EVT PtrVT = getPointerTy(DAG.getDataLayout());
2789       int Size = Ins[i].Flags.getByValSize();
2790       unsigned NumRegs = (Size + 7) / 8;
2791
2792       // FIXME: This works on big-endian for composite byvals, which are the common
2793       // case. It should also work for fundamental types too.
2794       unsigned FrameIdx =
2795         MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
2796       SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
2797       InVals.push_back(FrameIdxN);
2798
2799       continue;
2800     }
2801
2802     if (VA.isRegLoc()) {
2803       // Arguments stored in registers.
2804       EVT RegVT = VA.getLocVT();
2805
2806       SDValue ArgValue;
2807       const TargetRegisterClass *RC;
2808
2809       if (RegVT == MVT::i32)
2810         RC = &AArch64::GPR32RegClass;
2811       else if (RegVT == MVT::i64)
2812         RC = &AArch64::GPR64RegClass;
2813       else if (RegVT == MVT::f16)
2814         RC = &AArch64::FPR16RegClass;
2815       else if (RegVT == MVT::f32)
2816         RC = &AArch64::FPR32RegClass;
2817       else if (RegVT == MVT::f64 || RegVT.is64BitVector())
2818         RC = &AArch64::FPR64RegClass;
2819       else if (RegVT == MVT::f128 || RegVT.is128BitVector())
2820         RC = &AArch64::FPR128RegClass;
2821       else
2822         llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
2823
2824       // Transform the arguments in physical registers into virtual ones.
2825       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2826       ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
2827
2828       // If this is an 8, 16 or 32-bit value, it is really passed promoted
2829       // to 64 bits.  Insert an assert[sz]ext to capture this, then
2830       // truncate to the right size.
2831       switch (VA.getLocInfo()) {
2832       default:
2833         llvm_unreachable("Unknown loc info!");
2834       case CCValAssign::Full:
2835         break;
2836       case CCValAssign::BCvt:
2837         ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
2838         break;
2839       case CCValAssign::AExt:
2840       case CCValAssign::SExt:
2841       case CCValAssign::ZExt:
2842         // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
2843         // nodes after our lowering.
2844         assert(RegVT == Ins[i].VT && "incorrect register location selected");
2845         break;
2846       }
2847
2848       InVals.push_back(ArgValue);
2849
2850     } else { // VA.isRegLoc()
2851       assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
2852       unsigned ArgOffset = VA.getLocMemOffset();
2853       unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
2854
2855       uint32_t BEAlign = 0;
2856       if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
2857           !Ins[i].Flags.isInConsecutiveRegs())
2858         BEAlign = 8 - ArgSize;
2859
2860       int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
2861
2862       // Create load nodes to retrieve arguments from the stack.
2863       SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2864       SDValue ArgValue;
2865
2866       // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
2867       ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2868       MVT MemVT = VA.getValVT();
2869
2870       switch (VA.getLocInfo()) {
2871       default:
2872         break;
2873       case CCValAssign::BCvt:
2874         MemVT = VA.getLocVT();
2875         break;
2876       case CCValAssign::SExt:
2877         ExtType = ISD::SEXTLOAD;
2878         break;
2879       case CCValAssign::ZExt:
2880         ExtType = ISD::ZEXTLOAD;
2881         break;
2882       case CCValAssign::AExt:
2883         ExtType = ISD::EXTLOAD;
2884         break;
2885       }
2886
2887       ArgValue = DAG.getExtLoad(
2888           ExtType, DL, VA.getLocVT(), Chain, FIN,
2889           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
2890           MemVT);
2891
2892       InVals.push_back(ArgValue);
2893     }
2894   }
2895
2896   // varargs
2897   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2898   if (isVarArg) {
2899     if (!Subtarget->isTargetDarwin() || IsWin64) {
2900       // The AAPCS variadic function ABI is identical to the non-variadic
2901       // one. As a result there may be more arguments in registers and we should
2902       // save them for future reference.
2903       // Win64 variadic functions also pass arguments in registers, but all float
2904       // arguments are passed in integer registers.
2905       saveVarArgRegisters(CCInfo, DAG, DL, Chain);
2906     }
2907
2908     // This will point to the next argument passed via stack.
2909     unsigned StackOffset = CCInfo.getNextStackOffset();
2910     // We currently pass all varargs at 8-byte alignment.
2911     StackOffset = ((StackOffset + 7) & ~7);
2912     FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
2913   }
2914
2915   unsigned StackArgSize = CCInfo.getNextStackOffset();
2916   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2917   if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
2918     // This is a non-standard ABI so by fiat I say we're allowed to make full
2919     // use of the stack area to be popped, which must be aligned to 16 bytes in
2920     // any case:
2921     StackArgSize = alignTo(StackArgSize, 16);
2922
2923     // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
2924     // a multiple of 16.
2925     FuncInfo->setArgumentStackToRestore(StackArgSize);
2926
2927     // This realignment carries over to the available bytes below. Our own
2928     // callers will guarantee the space is free by giving an aligned value to
2929     // CALLSEQ_START.
2930   }
2931   // Even if we're not expected to free up the space, it's useful to know how
2932   // much is there while considering tail calls (because we can reuse it).
2933   FuncInfo->setBytesInStackArgArea(StackArgSize);
2934
2935   return Chain;
2936 }
2937
2938 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
2939                                                 SelectionDAG &DAG,
2940                                                 const SDLoc &DL,
2941                                                 SDValue &Chain) const {
2942   MachineFunction &MF = DAG.getMachineFunction();
2943   MachineFrameInfo &MFI = MF.getFrameInfo();
2944   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2945   auto PtrVT = getPointerTy(DAG.getDataLayout());
2946   bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
2947
2948   SmallVector<SDValue, 8> MemOps;
2949
2950   static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
2951                                           AArch64::X3, AArch64::X4, AArch64::X5,
2952                                           AArch64::X6, AArch64::X7 };
2953   static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
2954   unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
2955
2956   unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
2957   int GPRIdx = 0;
2958   if (GPRSaveSize != 0) {
2959     if (IsWin64) {
2960       GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
2961       if (GPRSaveSize & 15)
2962         // The extra size here, if triggered, will always be 8.
2963         MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
2964     } else
2965       GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
2966
2967     SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
2968
2969     for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
2970       unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
2971       SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
2972       SDValue Store = DAG.getStore(
2973           Val.getValue(1), DL, Val, FIN,
2974           IsWin64
2975               ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
2976                                                   GPRIdx,
2977                                                   (i - FirstVariadicGPR) * 8)
2978               : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
2979       MemOps.push_back(Store);
2980       FIN =
2981           DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
2982     }
2983   }
2984   FuncInfo->setVarArgsGPRIndex(GPRIdx);
2985   FuncInfo->setVarArgsGPRSize(GPRSaveSize);
2986
2987   if (Subtarget->hasFPARMv8() && !IsWin64) {
2988     static const MCPhysReg FPRArgRegs[] = {
2989         AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
2990         AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
2991     static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
2992     unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
2993
2994     unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
2995     int FPRIdx = 0;
2996     if (FPRSaveSize != 0) {
2997       FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
2998
2999       SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
3000
3001       for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
3002         unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
3003         SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
3004
3005         SDValue Store = DAG.getStore(
3006             Val.getValue(1), DL, Val, FIN,
3007             MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
3008         MemOps.push_back(Store);
3009         FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
3010                           DAG.getConstant(16, DL, PtrVT));
3011       }
3012     }
3013     FuncInfo->setVarArgsFPRIndex(FPRIdx);
3014     FuncInfo->setVarArgsFPRSize(FPRSaveSize);
3015   }
3016
3017   if (!MemOps.empty()) {
3018     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3019   }
3020 }
3021
3022 /// LowerCallResult - Lower the result values of a call into the
3023 /// appropriate copies out of appropriate physical registers.
3024 SDValue AArch64TargetLowering::LowerCallResult(
3025     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3026     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3027     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
3028     SDValue ThisVal) const {
3029   CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3030                           ? RetCC_AArch64_WebKit_JS
3031                           : RetCC_AArch64_AAPCS;
3032   // Assign locations to each value returned by this call.
3033   SmallVector<CCValAssign, 16> RVLocs;
3034   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3035                  *DAG.getContext());
3036   CCInfo.AnalyzeCallResult(Ins, RetCC);
3037
3038   // Copy all of the result registers out of their specified physreg.
3039   for (unsigned i = 0; i != RVLocs.size(); ++i) {
3040     CCValAssign VA = RVLocs[i];
3041
3042     // Pass 'this' value directly from the argument to return value, to avoid
3043     // reg unit interference
3044     if (i == 0 && isThisReturn) {
3045       assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
3046              "unexpected return calling convention register assignment");
3047       InVals.push_back(ThisVal);
3048       continue;
3049     }
3050
3051     SDValue Val =
3052         DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
3053     Chain = Val.getValue(1);
3054     InFlag = Val.getValue(2);
3055
3056     switch (VA.getLocInfo()) {
3057     default:
3058       llvm_unreachable("Unknown loc info!");
3059     case CCValAssign::Full:
3060       break;
3061     case CCValAssign::BCvt:
3062       Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3063       break;
3064     }
3065
3066     InVals.push_back(Val);
3067   }
3068
3069   return Chain;
3070 }
3071
3072 /// Return true if the calling convention is one that we can guarantee TCO for.
3073 static bool canGuaranteeTCO(CallingConv::ID CC) {
3074   return CC == CallingConv::Fast;
3075 }
3076
3077 /// Return true if we might ever do TCO for calls with this calling convention.
3078 static bool mayTailCallThisCC(CallingConv::ID CC) {
3079   switch (CC) {
3080   case CallingConv::C:
3081   case CallingConv::PreserveMost:
3082   case CallingConv::Swift:
3083     return true;
3084   default:
3085     return canGuaranteeTCO(CC);
3086   }
3087 }
3088
3089 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
3090     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3091     const SmallVectorImpl<ISD::OutputArg> &Outs,
3092     const SmallVectorImpl<SDValue> &OutVals,
3093     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3094   if (!mayTailCallThisCC(CalleeCC))
3095     return false;
3096
3097   MachineFunction &MF = DAG.getMachineFunction();
3098   const Function &CallerF = MF.getFunction();
3099   CallingConv::ID CallerCC = CallerF.getCallingConv();
3100   bool CCMatch = CallerCC == CalleeCC;
3101
3102   // Byval parameters hand the function a pointer directly into the stack area
3103   // we want to reuse during a tail call. Working around this *is* possible (see
3104   // X86) but less efficient and uglier in LowerCall.
3105   for (Function::const_arg_iterator i = CallerF.arg_begin(),
3106                                     e = CallerF.arg_end();
3107        i != e; ++i)
3108     if (i->hasByValAttr())
3109       return false;
3110
3111   if (getTargetMachine().Options.GuaranteedTailCallOpt)
3112     return canGuaranteeTCO(CalleeCC) && CCMatch;
3113
3114   // Externally-defined functions with weak linkage should not be
3115   // tail-called on AArch64 when the OS does not support dynamic
3116   // pre-emption of symbols, as the AAELF spec requires normal calls
3117   // to undefined weak functions to be replaced with a NOP or jump to the
3118   // next instruction. The behaviour of branch instructions in this
3119   // situation (as used for tail calls) is implementation-defined, so we
3120   // cannot rely on the linker replacing the tail call with a return.
3121   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3122     const GlobalValue *GV = G->getGlobal();
3123     const Triple &TT = getTargetMachine().getTargetTriple();
3124     if (GV->hasExternalWeakLinkage() &&
3125         (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3126       return false;
3127   }
3128
3129   // Now we search for cases where we can use a tail call without changing the
3130   // ABI. Sibcall is used in some places (particularly gcc) to refer to this
3131   // concept.
3132
3133   // I want anyone implementing a new calling convention to think long and hard
3134   // about this assert.
3135   assert((!isVarArg || CalleeCC == CallingConv::C) &&
3136          "Unexpected variadic calling convention");
3137
3138   LLVMContext &C = *DAG.getContext();
3139   if (isVarArg && !Outs.empty()) {
3140     // At least two cases here: if caller is fastcc then we can't have any
3141     // memory arguments (we'd be expected to clean up the stack afterwards). If
3142     // caller is C then we could potentially use its argument area.
3143
3144     // FIXME: for now we take the most conservative of these in both cases:
3145     // disallow all variadic memory operands.
3146     SmallVector<CCValAssign, 16> ArgLocs;
3147     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3148
3149     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
3150     for (const CCValAssign &ArgLoc : ArgLocs)
3151       if (!ArgLoc.isRegLoc())
3152         return false;
3153   }
3154
3155   // Check that the call results are passed in the same way.
3156   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3157                                   CCAssignFnForCall(CalleeCC, isVarArg),
3158                                   CCAssignFnForCall(CallerCC, isVarArg)))
3159     return false;
3160   // The callee has to preserve all registers the caller needs to preserve.
3161   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3162   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3163   if (!CCMatch) {
3164     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3165     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3166       return false;
3167   }
3168
3169   // Nothing more to check if the callee is taking no arguments
3170   if (Outs.empty())
3171     return true;
3172
3173   SmallVector<CCValAssign, 16> ArgLocs;
3174   CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3175
3176   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
3177
3178   const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3179
3180   // If the stack arguments for this call do not fit into our own save area then
3181   // the call cannot be made tail.
3182   if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3183     return false;
3184
3185   const MachineRegisterInfo &MRI = MF.getRegInfo();
3186   if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3187     return false;
3188
3189   return true;
3190 }
3191
3192 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
3193                                                    SelectionDAG &DAG,
3194                                                    MachineFrameInfo &MFI,
3195                                                    int ClobberedFI) const {
3196   SmallVector<SDValue, 8> ArgChains;
3197   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
3198   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
3199
3200   // Include the original chain at the beginning of the list. When this is
3201   // used by target LowerCall hooks, this helps legalize find the
3202   // CALLSEQ_BEGIN node.
3203   ArgChains.push_back(Chain);
3204
3205   // Add a chain value for each stack argument corresponding
3206   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
3207                             UE = DAG.getEntryNode().getNode()->use_end();
3208        U != UE; ++U)
3209     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
3210       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
3211         if (FI->getIndex() < 0) {
3212           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
3213           int64_t InLastByte = InFirstByte;
3214           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
3215
3216           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
3217               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
3218             ArgChains.push_back(SDValue(L, 1));
3219         }
3220
3221   // Build a tokenfactor for all the chains.
3222   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
3223 }
3224
3225 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
3226                                                    bool TailCallOpt) const {
3227   return CallCC == CallingConv::Fast && TailCallOpt;
3228 }
3229
3230 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
3231 /// and add input and output parameter nodes.
3232 SDValue
3233 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
3234                                  SmallVectorImpl<SDValue> &InVals) const {
3235   SelectionDAG &DAG = CLI.DAG;
3236   SDLoc &DL = CLI.DL;
3237   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3238   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3239   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3240   SDValue Chain = CLI.Chain;
3241   SDValue Callee = CLI.Callee;
3242   bool &IsTailCall = CLI.IsTailCall;
3243   CallingConv::ID CallConv = CLI.CallConv;
3244   bool IsVarArg = CLI.IsVarArg;
3245
3246   MachineFunction &MF = DAG.getMachineFunction();
3247   bool IsThisReturn = false;
3248
3249   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
3250   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3251   bool IsSibCall = false;
3252
3253   if (IsTailCall) {
3254     // Check if it's really possible to do a tail call.
3255     IsTailCall = isEligibleForTailCallOptimization(
3256         Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3257     if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
3258       report_fatal_error("failed to perform tail call elimination on a call "
3259                          "site marked musttail");
3260
3261     // A sibling call is one where we're under the usual C ABI and not planning
3262     // to change that but can still do a tail call:
3263     if (!TailCallOpt && IsTailCall)
3264       IsSibCall = true;
3265
3266     if (IsTailCall)
3267       ++NumTailCalls;
3268   }
3269
3270   // Analyze operands of the call, assigning locations to each operand.
3271   SmallVector<CCValAssign, 16> ArgLocs;
3272   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
3273                  *DAG.getContext());
3274
3275   if (IsVarArg) {
3276     // Handle fixed and variable vector arguments differently.
3277     // Variable vector arguments always go into memory.
3278     unsigned NumArgs = Outs.size();
3279
3280     for (unsigned i = 0; i != NumArgs; ++i) {
3281       MVT ArgVT = Outs[i].VT;
3282       ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3283       CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
3284                                                /*IsVarArg=*/ !Outs[i].IsFixed);
3285       bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
3286       assert(!Res && "Call operand has unhandled type");
3287       (void)Res;
3288     }
3289   } else {
3290     // At this point, Outs[].VT may already be promoted to i32. To correctly
3291     // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
3292     // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
3293     // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
3294     // we use a special version of AnalyzeCallOperands to pass in ValVT and
3295     // LocVT.
3296     unsigned NumArgs = Outs.size();
3297     for (unsigned i = 0; i != NumArgs; ++i) {
3298       MVT ValVT = Outs[i].VT;
3299       // Get type of the original argument.
3300       EVT ActualVT = getValueType(DAG.getDataLayout(),
3301                                   CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
3302                                   /*AllowUnknown*/ true);
3303       MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
3304       ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
3305       // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
3306       if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
3307         ValVT = MVT::i8;
3308       else if (ActualMVT == MVT::i16)
3309         ValVT = MVT::i16;
3310
3311       CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
3312       bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
3313       assert(!Res && "Call operand has unhandled type");
3314       (void)Res;
3315     }
3316   }
3317
3318   // Get a count of how many bytes are to be pushed on the stack.
3319   unsigned NumBytes = CCInfo.getNextStackOffset();
3320
3321   if (IsSibCall) {
3322     // Since we're not changing the ABI to make this a tail call, the memory
3323     // operands are already available in the caller's incoming argument space.
3324     NumBytes = 0;
3325   }
3326
3327   // FPDiff is the byte offset of the call's argument area from the callee's.
3328   // Stores to callee stack arguments will be placed in FixedStackSlots offset
3329   // by this amount for a tail call. In a sibling call it must be 0 because the
3330   // caller will deallocate the entire stack and the callee still expects its
3331   // arguments to begin at SP+0. Completely unused for non-tail calls.
3332   int FPDiff = 0;
3333
3334   if (IsTailCall && !IsSibCall) {
3335     unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
3336
3337     // Since callee will pop argument stack as a tail call, we must keep the
3338     // popped size 16-byte aligned.
3339     NumBytes = alignTo(NumBytes, 16);
3340
3341     // FPDiff will be negative if this tail call requires more space than we
3342     // would automatically have in our incoming argument space. Positive if we
3343     // can actually shrink the stack.
3344     FPDiff = NumReusableBytes - NumBytes;
3345
3346     // The stack pointer must be 16-byte aligned at all times it's used for a
3347     // memory operation, which in practice means at *all* times and in
3348     // particular across call boundaries. Therefore our own arguments started at
3349     // a 16-byte aligned SP and the delta applied for the tail call should
3350     // satisfy the same constraint.
3351     assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
3352   }
3353
3354   // Adjust the stack pointer for the new arguments...
3355   // These operations are automatically eliminated by the prolog/epilog pass
3356   if (!IsSibCall)
3357     Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
3358
3359   SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
3360                                         getPointerTy(DAG.getDataLayout()));
3361
3362   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3363   SmallVector<SDValue, 8> MemOpChains;
3364   auto PtrVT = getPointerTy(DAG.getDataLayout());
3365
3366   // Walk the register/memloc assignments, inserting copies/loads.
3367   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
3368        ++i, ++realArgIdx) {
3369     CCValAssign &VA = ArgLocs[i];
3370     SDValue Arg = OutVals[realArgIdx];
3371     ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3372
3373     // Promote the value if needed.
3374     switch (VA.getLocInfo()) {
3375     default:
3376       llvm_unreachable("Unknown loc info!");
3377     case CCValAssign::Full:
3378       break;
3379     case CCValAssign::SExt:
3380       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3381       break;
3382     case CCValAssign::ZExt:
3383       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3384       break;
3385     case CCValAssign::AExt:
3386       if (Outs[realArgIdx].ArgVT == MVT::i1) {
3387         // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
3388         Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3389         Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
3390       }
3391       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3392       break;
3393     case CCValAssign::BCvt:
3394       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3395       break;
3396     case CCValAssign::FPExt:
3397       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3398       break;
3399     }
3400
3401     if (VA.isRegLoc()) {
3402       if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
3403           Outs[0].VT == MVT::i64) {
3404         assert(VA.getLocVT() == MVT::i64 &&
3405                "unexpected calling convention register assignment");
3406         assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
3407                "unexpected use of 'returned'");
3408         IsThisReturn = true;
3409       }
3410       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3411     } else {
3412       assert(VA.isMemLoc());
3413
3414       SDValue DstAddr;
3415       MachinePointerInfo DstInfo;
3416
3417       // FIXME: This works on big-endian for composite byvals, which are the
3418       // common case. It should also work for fundamental types too.
3419       uint32_t BEAlign = 0;
3420       unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
3421                                         : VA.getValVT().getSizeInBits();
3422       OpSize = (OpSize + 7) / 8;
3423       if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
3424           !Flags.isInConsecutiveRegs()) {
3425         if (OpSize < 8)
3426           BEAlign = 8 - OpSize;
3427       }
3428       unsigned LocMemOffset = VA.getLocMemOffset();
3429       int32_t Offset = LocMemOffset + BEAlign;
3430       SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3431       PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3432
3433       if (IsTailCall) {
3434         Offset = Offset + FPDiff;
3435         int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3436
3437         DstAddr = DAG.getFrameIndex(FI, PtrVT);
3438         DstInfo =
3439             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
3440
3441         // Make sure any stack arguments overlapping with where we're storing
3442         // are loaded before this eventual operation. Otherwise they'll be
3443         // clobbered.
3444         Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
3445       } else {
3446         SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
3447
3448         DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
3449         DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
3450                                                LocMemOffset);
3451       }
3452
3453       if (Outs[i].Flags.isByVal()) {
3454         SDValue SizeNode =
3455             DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
3456         SDValue Cpy = DAG.getMemcpy(
3457             Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
3458             /*isVol = */ false, /*AlwaysInline = */ false,
3459             /*isTailCall = */ false,
3460             DstInfo, MachinePointerInfo());
3461
3462         MemOpChains.push_back(Cpy);
3463       } else {
3464         // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
3465         // promoted to a legal register type i32, we should truncate Arg back to
3466         // i1/i8/i16.
3467         if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
3468             VA.getValVT() == MVT::i16)
3469           Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
3470
3471         SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
3472         MemOpChains.push_back(Store);
3473       }
3474     }
3475   }
3476
3477   if (!MemOpChains.empty())
3478     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3479
3480   // Build a sequence of copy-to-reg nodes chained together with token chain
3481   // and flag operands which copy the outgoing args into the appropriate regs.
3482   SDValue InFlag;
3483   for (auto &RegToPass : RegsToPass) {
3484     Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3485                              RegToPass.second, InFlag);
3486     InFlag = Chain.getValue(1);
3487   }
3488
3489   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
3490   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
3491   // node so that legalize doesn't hack it.
3492   if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3493     auto GV = G->getGlobal();
3494     if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
3495         AArch64II::MO_GOT) {
3496       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
3497       Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3498     } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
3499       assert(Subtarget->isTargetWindows() &&
3500              "Windows is the only supported COFF target");
3501       Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
3502     } else {
3503       const GlobalValue *GV = G->getGlobal();
3504       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
3505     }
3506   } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3507     if (getTargetMachine().getCodeModel() == CodeModel::Large &&
3508         Subtarget->isTargetMachO()) {
3509       const char *Sym = S->getSymbol();
3510       Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
3511       Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
3512     } else {
3513       const char *Sym = S->getSymbol();
3514       Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
3515     }
3516   }
3517
3518   // We don't usually want to end the call-sequence here because we would tidy
3519   // the frame up *after* the call, however in the ABI-changing tail-call case
3520   // we've carefully laid out the parameters so that when sp is reset they'll be
3521   // in the correct location.
3522   if (IsTailCall && !IsSibCall) {
3523     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3524                                DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
3525     InFlag = Chain.getValue(1);
3526   }
3527
3528   std::vector<SDValue> Ops;
3529   Ops.push_back(Chain);
3530   Ops.push_back(Callee);
3531
3532   if (IsTailCall) {
3533     // Each tail call may have to adjust the stack by a different amount, so
3534     // this information must travel along with the operation for eventual
3535     // consumption by emitEpilogue.
3536     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3537   }
3538
3539   // Add argument registers to the end of the list so that they are known live
3540   // into the call.
3541   for (auto &RegToPass : RegsToPass)
3542     Ops.push_back(DAG.getRegister(RegToPass.first,
3543                                   RegToPass.second.getValueType()));
3544
3545   // Add a register mask operand representing the call-preserved registers.
3546   const uint32_t *Mask;
3547   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3548   if (IsThisReturn) {
3549     // For 'this' returns, use the X0-preserving mask if applicable
3550     Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
3551     if (!Mask) {
3552       IsThisReturn = false;
3553       Mask = TRI->getCallPreservedMask(MF, CallConv);
3554     }
3555   } else
3556     Mask = TRI->getCallPreservedMask(MF, CallConv);
3557
3558   assert(Mask && "Missing call preserved mask for calling convention");
3559   Ops.push_back(DAG.getRegisterMask(Mask));
3560
3561   if (InFlag.getNode())
3562     Ops.push_back(InFlag);
3563
3564   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3565
3566   // If we're doing a tall call, use a TC_RETURN here rather than an
3567   // actual call instruction.
3568   if (IsTailCall) {
3569     MF.getFrameInfo().setHasTailCall();
3570     return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
3571   }
3572
3573   // Returns a chain and a flag for retval copy to use.
3574   Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
3575   InFlag = Chain.getValue(1);
3576
3577   uint64_t CalleePopBytes =
3578       DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
3579
3580   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
3581                              DAG.getIntPtrConstant(CalleePopBytes, DL, true),
3582                              InFlag, DL);
3583   if (!Ins.empty())
3584     InFlag = Chain.getValue(1);
3585
3586   // Handle result values, copying them out of physregs into vregs that we
3587   // return.
3588   return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3589                          InVals, IsThisReturn,
3590                          IsThisReturn ? OutVals[0] : SDValue());
3591 }
3592
3593 bool AArch64TargetLowering::CanLowerReturn(
3594     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3595     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3596   CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3597                           ? RetCC_AArch64_WebKit_JS
3598                           : RetCC_AArch64_AAPCS;
3599   SmallVector<CCValAssign, 16> RVLocs;
3600   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3601   return CCInfo.CheckReturn(Outs, RetCC);
3602 }
3603
3604 SDValue
3605 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3606                                    bool isVarArg,
3607                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
3608                                    const SmallVectorImpl<SDValue> &OutVals,
3609                                    const SDLoc &DL, SelectionDAG &DAG) const {
3610   CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
3611                           ? RetCC_AArch64_WebKit_JS
3612                           : RetCC_AArch64_AAPCS;
3613   SmallVector<CCValAssign, 16> RVLocs;
3614   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3615                  *DAG.getContext());
3616   CCInfo.AnalyzeReturn(Outs, RetCC);
3617
3618   // Copy the result values into the output registers.
3619   SDValue Flag;
3620   SmallVector<SDValue, 4> RetOps(1, Chain);
3621   for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
3622        ++i, ++realRVLocIdx) {
3623     CCValAssign &VA = RVLocs[i];
3624     assert(VA.isRegLoc() && "Can only return in registers!");
3625     SDValue Arg = OutVals[realRVLocIdx];
3626
3627     switch (VA.getLocInfo()) {
3628     default:
3629       llvm_unreachable("Unknown loc info!");
3630     case CCValAssign::Full:
3631       if (Outs[i].ArgVT == MVT::i1) {
3632         // AAPCS requires i1 to be zero-extended to i8 by the producer of the
3633         // value. This is strictly redundant on Darwin (which uses "zeroext
3634         // i1"), but will be optimised out before ISel.
3635         Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
3636         Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3637       }
3638       break;
3639     case CCValAssign::BCvt:
3640       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3641       break;
3642     }
3643
3644     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
3645     Flag = Chain.getValue(1);
3646     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3647   }
3648   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3649   const MCPhysReg *I =
3650       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3651   if (I) {
3652     for (; *I; ++I) {
3653       if (AArch64::GPR64RegClass.contains(*I))
3654         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3655       else if (AArch64::FPR64RegClass.contains(*I))
3656         RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3657       else
3658         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3659     }
3660   }
3661
3662   RetOps[0] = Chain; // Update chain.
3663
3664   // Add the flag if we have it.
3665   if (Flag.getNode())
3666     RetOps.push_back(Flag);
3667
3668   return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
3669 }
3670
3671 //===----------------------------------------------------------------------===//
3672 //  Other Lowering Code
3673 //===----------------------------------------------------------------------===//
3674
3675 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
3676                                              SelectionDAG &DAG,
3677                                              unsigned Flag) const {
3678   return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag);
3679 }
3680
3681 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
3682                                              SelectionDAG &DAG,
3683                                              unsigned Flag) const {
3684   return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
3685 }
3686
3687 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
3688                                              SelectionDAG &DAG,
3689                                              unsigned Flag) const {
3690   return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
3691                                    N->getOffset(), Flag);
3692 }
3693
3694 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
3695                                              SelectionDAG &DAG,
3696                                              unsigned Flag) const {
3697   return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
3698 }
3699
3700 // (loadGOT sym)
3701 template <class NodeTy>
3702 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
3703                                       unsigned Flags) const {
3704   DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
3705   SDLoc DL(N);
3706   EVT Ty = getPointerTy(DAG.getDataLayout());
3707   SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
3708   // FIXME: Once remat is capable of dealing with instructions with register
3709   // operands, expand this into two nodes instead of using a wrapper node.
3710   return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
3711 }
3712
3713 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
3714 template <class NodeTy>
3715 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
3716                                             unsigned Flags) const {
3717   DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
3718   SDLoc DL(N);
3719   EVT Ty = getPointerTy(DAG.getDataLayout());
3720   const unsigned char MO_NC = AArch64II::MO_NC;
3721   return DAG.getNode(
3722       AArch64ISD::WrapperLarge, DL, Ty,
3723       getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
3724       getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
3725       getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
3726       getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
3727 }
3728
3729 // (addlow (adrp %hi(sym)) %lo(sym))
3730 template <class NodeTy>
3731 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
3732                                        unsigned Flags) const {
3733   DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
3734   SDLoc DL(N);
3735   EVT Ty = getPointerTy(DAG.getDataLayout());
3736   SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
3737   SDValue Lo = getTargetNode(N, Ty, DAG,
3738                              AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
3739   SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
3740   return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
3741 }
3742
3743 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
3744                                                   SelectionDAG &DAG) const {
3745   GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
3746   const GlobalValue *GV = GN->getGlobal();
3747   const AArch64II::TOF TargetFlags =
3748       (GV->hasDLLImportStorageClass() ? AArch64II::MO_DLLIMPORT
3749                                       : AArch64II::MO_NO_FLAG);
3750   unsigned char OpFlags =
3751       Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
3752
3753   assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
3754          "unexpected offset in global node");
3755
3756   // This also catches the large code model case for Darwin.
3757   if ((OpFlags & AArch64II::MO_GOT) != 0) {
3758     return getGOT(GN, DAG, TargetFlags);
3759   }
3760
3761   SDValue Result;
3762   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
3763     Result = getAddrLarge(GN, DAG, TargetFlags);
3764   } else {
3765     Result = getAddr(GN, DAG, TargetFlags);
3766   }
3767   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3768   SDLoc DL(GN);
3769   if (GV->hasDLLImportStorageClass())
3770     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3771                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3772   return Result;
3773 }
3774
3775 /// \brief Convert a TLS address reference into the correct sequence of loads
3776 /// and calls to compute the variable's address (for Darwin, currently) and
3777 /// return an SDValue containing the final node.
3778
3779 /// Darwin only has one TLS scheme which must be capable of dealing with the
3780 /// fully general situation, in the worst case. This means:
3781 ///     + "extern __thread" declaration.
3782 ///     + Defined in a possibly unknown dynamic library.
3783 ///
3784 /// The general system is that each __thread variable has a [3 x i64] descriptor
3785 /// which contains information used by the runtime to calculate the address. The
3786 /// only part of this the compiler needs to know about is the first xword, which
3787 /// contains a function pointer that must be called with the address of the
3788 /// entire descriptor in "x0".
3789 ///
3790 /// Since this descriptor may be in a different unit, in general even the
3791 /// descriptor must be accessed via an indirect load. The "ideal" code sequence
3792 /// is:
3793 ///     adrp x0, _var@TLVPPAGE
3794 ///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
3795 ///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
3796 ///                                      ; the function pointer
3797 ///     blr x1                           ; Uses descriptor address in x0
3798 ///     ; Address of _var is now in x0.
3799 ///
3800 /// If the address of _var's descriptor *is* known to the linker, then it can
3801 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
3802 /// a slight efficiency gain.
3803 SDValue
3804 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
3805                                                    SelectionDAG &DAG) const {
3806   assert(Subtarget->isTargetDarwin() &&
3807          "This function expects a Darwin target");
3808
3809   SDLoc DL(Op);
3810   MVT PtrVT = getPointerTy(DAG.getDataLayout());
3811   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3812
3813   SDValue TLVPAddr =
3814       DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
3815   SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
3816
3817   // The first entry in the descriptor is a function pointer that we must call
3818   // to obtain the address of the variable.
3819   SDValue Chain = DAG.getEntryNode();
3820   SDValue FuncTLVGet = DAG.getLoad(
3821       MVT::i64, DL, Chain, DescAddr,
3822       MachinePointerInfo::getGOT(DAG.getMachineFunction()),
3823       /* Alignment = */ 8,
3824       MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant |
3825           MachineMemOperand::MODereferenceable);
3826   Chain = FuncTLVGet.getValue(1);
3827
3828   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
3829   MFI.setAdjustsStack(true);
3830
3831   // TLS calls preserve all registers except those that absolutely must be
3832   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3833   // silly).
3834   const uint32_t *Mask =
3835       Subtarget->getRegisterInfo()->getTLSCallPreservedMask();
3836
3837   // Finally, we can make the call. This is just a degenerate version of a
3838   // normal AArch64 call node: x0 takes the address of the descriptor, and
3839   // returns the address of the variable in this thread.
3840   Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
3841   Chain =
3842       DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3843                   Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
3844                   DAG.getRegisterMask(Mask), Chain.getValue(1));
3845   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
3846 }
3847
3848 /// When accessing thread-local variables under either the general-dynamic or
3849 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
3850 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
3851 /// is a function pointer to carry out the resolution.
3852 ///
3853 /// The sequence is:
3854 ///    adrp  x0, :tlsdesc:var
3855 ///    ldr   x1, [x0, #:tlsdesc_lo12:var]
3856 ///    add   x0, x0, #:tlsdesc_lo12:var
3857 ///    .tlsdesccall var
3858 ///    blr   x1
3859 ///    (TPIDR_EL0 offset now in x0)
3860 ///
3861 ///  The above sequence must be produced unscheduled, to enable the linker to
3862 ///  optimize/relax this sequence.
3863 ///  Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
3864 ///  above sequence, and expanded really late in the compilation flow, to ensure
3865 ///  the sequence is produced as per above.
3866 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
3867                                                       const SDLoc &DL,
3868                                                       SelectionDAG &DAG) const {
3869   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3870
3871   SDValue Chain = DAG.getEntryNode();
3872   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3873
3874   Chain =
3875       DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
3876   SDValue Glue = Chain.getValue(1);
3877
3878   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
3879 }
3880
3881 SDValue
3882 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
3883                                                 SelectionDAG &DAG) const {
3884   assert(Subtarget->isTargetELF() && "This function expects an ELF target");
3885   assert(Subtarget->useSmallAddressing() &&
3886          "ELF TLS only supported in small memory model");
3887   // Different choices can be made for the maximum size of the TLS area for a
3888   // module. For the small address model, the default TLS size is 16MiB and the
3889   // maximum TLS size is 4GiB.
3890   // FIXME: add -mtls-size command line option and make it control the 16MiB
3891   // vs. 4GiB code sequence generation.
3892   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3893
3894   TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
3895
3896   if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
3897     if (Model == TLSModel::LocalDynamic)
3898       Model = TLSModel::GeneralDynamic;
3899   }
3900
3901   SDValue TPOff;
3902   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3903   SDLoc DL(Op);
3904   const GlobalValue *GV = GA->getGlobal();
3905
3906   SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
3907
3908   if (Model == TLSModel::LocalExec) {
3909     SDValue HiVar = DAG.getTargetGlobalAddress(
3910         GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
3911     SDValue LoVar = DAG.getTargetGlobalAddress(
3912         GV, DL, PtrVT, 0,
3913         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
3914
3915     SDValue TPWithOff_lo =
3916         SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
3917                                    HiVar,
3918                                    DAG.getTargetConstant(0, DL, MVT::i32)),
3919                 0);
3920     SDValue TPWithOff =
3921         SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
3922                                    LoVar,
3923                                    DAG.getTargetConstant(0, DL, MVT::i32)),
3924                 0);
3925     return TPWithOff;
3926   } else if (Model == TLSModel::InitialExec) {
3927     TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
3928     TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
3929   } else if (Model == TLSModel::LocalDynamic) {
3930     // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
3931     // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
3932     // the beginning of the module's TLS region, followed by a DTPREL offset
3933     // calculation.
3934
3935     // These accesses will need deduplicating if there's more than one.
3936     AArch64FunctionInfo *MFI =
3937         DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
3938     MFI->incNumLocalDynamicTLSAccesses();
3939
3940     // The call needs a relocation too for linker relaxation. It doesn't make
3941     // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
3942     // the address.
3943     SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
3944                                                   AArch64II::MO_TLS);
3945
3946     // Now we can calculate the offset from TPIDR_EL0 to this module's
3947     // thread-local area.
3948     TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
3949
3950     // Now use :dtprel_whatever: operations to calculate this variable's offset
3951     // in its thread-storage area.
3952     SDValue HiVar = DAG.getTargetGlobalAddress(
3953         GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
3954     SDValue LoVar = DAG.getTargetGlobalAddress(
3955         GV, DL, MVT::i64, 0,
3956         AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
3957
3958     TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
3959                                        DAG.getTargetConstant(0, DL, MVT::i32)),
3960                     0);
3961     TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
3962                                        DAG.getTargetConstant(0, DL, MVT::i32)),
3963                     0);
3964   } else if (Model == TLSModel::GeneralDynamic) {
3965     // The call needs a relocation too for linker relaxation. It doesn't make
3966     // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
3967     // the address.
3968     SDValue SymAddr =
3969         DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
3970
3971     // Finally we can make a call to calculate the offset from tpidr_el0.
3972     TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
3973   } else
3974     llvm_unreachable("Unsupported ELF TLS access model");
3975
3976   return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
3977 }
3978
3979 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
3980                                                      SelectionDAG &DAG) const {
3981   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3982   if (DAG.getTarget().Options.EmulatedTLS)
3983     return LowerToTLSEmulatedModel(GA, DAG);
3984
3985   if (Subtarget->isTargetDarwin())
3986     return LowerDarwinGlobalTLSAddress(Op, DAG);
3987   if (Subtarget->isTargetELF())
3988     return LowerELFGlobalTLSAddress(Op, DAG);
3989
3990   llvm_unreachable("Unexpected platform trying to use TLS");
3991 }
3992
3993 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
3994   SDValue Chain = Op.getOperand(0);
3995   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
3996   SDValue LHS = Op.getOperand(2);
3997   SDValue RHS = Op.getOperand(3);
3998   SDValue Dest = Op.getOperand(4);
3999   SDLoc dl(Op);
4000
4001   // Handle f128 first, since lowering it will result in comparing the return
4002   // value of a libcall against zero, which is just what the rest of LowerBR_CC
4003   // is expecting to deal with.
4004   if (LHS.getValueType() == MVT::f128) {
4005     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
4006
4007     // If softenSetCCOperands returned a scalar, we need to compare the result
4008     // against zero to select between true and false values.
4009     if (!RHS.getNode()) {
4010       RHS = DAG.getConstant(0, dl, LHS.getValueType());
4011       CC = ISD::SETNE;
4012     }
4013   }
4014
4015   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
4016   // instruction.
4017   if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
4018       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
4019     // Only lower legal XALUO ops.
4020     if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
4021       return SDValue();
4022
4023     // The actual operation with overflow check.
4024     AArch64CC::CondCode OFCC;
4025     SDValue Value, Overflow;
4026     std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
4027
4028     if (CC == ISD::SETNE)
4029       OFCC = getInvertedCondCode(OFCC);
4030     SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
4031
4032     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
4033                        Overflow);
4034   }
4035
4036   if (LHS.getValueType().isInteger()) {
4037     assert((LHS.getValueType() == RHS.getValueType()) &&
4038            (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
4039
4040     // If the RHS of the comparison is zero, we can potentially fold this
4041     // to a specialized branch.
4042     const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
4043     if (RHSC && RHSC->getZExtValue() == 0) {
4044       if (CC == ISD::SETEQ) {
4045         // See if we can use a TBZ to fold in an AND as well.
4046         // TBZ has a smaller branch displacement than CBZ.  If the offset is
4047         // out of bounds, a late MI-layer pass rewrites branches.
4048         // 403.gcc is an example that hits this case.
4049         if (LHS.getOpcode() == ISD::AND &&
4050             isa<ConstantSDNode>(LHS.getOperand(1)) &&
4051             isPowerOf2_64(LHS.getConstantOperandVal(1))) {
4052           SDValue Test = LHS.getOperand(0);
4053           uint64_t Mask = LHS.getConstantOperandVal(1);
4054           return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
4055                              DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
4056                              Dest);
4057         }
4058
4059         return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
4060       } else if (CC == ISD::SETNE) {
4061         // See if we can use a TBZ to fold in an AND as well.
4062         // TBZ has a smaller branch displacement than CBZ.  If the offset is
4063         // out of bounds, a late MI-layer pass rewrites branches.
4064         // 403.gcc is an example that hits this case.
4065         if (LHS.getOpcode() == ISD::AND &&
4066             isa<ConstantSDNode>(LHS.getOperand(1)) &&
4067             isPowerOf2_64(LHS.getConstantOperandVal(1))) {
4068           SDValue Test = LHS.getOperand(0);
4069           uint64_t Mask = LHS.getConstantOperandVal(1);
4070           return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
4071                              DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
4072                              Dest);
4073         }
4074
4075         return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
4076       } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
4077         // Don't combine AND since emitComparison converts the AND to an ANDS
4078         // (a.k.a. TST) and the test in the test bit and branch instruction
4079         // becomes redundant.  This would also increase register pressure.
4080         uint64_t Mask = LHS.getValueSizeInBits() - 1;
4081         return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
4082                            DAG.getConstant(Mask, dl, MVT::i64), Dest);
4083       }
4084     }
4085     if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
4086         LHS.getOpcode() != ISD::AND) {
4087       // Don't combine AND since emitComparison converts the AND to an ANDS
4088       // (a.k.a. TST) and the test in the test bit and branch instruction
4089       // becomes redundant.  This would also increase register pressure.
4090       uint64_t Mask = LHS.getValueSizeInBits() - 1;
4091       return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
4092                          DAG.getConstant(Mask, dl, MVT::i64), Dest);
4093     }
4094
4095     SDValue CCVal;
4096     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
4097     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
4098                        Cmp);
4099   }
4100
4101   assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
4102          LHS.getValueType() == MVT::f64);
4103
4104   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
4105   // clean.  Some of them require two branches to implement.
4106   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
4107   AArch64CC::CondCode CC1, CC2;
4108   changeFPCCToAArch64CC(CC, CC1, CC2);
4109   SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
4110   SDValue BR1 =
4111       DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
4112   if (CC2 != AArch64CC::AL) {
4113     SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
4114     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
4115                        Cmp);
4116   }
4117
4118   return BR1;
4119 }
4120
4121 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
4122                                               SelectionDAG &DAG) const {
4123   EVT VT = Op.getValueType();
4124   SDLoc DL(Op);
4125
4126   SDValue In1 = Op.getOperand(0);
4127   SDValue In2 = Op.getOperand(1);
4128   EVT SrcVT = In2.getValueType();
4129
4130   if (SrcVT.bitsLT(VT))
4131     In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
4132   else if (SrcVT.bitsGT(VT))
4133     In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
4134
4135   EVT VecVT;
4136   uint64_t EltMask;
4137   SDValue VecVal1, VecVal2;
4138
4139   auto setVecVal = [&] (int Idx) {
4140     if (!VT.isVector()) {
4141       VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
4142                                           DAG.getUNDEF(VecVT), In1);
4143       VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT,
4144                                           DAG.getUNDEF(VecVT), In2);
4145     } else {
4146       VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
4147       VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
4148     }
4149   };
4150
4151   if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
4152     VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
4153     EltMask = 0x80000000ULL;
4154     setVecVal(AArch64::ssub);
4155   } else if (VT == MVT::f64 || VT == MVT::v2f64) {
4156     VecVT = MVT::v2i64;
4157
4158     // We want to materialize a mask with the high bit set, but the AdvSIMD
4159     // immediate moves cannot materialize that in a single instruction for
4160     // 64-bit elements. Instead, materialize zero and then negate it.
4161     EltMask = 0;
4162
4163     setVecVal(AArch64::dsub);
4164   } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
4165     VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
4166     EltMask = 0x8000ULL;
4167     setVecVal(AArch64::hsub);
4168   } else {
4169     llvm_unreachable("Invalid type for copysign!");
4170   }
4171
4172   SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);
4173
4174   // If we couldn't materialize the mask above, then the mask vector will be
4175   // the zero vector, and we need to negate it here.
4176   if (VT == MVT::f64 || VT == MVT::v2f64) {
4177     BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
4178     BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
4179     BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
4180   }
4181
4182   SDValue Sel =
4183       DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
4184
4185   if (VT == MVT::f16)
4186     return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel);
4187   if (VT == MVT::f32)
4188     return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
4189   else if (VT == MVT::f64)
4190     return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
4191   else
4192     return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
4193 }
4194
4195 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
4196   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
4197           Attribute::NoImplicitFloat))
4198     return SDValue();
4199
4200   if (!Subtarget->hasNEON())
4201     return SDValue();
4202
4203   // While there is no integer popcount instruction, it can
4204   // be more efficiently lowered to the following sequence that uses
4205   // AdvSIMD registers/instructions as long as the copies to/from
4206   // the AdvSIMD registers are cheap.
4207   //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
4208   //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
4209   //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
4210   //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
4211   SDValue Val = Op.getOperand(0);
4212   SDLoc DL(Op);
4213   EVT VT = Op.getValueType();
4214
4215   if (VT == MVT::i32)
4216     Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
4217   Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
4218
4219   SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
4220   SDValue UaddLV = DAG.getNode(
4221       ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
4222       DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
4223
4224   if (VT == MVT::i64)
4225     UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
4226   return UaddLV;
4227 }
4228
4229 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
4230
4231   if (Op.getValueType().isVector())
4232     return LowerVSETCC(Op, DAG);
4233
4234   SDValue LHS = Op.getOperand(0);
4235   SDValue RHS = Op.getOperand(1);
4236   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
4237   SDLoc dl(Op);
4238
4239   // We chose ZeroOrOneBooleanContents, so use zero and one.
4240   EVT VT = Op.getValueType();
4241   SDValue TVal = DAG.getConstant(1, dl, VT);
4242   SDValue FVal = DAG.getConstant(0, dl, VT);
4243
4244   // Handle f128 first, since one possible outcome is a normal integer
4245   // comparison which gets picked up by the next if statement.
4246   if (LHS.getValueType() == MVT::f128) {
4247     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
4248
4249     // If softenSetCCOperands returned a scalar, use it.
4250     if (!RHS.getNode()) {
4251       assert(LHS.getValueType() == Op.getValueType() &&
4252              "Unexpected setcc expansion!");
4253       return LHS;
4254     }
4255   }
4256
4257   if (LHS.getValueType().isInteger()) {
4258     SDValue CCVal;
4259     SDValue Cmp =
4260         getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);
4261
4262     // Note that we inverted the condition above, so we reverse the order of
4263     // the true and false operands here.  This will allow the setcc to be
4264     // matched to a single CSINC instruction.
4265     return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
4266   }
4267
4268   // Now we know we're dealing with FP values.
4269   assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
4270          LHS.getValueType() == MVT::f64);
4271
4272   // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
4273   // and do the comparison.
4274   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
4275
4276   AArch64CC::CondCode CC1, CC2;
4277   changeFPCCToAArch64CC(CC, CC1, CC2);
4278   if (CC2 == AArch64CC::AL) {
4279     changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
4280     SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
4281
4282     // Note that we inverted the condition above, so we reverse the order of
4283     // the true and false operands here.  This will allow the setcc to be
4284     // matched to a single CSINC instruction.
4285     return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
4286   } else {
4287     // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
4288     // totally clean.  Some of them require two CSELs to implement.  As is in
4289     // this case, we emit the first CSEL and then emit a second using the output
4290     // of the first as the RHS.  We're effectively OR'ing the two CC's together.
4291
4292     // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
4293     SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
4294     SDValue CS1 =
4295         DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
4296
4297     SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
4298     return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
4299   }
4300 }
4301
4302 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
4303                                               SDValue RHS, SDValue TVal,
4304                                               SDValue FVal, const SDLoc &dl,
4305                                               SelectionDAG &DAG) const {
4306   // Handle f128 first, because it will result in a comparison of some RTLIB
4307   // call result against zero.
4308   if (LHS.getValueType() == MVT::f128) {
4309     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
4310
4311     // If softenSetCCOperands returned a scalar, we need to compare the result
4312     // against zero to select between true and false values.
4313     if (!RHS.getNode()) {
4314       RHS = DAG.getConstant(0, dl, LHS.getValueType());
4315       CC = ISD::SETNE;
4316     }
4317   }
4318
4319   // Also handle f16, for which we need to do a f32 comparison.
4320   if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4321     LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
4322     RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
4323   }
4324
4325   // Next, handle integers.
4326   if (LHS.getValueType().isInteger()) {
4327     assert((LHS.getValueType() == RHS.getValueType()) &&
4328            (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
4329
4330     unsigned Opcode = AArch64ISD::CSEL;
4331
4332     // If both the TVal and the FVal are constants, see if we can swap them in
4333     // order to for a CSINV or CSINC out of them.
4334     ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4335     ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4336
4337     if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
4338       std::swap(TVal, FVal);
4339       std::swap(CTVal, CFVal);
4340       CC = ISD::getSetCCInverse(CC, true);
4341     } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
4342       std::swap(TVal, FVal);
4343       std::swap(CTVal, CFVal);
4344       CC = ISD::getSetCCInverse(CC, true);
4345     } else if (TVal.getOpcode() == ISD::XOR) {
4346       // If TVal is a NOT we want to swap TVal and FVal so that we can match
4347       // with a CSINV rather than a CSEL.
4348       if (isAllOnesConstant(TVal.getOperand(1))) {
4349         std::swap(TVal, FVal);
4350         std::swap(CTVal, CFVal);
4351         CC = ISD::getSetCCInverse(CC, true);
4352       }
4353     } else if (TVal.getOpcode() == ISD::SUB) {
4354       // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
4355       // that we can match with a CSNEG rather than a CSEL.
4356       if (isNullConstant(TVal.getOperand(0))) {
4357         std::swap(TVal, FVal);
4358         std::swap(CTVal, CFVal);
4359         CC = ISD::getSetCCInverse(CC, true);
4360       }
4361     } else if (CTVal && CFVal) {
4362       const int64_t TrueVal = CTVal->getSExtValue();
4363       const int64_t FalseVal = CFVal->getSExtValue();
4364       bool Swap = false;
4365
4366       // If both TVal and FVal are constants, see if FVal is the
4367       // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
4368       // instead of a CSEL in that case.
4369       if (TrueVal == ~FalseVal) {
4370         Opcode = AArch64ISD::CSINV;
4371       } else if (TrueVal == -FalseVal) {
4372         Opcode = AArch64ISD::CSNEG;
4373       } else if (TVal.getValueType() == MVT::i32) {
4374         // If our operands are only 32-bit wide, make sure we use 32-bit
4375         // arithmetic for the check whether we can use CSINC. This ensures that
4376         // the addition in the check will wrap around properly in case there is
4377         // an overflow (which would not be the case if we do the check with
4378         // 64-bit arithmetic).
4379         const uint32_t TrueVal32 = CTVal->getZExtValue();
4380         const uint32_t FalseVal32 = CFVal->getZExtValue();
4381
4382         if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
4383           Opcode = AArch64ISD::CSINC;
4384
4385           if (TrueVal32 > FalseVal32) {
4386             Swap = true;
4387           }
4388         }
4389         // 64-bit check whether we can use CSINC.
4390       } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
4391         Opcode = AArch64ISD::CSINC;
4392
4393         if (TrueVal > FalseVal) {
4394           Swap = true;
4395         }
4396       }
4397
4398       // Swap TVal and FVal if necessary.
4399       if (Swap) {
4400         std::swap(TVal, FVal);
4401         std::swap(CTVal, CFVal);
4402         CC = ISD::getSetCCInverse(CC, true);
4403       }
4404
4405       if (Opcode != AArch64ISD::CSEL) {
4406         // Drop FVal since we can get its value by simply inverting/negating
4407         // TVal.
4408         FVal = TVal;
4409       }
4410     }
4411
4412     // Avoid materializing a constant when possible by reusing a known value in
4413     // a register.  However, don't perform this optimization if the known value
4414     // is one, zero or negative one in the case of a CSEL.  We can always
4415     // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
4416     // FVal, respectively.
4417     ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
4418     if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
4419         !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
4420       AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
4421       // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
4422       // "a != C ? x : a" to avoid materializing C.
4423       if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
4424         TVal = LHS;
4425       else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
4426         FVal = LHS;
4427     } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
4428       assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
4429       // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
4430       // avoid materializing C.
4431       AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
4432       if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
4433         Opcode = AArch64ISD::CSINV;
4434         TVal = LHS;
4435         FVal = DAG.getConstant(0, dl, FVal.getValueType());
4436       }
4437     }
4438
4439     SDValue CCVal;
4440     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
4441     EVT VT = TVal.getValueType();
4442     return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
4443   }
4444
4445   // Now we know we're dealing with FP values.
4446   assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
4447          LHS.getValueType() == MVT::f64);
4448   assert(LHS.getValueType() == RHS.getValueType());
4449   EVT VT = TVal.getValueType();
4450   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
4451
4452   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
4453   // clean.  Some of them require two CSELs to implement.
4454   AArch64CC::CondCode CC1, CC2;
4455   changeFPCCToAArch64CC(CC, CC1, CC2);
4456
4457   if (DAG.getTarget().Options.UnsafeFPMath) {
4458     // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
4459     // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
4460     ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
4461     if (RHSVal && RHSVal->isZero()) {
4462       ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
4463       ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
4464
4465       if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
4466           CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
4467         TVal = LHS;
4468       else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
4469                CFVal && CFVal->isZero() &&
4470                FVal.getValueType() == LHS.getValueType())
4471         FVal = LHS;
4472     }
4473   }
4474
4475   // Emit first, and possibly only, CSEL.
4476   SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
4477   SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
4478
4479   // If we need a second CSEL, emit it, using the output of the first as the
4480   // RHS.  We're effectively OR'ing the two CC's together.
4481   if (CC2 != AArch64CC::AL) {
4482     SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
4483     return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
4484   }
4485
4486   // Otherwise, return the output of the first CSEL.
4487   return CS1;
4488 }
4489
4490 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
4491                                               SelectionDAG &DAG) const {
4492   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
4493   SDValue LHS = Op.getOperand(0);
4494   SDValue RHS = Op.getOperand(1);
4495   SDValue TVal = Op.getOperand(2);
4496   SDValue FVal = Op.getOperand(3);
4497   SDLoc DL(Op);
4498   return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
4499 }
4500
4501 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
4502                                            SelectionDAG &DAG) const {
4503   SDValue CCVal = Op->getOperand(0);
4504   SDValue TVal = Op->getOperand(1);
4505   SDValue FVal = Op->getOperand(2);
4506   SDLoc DL(Op);
4507
4508   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
4509   // instruction.
4510   if (isOverflowIntrOpRes(CCVal)) {
4511     // Only lower legal XALUO ops.
4512     if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
4513       return SDValue();
4514
4515     AArch64CC::CondCode OFCC;
4516     SDValue Value, Overflow;
4517     std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
4518     SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
4519
4520     return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
4521                        CCVal, Overflow);
4522   }
4523
4524   // Lower it the same way as we would lower a SELECT_CC node.
4525   ISD::CondCode CC;
4526   SDValue LHS, RHS;
4527   if (CCVal.getOpcode() == ISD::SETCC) {
4528     LHS = CCVal.getOperand(0);
4529     RHS = CCVal.getOperand(1);
4530     CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
4531   } else {
4532     LHS = CCVal;
4533     RHS = DAG.getConstant(0, DL, CCVal.getValueType());
4534     CC = ISD::SETNE;
4535   }
4536   return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
4537 }
4538
4539 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
4540                                               SelectionDAG &DAG) const {
4541   // Jump table entries as PC relative offsets. No additional tweaking
4542   // is necessary here. Just get the address of the jump table.
4543   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
4544
4545   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
4546       !Subtarget->isTargetMachO()) {
4547     return getAddrLarge(JT, DAG);
4548   }
4549   return getAddr(JT, DAG);
4550 }
4551
4552 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
4553                                                  SelectionDAG &DAG) const {
4554   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
4555
4556   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
4557     // Use the GOT for the large code model on iOS.
4558     if (Subtarget->isTargetMachO()) {
4559       return getGOT(CP, DAG);
4560     }
4561     return getAddrLarge(CP, DAG);
4562   } else {
4563     return getAddr(CP, DAG);
4564   }
4565 }
4566
4567 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
4568                                                SelectionDAG &DAG) const {
4569   BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
4570   if (getTargetMachine().getCodeModel() == CodeModel::Large &&
4571       !Subtarget->isTargetMachO()) {
4572     return getAddrLarge(BA, DAG);
4573   } else {
4574     return getAddr(BA, DAG);
4575   }
4576 }
4577
4578 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
4579                                                  SelectionDAG &DAG) const {
4580   AArch64FunctionInfo *FuncInfo =
4581       DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
4582
4583   SDLoc DL(Op);
4584   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
4585                                  getPointerTy(DAG.getDataLayout()));
4586   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4587   return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
4588                       MachinePointerInfo(SV));
4589 }
4590
4591 SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
4592                                                   SelectionDAG &DAG) const {
4593   AArch64FunctionInfo *FuncInfo =
4594       DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
4595
4596   SDLoc DL(Op);
4597   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
4598                                      ? FuncInfo->getVarArgsGPRIndex()
4599                                      : FuncInfo->getVarArgsStackIndex(),
4600                                  getPointerTy(DAG.getDataLayout()));
4601   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4602   return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
4603                       MachinePointerInfo(SV));
4604 }
4605
4606 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
4607                                                 SelectionDAG &DAG) const {
4608   // The layout of the va_list struct is specified in the AArch64 Procedure Call
4609   // Standard, section B.3.
4610   MachineFunction &MF = DAG.getMachineFunction();
4611   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
4612   auto PtrVT = getPointerTy(DAG.getDataLayout());
4613   SDLoc DL(Op);
4614
4615   SDValue Chain = Op.getOperand(0);
4616   SDValue VAList = Op.getOperand(1);
4617   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4618   SmallVector<SDValue, 4> MemOps;
4619
4620   // void *__stack at offset 0
4621   SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
4622   MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
4623                                 MachinePointerInfo(SV), /* Alignment = */ 8));
4624
4625   // void *__gr_top at offset 8
4626   int GPRSize = FuncInfo->getVarArgsGPRSize();
4627   if (GPRSize > 0) {
4628     SDValue GRTop, GRTopAddr;
4629
4630     GRTopAddr =
4631         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT));
4632
4633     GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
4634     GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
4635                         DAG.getConstant(GPRSize, DL, PtrVT));
4636
4637     MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
4638                                   MachinePointerInfo(SV, 8),
4639                                   /* Alignment = */ 8));
4640   }
4641
4642   // void *__vr_top at offset 16
4643   int FPRSize = FuncInfo->getVarArgsFPRSize();
4644   if (FPRSize > 0) {
4645     SDValue VRTop, VRTopAddr;
4646     VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
4647                             DAG.getConstant(16, DL, PtrVT));
4648
4649     VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
4650     VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
4651                         DAG.getConstant(FPRSize, DL, PtrVT));
4652
4653     MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
4654                                   MachinePointerInfo(SV, 16),
4655                                   /* Alignment = */ 8));
4656   }
4657
4658   // int __gr_offs at offset 24
4659   SDValue GROffsAddr =
4660       DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
4661   MemOps.push_back(DAG.getStore(
4662       Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
4663       MachinePointerInfo(SV, 24), /* Alignment = */ 4));
4664
4665   // int __vr_offs at offset 28
4666   SDValue VROffsAddr =
4667       DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
4668   MemOps.push_back(DAG.getStore(
4669       Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
4670       MachinePointerInfo(SV, 28), /* Alignment = */ 4));
4671
4672   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
4673 }
4674
4675 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
4676                                             SelectionDAG &DAG) const {
4677   MachineFunction &MF = DAG.getMachineFunction();
4678
4679   if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
4680     return LowerWin64_VASTART(Op, DAG);
4681   else if (Subtarget->isTargetDarwin())
4682     return LowerDarwin_VASTART(Op, DAG);
4683   else
4684     return LowerAAPCS_VASTART(Op, DAG);
4685 }
4686
4687 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
4688                                            SelectionDAG &DAG) const {
4689   // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
4690   // pointer.
4691   SDLoc DL(Op);
4692   unsigned VaListSize =
4693       Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32;
4694   const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
4695   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
4696
4697   return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1),
4698                        Op.getOperand(2),
4699                        DAG.getConstant(VaListSize, DL, MVT::i32),
4700                        8, false, false, false, MachinePointerInfo(DestSV),
4701                        MachinePointerInfo(SrcSV));
4702 }
4703
4704 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
4705   assert(Subtarget->isTargetDarwin() &&
4706          "automatic va_arg instruction only works on Darwin");
4707
4708   const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4709   EVT VT = Op.getValueType();
4710   SDLoc DL(Op);
4711   SDValue Chain = Op.getOperand(0);
4712   SDValue Addr = Op.getOperand(1);
4713   unsigned Align = Op.getConstantOperandVal(3);
4714   auto PtrVT = getPointerTy(DAG.getDataLayout());
4715
4716   SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V));
4717   Chain = VAList.getValue(1);
4718
4719   if (Align > 8) {
4720     assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
4721     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
4722                          DAG.getConstant(Align - 1, DL, PtrVT));
4723     VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
4724                          DAG.getConstant(-(int64_t)Align, DL, PtrVT));
4725   }
4726
4727   Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
4728   uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
4729
4730   // Scalar integer and FP values smaller than 64 bits are implicitly extended
4731   // up to 64 bits.  At the very least, we have to increase the striding of the
4732   // vaargs list to match this, and for FP values we need to introduce
4733   // FP_ROUND nodes as well.
4734   if (VT.isInteger() && !VT.isVector())
4735     ArgSize = 8;
4736   bool NeedFPTrunc = false;
4737   if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
4738     ArgSize = 8;
4739     NeedFPTrunc = true;
4740   }
4741
4742   // Increment the pointer, VAList, to the next vaarg
4743   SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
4744                                DAG.getConstant(ArgSize, DL, PtrVT));
4745   // Store the incremented VAList to the legalized pointer
4746   SDValue APStore =
4747       DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
4748
4749   // Load the actual argument out of the pointer VAList
4750   if (NeedFPTrunc) {
4751     // Load the value as an f64.
4752     SDValue WideFP =
4753         DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
4754     // Round the value down to an f32.
4755     SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
4756                                    DAG.getIntPtrConstant(1, DL));
4757     SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
4758     // Merge the rounded value with the chain output of the load.
4759     return DAG.getMergeValues(Ops, DL);
4760   }
4761
4762   return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
4763 }
4764
4765 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
4766                                               SelectionDAG &DAG) const {
4767   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
4768   MFI.setFrameAddressIsTaken(true);
4769
4770   EVT VT = Op.getValueType();
4771   SDLoc DL(Op);
4772   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4773   SDValue FrameAddr =
4774       DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
4775   while (Depth--)
4776     FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
4777                             MachinePointerInfo());
4778   return FrameAddr;
4779 }
4780
4781 // FIXME? Maybe this could be a TableGen attribute on some registers and
4782 // this table could be generated automatically from RegInfo.
4783 unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
4784                                                   SelectionDAG &DAG) const {
4785   unsigned Reg = StringSwitch<unsigned>(RegName)
4786                        .Case("sp", AArch64::SP)
4787                        .Case("x18", AArch64::X18)
4788                        .Case("w18", AArch64::W18)
4789                        .Default(0);
4790   if ((Reg == AArch64::X18 || Reg == AArch64::W18) &&
4791       !Subtarget->isX18Reserved())
4792     Reg = 0;
4793   if (Reg)
4794     return Reg;
4795   report_fatal_error(Twine("Invalid register name \""
4796                               + StringRef(RegName)  + "\"."));
4797 }
4798
4799 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
4800                                                SelectionDAG &DAG) const {
4801   MachineFunction &MF = DAG.getMachineFunction();
4802   MachineFrameInfo &MFI = MF.getFrameInfo();
4803   MFI.setReturnAddressIsTaken(true);
4804
4805   EVT VT = Op.getValueType();
4806   SDLoc DL(Op);
4807   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4808   if (Depth) {
4809     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
4810     SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
4811     return DAG.getLoad(VT, DL, DAG.getEntryNode(),
4812                        DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
4813                        MachinePointerInfo());
4814   }
4815
4816   // Return LR, which contains the return address. Mark it an implicit live-in.
4817   unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
4818   return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
4819 }
4820
4821 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
4822 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
4823 SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
4824                                                     SelectionDAG &DAG) const {
4825   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
4826   EVT VT = Op.getValueType();
4827   unsigned VTBits = VT.getSizeInBits();
4828   SDLoc dl(Op);
4829   SDValue ShOpLo = Op.getOperand(0);
4830   SDValue ShOpHi = Op.getOperand(1);
4831   SDValue ShAmt = Op.getOperand(2);
4832   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
4833
4834   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
4835
4836   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
4837                                  DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
4838   SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
4839
4840   // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
4841   // is "undef". We wanted 0, so CSEL it directly.
4842   SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
4843                                ISD::SETEQ, dl, DAG);
4844   SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
4845   HiBitsForLo =
4846       DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
4847                   HiBitsForLo, CCVal, Cmp);
4848
4849   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
4850                                    DAG.getConstant(VTBits, dl, MVT::i64));
4851
4852   SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
4853   SDValue LoForNormalShift =
4854       DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
4855
4856   Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
4857                        dl, DAG);
4858   CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
4859   SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
4860   SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
4861                            LoForNormalShift, CCVal, Cmp);
4862
4863   // AArch64 shifts larger than the register width are wrapped rather than
4864   // clamped, so we can't just emit "hi >> x".
4865   SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
4866   SDValue HiForBigShift =
4867       Opc == ISD::SRA
4868           ? DAG.getNode(Opc, dl, VT, ShOpHi,
4869                         DAG.getConstant(VTBits - 1, dl, MVT::i64))
4870           : DAG.getConstant(0, dl, VT);
4871   SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
4872                            HiForNormalShift, CCVal, Cmp);
4873
4874   SDValue Ops[2] = { Lo, Hi };
4875   return DAG.getMergeValues(Ops, dl);
4876 }
4877
4878 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
4879 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
4880 SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
4881                                                    SelectionDAG &DAG) const {
4882   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
4883   EVT VT = Op.getValueType();
4884   unsigned VTBits = VT.getSizeInBits();
4885   SDLoc dl(Op);
4886   SDValue ShOpLo = Op.getOperand(0);
4887   SDValue ShOpHi = Op.getOperand(1);
4888   SDValue ShAmt = Op.getOperand(2);
4889
4890   assert(Op.getOpcode() == ISD::SHL_PARTS);
4891   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
4892                                  DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
4893   SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
4894
4895   // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
4896   // is "undef". We wanted 0, so CSEL it directly.
4897   SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
4898                                ISD::SETEQ, dl, DAG);
4899   SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
4900   LoBitsForHi =
4901       DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
4902                   LoBitsForHi, CCVal, Cmp);
4903
4904   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
4905                                    DAG.getConstant(VTBits, dl, MVT::i64));
4906   SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
4907   SDValue HiForNormalShift =
4908       DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
4909
4910   SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
4911
4912   Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
4913                        dl, DAG);
4914   CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
4915   SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
4916                            HiForNormalShift, CCVal, Cmp);
4917
4918   // AArch64 shifts of larger than register sizes are wrapped rather than
4919   // clamped, so we can't just emit "lo << a" if a is too big.
4920   SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
4921   SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
4922   SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
4923                            LoForNormalShift, CCVal, Cmp);
4924
4925   SDValue Ops[2] = { Lo, Hi };
4926   return DAG.getMergeValues(Ops, dl);
4927 }
4928
4929 bool AArch64TargetLowering::isOffsetFoldingLegal(
4930     const GlobalAddressSDNode *GA) const {
4931   DEBUG(dbgs() << "Skipping offset folding global address: ");
4932   DEBUG(GA->dump());
4933   DEBUG(dbgs() << "AArch64 doesn't support folding offsets into global "
4934         "addresses\n");
4935   return false;
4936 }
4937
4938 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4939   // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
4940   // FIXME: We should be able to handle f128 as well with a clever lowering.
4941   if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32 ||
4942                           (VT == MVT::f16 && Subtarget->hasFullFP16()))) {
4943     DEBUG(dbgs() << "Legal fp imm: materialize 0 using the zero register\n");
4944     return true;
4945   }
4946
4947   StringRef FPType;
4948   bool IsLegal = false;
4949   SmallString<128> ImmStrVal;
4950   Imm.toString(ImmStrVal);
4951
4952   if (VT == MVT::f64) {
4953     FPType = "f64";
4954     IsLegal = AArch64_AM::getFP64Imm(Imm) != -1;
4955   } else if (VT == MVT::f32) {
4956     FPType = "f32";
4957     IsLegal = AArch64_AM::getFP32Imm(Imm) != -1;
4958   } else if (VT == MVT::f16 && Subtarget->hasFullFP16()) {
4959     FPType = "f16";
4960     IsLegal = AArch64_AM::getFP16Imm(Imm) != -1;
4961   }
4962
4963   if (IsLegal) {
4964     DEBUG(dbgs() << "Legal " << FPType << " imm value: " << ImmStrVal << "\n");
4965     return true;
4966   }
4967
4968   if (!FPType.empty())
4969     DEBUG(dbgs() << "Illegal " << FPType << " imm value: " << ImmStrVal << "\n");
4970   else
4971     DEBUG(dbgs() << "Illegal fp imm " << ImmStrVal << ": unsupported fp type\n");
4972
4973   return false;
4974 }
4975
4976 //===----------------------------------------------------------------------===//
4977 //                          AArch64 Optimization Hooks
4978 //===----------------------------------------------------------------------===//
4979
4980 static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
4981                            SDValue Operand, SelectionDAG &DAG,
4982                            int &ExtraSteps) {
4983   EVT VT = Operand.getValueType();
4984   if (ST->hasNEON() &&
4985       (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
4986        VT == MVT::f32 || VT == MVT::v1f32 ||
4987        VT == MVT::v2f32 || VT == MVT::v4f32)) {
4988     if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
4989       // For the reciprocal estimates, convergence is quadratic, so the number
4990       // of digits is doubled after each iteration.  In ARMv8, the accuracy of
4991       // the initial estimate is 2^-8.  Thus the number of extra steps to refine
4992       // the result for float (23 mantissa bits) is 2 and for double (52
4993       // mantissa bits) is 3.
4994       ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
4995
4996     return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
4997   }
4998
4999   return SDValue();
5000 }
5001
5002 SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
5003                                                SelectionDAG &DAG, int Enabled,
5004                                                int &ExtraSteps,
5005                                                bool &UseOneConst,
5006                                                bool Reciprocal) const {
5007   if (Enabled == ReciprocalEstimate::Enabled ||
5008       (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
5009     if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
5010                                        DAG, ExtraSteps)) {
5011       SDLoc DL(Operand);
5012       EVT VT = Operand.getValueType();
5013
5014       SDNodeFlags Flags;
5015       Flags.setUnsafeAlgebra(true);
5016
5017       // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
5018       // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
5019       for (int i = ExtraSteps; i > 0; --i) {
5020         SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
5021                                    Flags);
5022         Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
5023         Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
5024       }
5025       if (!Reciprocal) {
5026         EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
5027                                       VT);
5028         SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
5029         SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);
5030
5031         Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
5032         // Correct the result if the operand is 0.0.
5033         Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
5034                                VT, Eq, Operand, Estimate);
5035       }
5036
5037       ExtraSteps = 0;
5038       return Estimate;
5039     }
5040
5041   return SDValue();
5042 }
5043
5044 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
5045                                                 SelectionDAG &DAG, int Enabled,
5046                                                 int &ExtraSteps) const {
5047   if (Enabled == ReciprocalEstimate::Enabled)
5048     if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
5049                                        DAG, ExtraSteps)) {
5050       SDLoc DL(Operand);
5051       EVT VT = Operand.getValueType();
5052
5053       SDNodeFlags Flags;
5054       Flags.setUnsafeAlgebra(true);
5055
5056       // Newton reciprocal iteration: E * (2 - X * E)
5057       // AArch64 reciprocal iteration instruction: (2 - M * N)
5058       for (int i = ExtraSteps; i > 0; --i) {
5059         SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
5060                                    Estimate, Flags);
5061         Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
5062       }
5063
5064       ExtraSteps = 0;
5065       return Estimate;
5066     }
5067
5068   return SDValue();
5069 }
5070
5071 //===----------------------------------------------------------------------===//
5072 //                          AArch64 Inline Assembly Support
5073 //===----------------------------------------------------------------------===//
5074
5075 // Table of Constraints
5076 // TODO: This is the current set of constraints supported by ARM for the
5077 // compiler, not all of them may make sense, e.g. S may be difficult to support.
5078 //
5079 // r - A general register
5080 // w - An FP/SIMD register of some size in the range v0-v31
5081 // x - An FP/SIMD register of some size in the range v0-v15
5082 // I - Constant that can be used with an ADD instruction
5083 // J - Constant that can be used with a SUB instruction
5084 // K - Constant that can be used with a 32-bit logical instruction
5085 // L - Constant that can be used with a 64-bit logical instruction
5086 // M - Constant that can be used as a 32-bit MOV immediate
5087 // N - Constant that can be used as a 64-bit MOV immediate
5088 // Q - A memory reference with base register and no offset
5089 // S - A symbolic address
5090 // Y - Floating point constant zero
5091 // Z - Integer constant zero
5092 //
5093 //   Note that general register operands will be output using their 64-bit x
5094 // register name, whatever the size of the variable, unless the asm operand
5095 // is prefixed by the %w modifier. Floating-point and SIMD register operands
5096 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
5097 // %q modifier.
5098 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
5099   // At this point, we have to lower this constraint to something else, so we
5100   // lower it to an "r" or "w". However, by doing this we will force the result
5101   // to be in register, while the X constraint is much more permissive.
5102   //
5103   // Although we are correct (we are free to emit anything, without
5104   // constraints), we might break use cases that would expect us to be more
5105   // efficient and emit something else.
5106   if (!Subtarget->hasFPARMv8())
5107     return "r";
5108
5109   if (ConstraintVT.isFloatingPoint())
5110     return "w";
5111
5112   if (ConstraintVT.isVector() &&
5113      (ConstraintVT.getSizeInBits() == 64 ||
5114       ConstraintVT.getSizeInBits() == 128))
5115     return "w";
5116
5117   return "r";
5118 }
5119
5120 /// getConstraintType - Given a constraint letter, return the type of
5121 /// constraint it is for this target.
5122 AArch64TargetLowering::ConstraintType
5123 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
5124   if (Constraint.size() == 1) {
5125     switch (Constraint[0]) {
5126     default:
5127       break;
5128     case 'z':
5129       return C_Other;
5130     case 'x':
5131     case 'w':
5132       return C_RegisterClass;
5133     // An address with a single base register. Due to the way we
5134     // currently handle addresses it is the same as 'r'.
5135     case 'Q':
5136       return C_Memory;
5137     }
5138   }
5139   return TargetLowering::getConstraintType(Constraint);
5140 }
5141
5142 /// Examine constraint type and operand type and determine a weight value.
5143 /// This object must already have been set up with the operand type
5144 /// and the current alternative constraint selected.
5145 TargetLowering::ConstraintWeight
5146 AArch64TargetLowering::getSingleConstraintMatchWeight(
5147     AsmOperandInfo &info, const char *constraint) const {
5148   ConstraintWeight weight = CW_Invalid;
5149   Value *CallOperandVal = info.CallOperandVal;
5150   // If we don't have a value, we can't do a match,
5151   // but allow it at the lowest weight.
5152   if (!CallOperandVal)
5153     return CW_Default;
5154   Type *type = CallOperandVal->getType();
5155   // Look at the constraint type.
5156   switch (*constraint) {
5157   default:
5158     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
5159     break;
5160   case 'x':
5161   case 'w':
5162     if (type->isFloatingPointTy() || type->isVectorTy())
5163       weight = CW_Register;
5164     break;
5165   case 'z':
5166     weight = CW_Constant;
5167     break;
5168   }
5169   return weight;
5170 }
5171
5172 std::pair<unsigned, const TargetRegisterClass *>
5173 AArch64TargetLowering::getRegForInlineAsmConstraint(
5174     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
5175   if (Constraint.size() == 1) {
5176     switch (Constraint[0]) {
5177     case 'r':
5178       if (VT.getSizeInBits() == 64)
5179         return std::make_pair(0U, &AArch64::GPR64commonRegClass);
5180       return std::make_pair(0U, &AArch64::GPR32commonRegClass);
5181     case 'w':
5182       if (VT.getSizeInBits() == 16)
5183         return std::make_pair(0U, &AArch64::FPR16RegClass);
5184       if (VT.getSizeInBits() == 32)
5185         return std::make_pair(0U, &AArch64::FPR32RegClass);
5186       if (VT.getSizeInBits() == 64)
5187         return std::make_pair(0U, &AArch64::FPR64RegClass);
5188       if (VT.getSizeInBits() == 128)
5189         return std::make_pair(0U, &AArch64::FPR128RegClass);
5190       break;
5191     // The instructions that this constraint is designed for can
5192     // only take 128-bit registers so just use that regclass.
5193     case 'x':
5194       if (VT.getSizeInBits() == 128)
5195         return std::make_pair(0U, &AArch64::FPR128_loRegClass);
5196       break;
5197     }
5198   }
5199   if (StringRef("{cc}").equals_lower(Constraint))
5200     return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
5201
5202   // Use the default implementation in TargetLowering to convert the register
5203   // constraint into a member of a register class.
5204   std::pair<unsigned, const TargetRegisterClass *> Res;
5205   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5206
5207   // Not found as a standard register?
5208   if (!Res.second) {
5209     unsigned Size = Constraint.size();
5210     if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
5211         tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
5212       int RegNo;
5213       bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
5214       if (!Failed && RegNo >= 0 && RegNo <= 31) {
5215         // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
5216         // By default we'll emit v0-v31 for this unless there's a modifier where
5217         // we'll emit the correct register as well.
5218         if (VT != MVT::Other && VT.getSizeInBits() == 64) {
5219           Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
5220           Res.second = &AArch64::FPR64RegClass;
5221         } else {
5222           Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
5223           Res.second = &AArch64::FPR128RegClass;
5224         }
5225       }
5226     }
5227   }
5228
5229   return Res;
5230 }
5231
5232 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
5233 /// vector.  If it is invalid, don't add anything to Ops.
5234 void AArch64TargetLowering::LowerAsmOperandForConstraint(
5235     SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
5236     SelectionDAG &DAG) const {
5237   SDValue Result;
5238
5239   // Currently only support length 1 constraints.
5240   if (Constraint.length() != 1)
5241     return;
5242
5243   char ConstraintLetter = Constraint[0];
5244   switch (ConstraintLetter) {
5245   default:
5246     break;
5247
5248   // This set of constraints deal with valid constants for various instructions.
5249   // Validate and return a target constant for them if we can.
5250   case 'z': {
5251     // 'z' maps to xzr or wzr so it needs an input of 0.
5252     if (!isNullConstant(Op))
5253       return;
5254
5255     if (Op.getValueType() == MVT::i64)
5256       Result = DAG.getRegister(AArch64::XZR, MVT::i64);
5257     else
5258       Result = DAG.getRegister(AArch64::WZR, MVT::i32);
5259     break;
5260   }
5261
5262   case 'I':
5263   case 'J':
5264   case 'K':
5265   case 'L':
5266   case 'M':
5267   case 'N':
5268     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
5269     if (!C)
5270       return;
5271
5272     // Grab the value and do some validation.
5273     uint64_t CVal = C->getZExtValue();
5274     switch (ConstraintLetter) {
5275     // The I constraint applies only to simple ADD or SUB immediate operands:
5276     // i.e. 0 to 4095 with optional shift by 12
5277     // The J constraint applies only to ADD or SUB immediates that would be
5278     // valid when negated, i.e. if [an add pattern] were to be output as a SUB
5279     // instruction [or vice versa], in other words -1 to -4095 with optional
5280     // left shift by 12.
5281     case 'I':
5282       if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
5283         break;
5284       return;
5285     case 'J': {
5286       uint64_t NVal = -C->getSExtValue();
5287       if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
5288         CVal = C->getSExtValue();
5289         break;
5290       }
5291       return;
5292     }
5293     // The K and L constraints apply *only* to logical immediates, including
5294     // what used to be the MOVI alias for ORR (though the MOVI alias has now
5295     // been removed and MOV should be used). So these constraints have to
5296     // distinguish between bit patterns that are valid 32-bit or 64-bit
5297     // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
5298     // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
5299     // versa.
5300     case 'K':
5301       if (AArch64_AM::isLogicalImmediate(CVal, 32))
5302         break;
5303       return;
5304     case 'L':
5305       if (AArch64_AM::isLogicalImmediate(CVal, 64))
5306         break;
5307       return;
5308     // The M and N constraints are a superset of K and L respectively, for use
5309     // with the MOV (immediate) alias. As well as the logical immediates they
5310     // also match 32 or 64-bit immediates that can be loaded either using a
5311     // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
5312     // (M) or 64-bit 0x1234000000000000 (N) etc.
5313     // As a note some of this code is liberally stolen from the asm parser.
5314     case 'M': {
5315       if (!isUInt<32>(CVal))
5316         return;
5317       if (AArch64_AM::isLogicalImmediate(CVal, 32))
5318         break;
5319       if ((CVal & 0xFFFF) == CVal)
5320         break;
5321       if ((CVal & 0xFFFF0000ULL) == CVal)
5322         break;
5323       uint64_t NCVal = ~(uint32_t)CVal;
5324       if ((NCVal & 0xFFFFULL) == NCVal)
5325         break;
5326       if ((NCVal & 0xFFFF0000ULL) == NCVal)
5327         break;
5328       return;
5329     }
5330     case 'N': {
5331       if (AArch64_AM::isLogicalImmediate(CVal, 64))
5332         break;
5333       if ((CVal & 0xFFFFULL) == CVal)
5334         break;
5335       if ((CVal & 0xFFFF0000ULL) == CVal)
5336         break;
5337       if ((CVal & 0xFFFF00000000ULL) == CVal)
5338         break;
5339       if ((CVal & 0xFFFF000000000000ULL) == CVal)
5340         break;
5341       uint64_t NCVal = ~CVal;
5342       if ((NCVal & 0xFFFFULL) == NCVal)
5343         break;
5344       if ((NCVal & 0xFFFF0000ULL) == NCVal)
5345         break;
5346       if ((NCVal & 0xFFFF00000000ULL) == NCVal)
5347         break;
5348       if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
5349         break;
5350       return;
5351     }
5352     default:
5353       return;
5354     }
5355
5356     // All assembler immediates are 64-bit integers.
5357     Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
5358     break;
5359   }
5360
5361   if (Result.getNode()) {
5362     Ops.push_back(Result);
5363     return;
5364   }
5365
5366   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
5367 }
5368
5369 //===----------------------------------------------------------------------===//
5370 //                     AArch64 Advanced SIMD Support
5371 //===----------------------------------------------------------------------===//
5372
5373 /// WidenVector - Given a value in the V64 register class, produce the
5374 /// equivalent value in the V128 register class.
5375 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
5376   EVT VT = V64Reg.getValueType();
5377   unsigned NarrowSize = VT.getVectorNumElements();
5378   MVT EltTy = VT.getVectorElementType().getSimpleVT();
5379   MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
5380   SDLoc DL(V64Reg);
5381
5382   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
5383                      V64Reg, DAG.getConstant(0, DL, MVT::i32));
5384 }
5385
5386 /// getExtFactor - Determine the adjustment factor for the position when
5387 /// generating an "extract from vector registers" instruction.
5388 static unsigned getExtFactor(SDValue &V) {
5389   EVT EltType = V.getValueType().getVectorElementType();
5390   return EltType.getSizeInBits() / 8;
5391 }
5392
5393 /// NarrowVector - Given a value in the V128 register class, produce the
5394 /// equivalent value in the V64 register class.
5395 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
5396   EVT VT = V128Reg.getValueType();
5397   unsigned WideSize = VT.getVectorNumElements();
5398   MVT EltTy = VT.getVectorElementType().getSimpleVT();
5399   MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
5400   SDLoc DL(V128Reg);
5401
5402   return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
5403 }
5404
5405 // Gather data to see if the operation can be modelled as a
5406 // shuffle in combination with VEXTs.
5407 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
5408                                                   SelectionDAG &DAG) const {
5409   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
5410   DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
5411   SDLoc dl(Op);
5412   EVT VT = Op.getValueType();
5413   unsigned NumElts = VT.getVectorNumElements();
5414
5415   struct ShuffleSourceInfo {
5416     SDValue Vec;
5417     unsigned MinElt;
5418     unsigned MaxElt;
5419
5420     // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
5421     // be compatible with the shuffle we intend to construct. As a result
5422     // ShuffleVec will be some sliding window into the original Vec.
5423     SDValue ShuffleVec;
5424
5425     // Code should guarantee that element i in Vec starts at element "WindowBase
5426     // + i * WindowScale in ShuffleVec".
5427     int WindowBase;
5428     int WindowScale;
5429
5430     ShuffleSourceInfo(SDValue Vec)
5431       : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
5432           ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
5433
5434     bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
5435   };
5436
5437   // First gather all vectors used as an immediate source for this BUILD_VECTOR
5438   // node.
5439   SmallVector<ShuffleSourceInfo, 2> Sources;
5440   for (unsigned i = 0; i < NumElts; ++i) {
5441     SDValue V = Op.getOperand(i);
5442     if (V.isUndef())
5443       continue;
5444     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5445              !isa<ConstantSDNode>(V.getOperand(1))) {
5446       DEBUG(dbgs() << "Reshuffle failed: "
5447                       "a shuffle can only come from building a vector from "
5448                       "various elements of other vectors, provided their "
5449                       "indices are constant\n");
5450       return SDValue();
5451     }
5452
5453     // Add this element source to the list if it's not already there.
5454     SDValue SourceVec = V.getOperand(0);
5455     auto Source = find(Sources, SourceVec);
5456     if (Source == Sources.end())
5457       Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
5458
5459     // Update the minimum and maximum lane number seen.
5460     unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
5461     Source->MinElt = std::min(Source->MinElt, EltNo);
5462     Source->MaxElt = std::max(Source->MaxElt, EltNo);
5463   }
5464
5465   if (Sources.size() > 2) {
5466     DEBUG(dbgs() << "Reshuffle failed: currently only do something sane when at "
5467                     "most two source vectors are involved\n");
5468     return SDValue();
5469   }
5470
5471   // Find out the smallest element size among result and two sources, and use
5472   // it as element size to build the shuffle_vector.
5473   EVT SmallestEltTy = VT.getVectorElementType();
5474   for (auto &Source : Sources) {
5475     EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
5476     if (SrcEltTy.bitsLT(SmallestEltTy)) {
5477       SmallestEltTy = SrcEltTy;
5478     }
5479   }
5480   unsigned ResMultiplier =
5481       VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
5482   NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
5483   EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
5484
5485   // If the source vector is too wide or too narrow, we may nevertheless be able
5486   // to construct a compatible shuffle either by concatenating it with UNDEF or
5487   // extracting a suitable range of elements.
5488   for (auto &Src : Sources) {
5489     EVT SrcVT = Src.ShuffleVec.getValueType();
5490
5491     if (SrcVT.getSizeInBits() == VT.getSizeInBits())
5492       continue;
5493
5494     // This stage of the search produces a source with the same element type as
5495     // the original, but with a total width matching the BUILD_VECTOR output.
5496     EVT EltVT = SrcVT.getVectorElementType();
5497     unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
5498     EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
5499
5500     if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
5501       assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
5502       // We can pad out the smaller vector for free, so if it's part of a
5503       // shuffle...
5504       Src.ShuffleVec =
5505           DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
5506                       DAG.getUNDEF(Src.ShuffleVec.getValueType()));
5507       continue;
5508     }
5509
5510     assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
5511
5512     if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
5513       DEBUG(dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
5514       return SDValue();
5515     }
5516
5517     if (Src.MinElt >= NumSrcElts) {
5518       // The extraction can just take the second half
5519       Src.ShuffleVec =
5520           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
5521                       DAG.getConstant(NumSrcElts, dl, MVT::i64));
5522       Src.WindowBase = -NumSrcElts;
5523     } else if (Src.MaxElt < NumSrcElts) {
5524       // The extraction can just take the first half
5525       Src.ShuffleVec =
5526           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
5527                       DAG.getConstant(0, dl, MVT::i64));
5528     } else {
5529       // An actual VEXT is needed
5530       SDValue VEXTSrc1 =
5531           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
5532                       DAG.getConstant(0, dl, MVT::i64));
5533       SDValue VEXTSrc2 =
5534           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
5535                       DAG.getConstant(NumSrcElts, dl, MVT::i64));
5536       unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
5537
5538       Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
5539                                    VEXTSrc2,
5540                                    DAG.getConstant(Imm, dl, MVT::i32));
5541       Src.WindowBase = -Src.MinElt;
5542     }
5543   }
5544
5545   // Another possible incompatibility occurs from the vector element types. We
5546   // can fix this by bitcasting the source vectors to the same type we intend
5547   // for the shuffle.
5548   for (auto &Src : Sources) {
5549     EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
5550     if (SrcEltTy == SmallestEltTy)
5551       continue;
5552     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
5553     Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
5554     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
5555     Src.WindowBase *= Src.WindowScale;
5556   }
5557
5558   // Final sanity check before we try to actually produce a shuffle.
5559   DEBUG(
5560     for (auto Src : Sources)
5561       assert(Src.ShuffleVec.getValueType() == ShuffleVT);
5562   );
5563
5564   // The stars all align, our next step is to produce the mask for the shuffle.
5565   SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
5566   int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
5567   for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
5568     SDValue Entry = Op.getOperand(i);
5569     if (Entry.isUndef())
5570       continue;
5571
5572     auto Src = find(Sources, Entry.getOperand(0));
5573     int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
5574
5575     // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
5576     // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
5577     // segment.
5578     EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
5579     int BitsDefined =
5580         std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
5581     int LanesDefined = BitsDefined / BitsPerShuffleLane;
5582
5583     // This source is expected to fill ResMultiplier lanes of the final shuffle,
5584     // starting at the appropriate offset.
5585     int *LaneMask = &Mask[i * ResMultiplier];
5586
5587     int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
5588     ExtractBase += NumElts * (Src - Sources.begin());
5589     for (int j = 0; j < LanesDefined; ++j)
5590       LaneMask[j] = ExtractBase + j;
5591   }
5592
5593   // Final check before we try to produce nonsense...
5594   if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
5595     DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
5596     return SDValue();
5597   }
5598
5599   SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
5600   for (unsigned i = 0; i < Sources.size(); ++i)
5601     ShuffleOps[i] = Sources[i].ShuffleVec;
5602
5603   SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
5604                                          ShuffleOps[1], Mask);
5605   SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
5606
5607   DEBUG(
5608     dbgs() << "Reshuffle, creating node: ";
5609     Shuffle.dump();
5610     dbgs() << "Reshuffle, creating node: ";
5611     V.dump();
5612   );
5613
5614   return V;
5615 }
5616
5617 // check if an EXT instruction can handle the shuffle mask when the
5618 // vector sources of the shuffle are the same.
5619 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
5620   unsigned NumElts = VT.getVectorNumElements();
5621
5622   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
5623   if (M[0] < 0)
5624     return false;
5625
5626   Imm = M[0];
5627
5628   // If this is a VEXT shuffle, the immediate value is the index of the first
5629   // element.  The other shuffle indices must be the successive elements after
5630   // the first one.
5631   unsigned ExpectedElt = Imm;
5632   for (unsigned i = 1; i < NumElts; ++i) {
5633     // Increment the expected index.  If it wraps around, just follow it
5634     // back to index zero and keep going.
5635     ++ExpectedElt;
5636     if (ExpectedElt == NumElts)
5637       ExpectedElt = 0;
5638
5639     if (M[i] < 0)
5640       continue; // ignore UNDEF indices
5641     if (ExpectedElt != static_cast<unsigned>(M[i]))
5642       return false;
5643   }
5644
5645   return true;
5646 }
5647
5648 // check if an EXT instruction can handle the shuffle mask when the
5649 // vector sources of the shuffle are different.
5650 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
5651                       unsigned &Imm) {
5652   // Look for the first non-undef element.
5653   const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
5654
5655   // Benefit form APInt to handle overflow when calculating expected element.
5656   unsigned NumElts = VT.getVectorNumElements();
5657   unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
5658   APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
5659   // The following shuffle indices must be the successive elements after the
5660   // first real element.
5661   const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
5662       [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
5663   if (FirstWrongElt != M.end())
5664     return false;
5665
5666   // The index of an EXT is the first element if it is not UNDEF.
5667   // Watch out for the beginning UNDEFs. The EXT index should be the expected
5668   // value of the first element.  E.g.
5669   // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
5670   // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
5671   // ExpectedElt is the last mask index plus 1.
5672   Imm = ExpectedElt.getZExtValue();
5673
5674   // There are two difference cases requiring to reverse input vectors.
5675   // For example, for vector <4 x i32> we have the following cases,
5676   // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
5677   // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
5678   // For both cases, we finally use mask <5, 6, 7, 0>, which requires
5679   // to reverse two input vectors.
5680   if (Imm < NumElts)
5681     ReverseEXT = true;
5682   else
5683     Imm -= NumElts;
5684
5685   return true;
5686 }
5687
5688 /// isREVMask - Check if a vector shuffle corresponds to a REV
5689 /// instruction with the specified blocksize.  (The order of the elements
5690 /// within each block of the vector is reversed.)
5691 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
5692   assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
5693          "Only possible block sizes for REV are: 16, 32, 64");
5694
5695   unsigned EltSz = VT.getScalarSizeInBits();
5696   if (EltSz == 64)
5697     return false;
5698
5699   unsigned NumElts = VT.getVectorNumElements();
5700   unsigned BlockElts = M[0] + 1;
5701   // If the first shuffle index is UNDEF, be optimistic.
5702   if (M[0] < 0)
5703     BlockElts = BlockSize / EltSz;
5704
5705   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
5706     return false;
5707
5708   for (unsigned i = 0; i < NumElts; ++i) {
5709     if (M[i] < 0)
5710       continue; // ignore UNDEF indices
5711     if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
5712       return false;
5713   }
5714
5715   return true;
5716 }
5717
5718 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5719   unsigned NumElts = VT.getVectorNumElements();
5720   WhichResult = (M[0] == 0 ? 0 : 1);
5721   unsigned Idx = WhichResult * NumElts / 2;
5722   for (unsigned i = 0; i != NumElts; i += 2) {
5723     if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
5724         (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
5725       return false;
5726     Idx += 1;
5727   }
5728
5729   return true;
5730 }
5731
5732 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5733   unsigned NumElts = VT.getVectorNumElements();
5734   WhichResult = (M[0] == 0 ? 0 : 1);
5735   for (unsigned i = 0; i != NumElts; ++i) {
5736     if (M[i] < 0)
5737       continue; // ignore UNDEF indices
5738     if ((unsigned)M[i] != 2 * i + WhichResult)
5739       return false;
5740   }
5741
5742   return true;
5743 }
5744
5745 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5746   unsigned NumElts = VT.getVectorNumElements();
5747   WhichResult = (M[0] == 0 ? 0 : 1);
5748   for (unsigned i = 0; i < NumElts; i += 2) {
5749     if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
5750         (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
5751       return false;
5752   }
5753   return true;
5754 }
5755
5756 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
5757 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5758 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
5759 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5760   unsigned NumElts = VT.getVectorNumElements();
5761   WhichResult = (M[0] == 0 ? 0 : 1);
5762   unsigned Idx = WhichResult * NumElts / 2;
5763   for (unsigned i = 0; i != NumElts; i += 2) {
5764     if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
5765         (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
5766       return false;
5767     Idx += 1;
5768   }
5769
5770   return true;
5771 }
5772
5773 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
5774 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5775 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
5776 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5777   unsigned Half = VT.getVectorNumElements() / 2;
5778   WhichResult = (M[0] == 0 ? 0 : 1);
5779   for (unsigned j = 0; j != 2; ++j) {
5780     unsigned Idx = WhichResult;
5781     for (unsigned i = 0; i != Half; ++i) {
5782       int MIdx = M[i + j * Half];
5783       if (MIdx >= 0 && (unsigned)MIdx != Idx)
5784         return false;
5785       Idx += 2;
5786     }
5787   }
5788
5789   return true;
5790 }
5791
5792 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
5793 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
5794 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
5795 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
5796   unsigned NumElts = VT.getVectorNumElements();
5797   WhichResult = (M[0] == 0 ? 0 : 1);
5798   for (unsigned i = 0; i < NumElts; i += 2) {
5799     if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
5800         (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
5801       return false;
5802   }
5803   return true;
5804 }
5805
5806 static bool isINSMask(ArrayRef<int> M, int NumInputElements,
5807                       bool &DstIsLeft, int &Anomaly) {
5808   if (M.size() != static_cast<size_t>(NumInputElements))
5809     return false;
5810
5811   int NumLHSMatch = 0, NumRHSMatch = 0;
5812   int LastLHSMismatch = -1, LastRHSMismatch = -1;
5813
5814   for (int i = 0; i < NumInputElements; ++i) {
5815     if (M[i] == -1) {
5816       ++NumLHSMatch;
5817       ++NumRHSMatch;
5818       continue;
5819     }
5820
5821     if (M[i] == i)
5822       ++NumLHSMatch;
5823     else
5824       LastLHSMismatch = i;
5825
5826     if (M[i] == i + NumInputElements)
5827       ++NumRHSMatch;
5828     else
5829       LastRHSMismatch = i;
5830   }
5831
5832   if (NumLHSMatch == NumInputElements - 1) {
5833     DstIsLeft = true;
5834     Anomaly = LastLHSMismatch;
5835     return true;
5836   } else if (NumRHSMatch == NumInputElements - 1) {
5837     DstIsLeft = false;
5838     Anomaly = LastRHSMismatch;
5839     return true;
5840   }
5841
5842   return false;
5843 }
5844
5845 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
5846   if (VT.getSizeInBits() != 128)
5847     return false;
5848
5849   unsigned NumElts = VT.getVectorNumElements();
5850
5851   for (int I = 0, E = NumElts / 2; I != E; I++) {
5852     if (Mask[I] != I)
5853       return false;
5854   }
5855
5856   int Offset = NumElts / 2;
5857   for (int I = NumElts / 2, E = NumElts; I != E; I++) {
5858     if (Mask[I] != I + SplitLHS * Offset)
5859       return false;
5860   }
5861
5862   return true;
5863 }
5864
5865 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
5866   SDLoc DL(Op);
5867   EVT VT = Op.getValueType();
5868   SDValue V0 = Op.getOperand(0);
5869   SDValue V1 = Op.getOperand(1);
5870   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
5871
5872   if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
5873       VT.getVectorElementType() != V1.getValueType().getVectorElementType())
5874     return SDValue();
5875
5876   bool SplitV0 = V0.getValueSizeInBits() == 128;
5877
5878   if (!isConcatMask(Mask, VT, SplitV0))
5879     return SDValue();
5880
5881   EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
5882                                 VT.getVectorNumElements() / 2);
5883   if (SplitV0) {
5884     V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
5885                      DAG.getConstant(0, DL, MVT::i64));
5886   }
5887   if (V1.getValueSizeInBits() == 128) {
5888     V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
5889                      DAG.getConstant(0, DL, MVT::i64));
5890   }
5891   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
5892 }
5893
5894 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
5895 /// the specified operations to build the shuffle.
5896 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
5897                                       SDValue RHS, SelectionDAG &DAG,
5898                                       const SDLoc &dl) {
5899   unsigned OpNum = (PFEntry >> 26) & 0x0F;
5900   unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
5901   unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
5902
5903   enum {
5904     OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
5905     OP_VREV,
5906     OP_VDUP0,
5907     OP_VDUP1,
5908     OP_VDUP2,
5909     OP_VDUP3,
5910     OP_VEXT1,
5911     OP_VEXT2,
5912     OP_VEXT3,
5913     OP_VUZPL, // VUZP, left result
5914     OP_VUZPR, // VUZP, right result
5915     OP_VZIPL, // VZIP, left result
5916     OP_VZIPR, // VZIP, right result
5917     OP_VTRNL, // VTRN, left result
5918     OP_VTRNR  // VTRN, right result
5919   };
5920
5921   if (OpNum == OP_COPY) {
5922     if (LHSID == (1 * 9 + 2) * 9 + 3)
5923       return LHS;
5924     assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
5925     return RHS;
5926   }
5927
5928   SDValue OpLHS, OpRHS;
5929   OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
5930   OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
5931   EVT VT = OpLHS.getValueType();
5932
5933   switch (OpNum) {
5934   default:
5935     llvm_unreachable("Unknown shuffle opcode!");
5936   case OP_VREV:
5937     // VREV divides the vector in half and swaps within the half.
5938     if (VT.getVectorElementType() == MVT::i32 ||
5939         VT.getVectorElementType() == MVT::f32)
5940       return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
5941     // vrev <4 x i16> -> REV32
5942     if (VT.getVectorElementType() == MVT::i16 ||
5943         VT.getVectorElementType() == MVT::f16)
5944       return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
5945     // vrev <4 x i8> -> REV16
5946     assert(VT.getVectorElementType() == MVT::i8);
5947     return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
5948   case OP_VDUP0:
5949   case OP_VDUP1:
5950   case OP_VDUP2:
5951   case OP_VDUP3: {
5952     EVT EltTy = VT.getVectorElementType();
5953     unsigned Opcode;
5954     if (EltTy == MVT::i8)
5955       Opcode = AArch64ISD::DUPLANE8;
5956     else if (EltTy == MVT::i16 || EltTy == MVT::f16)
5957       Opcode = AArch64ISD::DUPLANE16;
5958     else if (EltTy == MVT::i32 || EltTy == MVT::f32)
5959       Opcode = AArch64ISD::DUPLANE32;
5960     else if (EltTy == MVT::i64 || EltTy == MVT::f64)
5961       Opcode = AArch64ISD::DUPLANE64;
5962     else
5963       llvm_unreachable("Invalid vector element type?");
5964
5965     if (VT.getSizeInBits() == 64)
5966       OpLHS = WidenVector(OpLHS, DAG);
5967     SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
5968     return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
5969   }
5970   case OP_VEXT1:
5971   case OP_VEXT2:
5972   case OP_VEXT3: {
5973     unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
5974     return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
5975                        DAG.getConstant(Imm, dl, MVT::i32));
5976   }
5977   case OP_VUZPL:
5978     return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
5979                        OpRHS);
5980   case OP_VUZPR:
5981     return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
5982                        OpRHS);
5983   case OP_VZIPL:
5984     return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
5985                        OpRHS);
5986   case OP_VZIPR:
5987     return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
5988                        OpRHS);
5989   case OP_VTRNL:
5990     return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
5991                        OpRHS);
5992   case OP_VTRNR:
5993     return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
5994                        OpRHS);
5995   }
5996 }
5997
5998 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
5999                            SelectionDAG &DAG) {
6000   // Check to see if we can use the TBL instruction.
6001   SDValue V1 = Op.getOperand(0);
6002   SDValue V2 = Op.getOperand(1);
6003   SDLoc DL(Op);
6004
6005   EVT EltVT = Op.getValueType().getVectorElementType();
6006   unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
6007
6008   SmallVector<SDValue, 8> TBLMask;
6009   for (int Val : ShuffleMask) {
6010     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
6011       unsigned Offset = Byte + Val * BytesPerElt;
6012       TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
6013     }
6014   }
6015
6016   MVT IndexVT = MVT::v8i8;
6017   unsigned IndexLen = 8;
6018   if (Op.getValueSizeInBits() == 128) {
6019     IndexVT = MVT::v16i8;
6020     IndexLen = 16;
6021   }
6022
6023   SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
6024   SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
6025
6026   SDValue Shuffle;
6027   if (V2.getNode()->isUndef()) {
6028     if (IndexLen == 8)
6029       V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
6030     Shuffle = DAG.getNode(
6031         ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
6032         DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
6033         DAG.getBuildVector(IndexVT, DL,
6034                            makeArrayRef(TBLMask.data(), IndexLen)));
6035   } else {
6036     if (IndexLen == 8) {
6037       V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
6038       Shuffle = DAG.getNode(
6039           ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
6040           DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
6041           DAG.getBuildVector(IndexVT, DL,
6042                              makeArrayRef(TBLMask.data(), IndexLen)));
6043     } else {
6044       // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
6045       // cannot currently represent the register constraints on the input
6046       // table registers.
6047       //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
6048       //                   DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
6049       //                   IndexLen));
6050       Shuffle = DAG.getNode(
6051           ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
6052           DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
6053           V2Cst, DAG.getBuildVector(IndexVT, DL,
6054                                     makeArrayRef(TBLMask.data(), IndexLen)));
6055     }
6056   }
6057   return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
6058 }
6059
6060 static unsigned getDUPLANEOp(EVT EltType) {
6061   if (EltType == MVT::i8)
6062     return AArch64ISD::DUPLANE8;
6063   if (EltType == MVT::i16 || EltType == MVT::f16)
6064     return AArch64ISD::DUPLANE16;
6065   if (EltType == MVT::i32 || EltType == MVT::f32)
6066     return AArch64ISD::DUPLANE32;
6067   if (EltType == MVT::i64 || EltType == MVT::f64)
6068     return AArch64ISD::DUPLANE64;
6069
6070   llvm_unreachable("Invalid vector element type?");
6071 }
6072
6073 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
6074                                                    SelectionDAG &DAG) const {
6075   SDLoc dl(Op);
6076   EVT VT = Op.getValueType();
6077
6078   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
6079
6080   // Convert shuffles that are directly supported on NEON to target-specific
6081   // DAG nodes, instead of keeping them as shuffles and matching them again
6082   // during code selection.  This is more efficient and avoids the possibility
6083   // of inconsistencies between legalization and selection.
6084   ArrayRef<int> ShuffleMask = SVN->getMask();
6085
6086   SDValue V1 = Op.getOperand(0);
6087   SDValue V2 = Op.getOperand(1);
6088
6089   if (SVN->isSplat()) {
6090     int Lane = SVN->getSplatIndex();
6091     // If this is undef splat, generate it via "just" vdup, if possible.
6092     if (Lane == -1)
6093       Lane = 0;
6094
6095     if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
6096       return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
6097                          V1.getOperand(0));
6098     // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
6099     // constant. If so, we can just reference the lane's definition directly.
6100     if (V1.getOpcode() == ISD::BUILD_VECTOR &&
6101         !isa<ConstantSDNode>(V1.getOperand(Lane)))
6102       return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
6103
6104     // Otherwise, duplicate from the lane of the input vector.
6105     unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
6106
6107     // SelectionDAGBuilder may have "helpfully" already extracted or conatenated
6108     // to make a vector of the same size as this SHUFFLE. We can ignore the
6109     // extract entirely, and canonicalise the concat using WidenVector.
6110     if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
6111       Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
6112       V1 = V1.getOperand(0);
6113     } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
6114       unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
6115       Lane -= Idx * VT.getVectorNumElements() / 2;
6116       V1 = WidenVector(V1.getOperand(Idx), DAG);
6117     } else if (VT.getSizeInBits() == 64)
6118       V1 = WidenVector(V1, DAG);
6119
6120     return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
6121   }
6122
6123   if (isREVMask(ShuffleMask, VT, 64))
6124     return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
6125   if (isREVMask(ShuffleMask, VT, 32))
6126     return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
6127   if (isREVMask(ShuffleMask, VT, 16))
6128     return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
6129
6130   bool ReverseEXT = false;
6131   unsigned Imm;
6132   if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
6133     if (ReverseEXT)
6134       std::swap(V1, V2);
6135     Imm *= getExtFactor(V1);
6136     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
6137                        DAG.getConstant(Imm, dl, MVT::i32));
6138   } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
6139     Imm *= getExtFactor(V1);
6140     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
6141                        DAG.getConstant(Imm, dl, MVT::i32));
6142   }
6143
6144   unsigned WhichResult;
6145   if (isZIPMask(ShuffleMask, VT, WhichResult)) {
6146     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
6147     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
6148   }
6149   if (isUZPMask(ShuffleMask, VT, WhichResult)) {
6150     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
6151     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
6152   }
6153   if (isTRNMask(ShuffleMask, VT, WhichResult)) {
6154     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
6155     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
6156   }
6157
6158   if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
6159     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
6160     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
6161   }
6162   if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
6163     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
6164     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
6165   }
6166   if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
6167     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
6168     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
6169   }
6170
6171   if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
6172     return Concat;
6173
6174   bool DstIsLeft;
6175   int Anomaly;
6176   int NumInputElements = V1.getValueType().getVectorNumElements();
6177   if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
6178     SDValue DstVec = DstIsLeft ? V1 : V2;
6179     SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
6180
6181     SDValue SrcVec = V1;
6182     int SrcLane = ShuffleMask[Anomaly];
6183     if (SrcLane >= NumInputElements) {
6184       SrcVec = V2;
6185       SrcLane -= VT.getVectorNumElements();
6186     }
6187     SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
6188
6189     EVT ScalarVT = VT.getVectorElementType();
6190
6191     if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
6192       ScalarVT = MVT::i32;
6193
6194     return DAG.getNode(
6195         ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6196         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
6197         DstLaneV);
6198   }
6199
6200   // If the shuffle is not directly supported and it has 4 elements, use
6201   // the PerfectShuffle-generated table to synthesize it from other shuffles.
6202   unsigned NumElts = VT.getVectorNumElements();
6203   if (NumElts == 4) {
6204     unsigned PFIndexes[4];
6205     for (unsigned i = 0; i != 4; ++i) {
6206       if (ShuffleMask[i] < 0)
6207         PFIndexes[i] = 8;
6208       else
6209         PFIndexes[i] = ShuffleMask[i];
6210     }
6211
6212     // Compute the index in the perfect shuffle table.
6213     unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
6214                             PFIndexes[2] * 9 + PFIndexes[3];
6215     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
6216     unsigned Cost = (PFEntry >> 30);
6217
6218     if (Cost <= 4)
6219       return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
6220   }
6221
6222   return GenerateTBL(Op, ShuffleMask, DAG);
6223 }
6224
6225 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
6226                                APInt &UndefBits) {
6227   EVT VT = BVN->getValueType(0);
6228   APInt SplatBits, SplatUndef;
6229   unsigned SplatBitSize;
6230   bool HasAnyUndefs;
6231   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
6232     unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
6233
6234     for (unsigned i = 0; i < NumSplats; ++i) {
6235       CnstBits <<= SplatBitSize;
6236       UndefBits <<= SplatBitSize;
6237       CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
6238       UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
6239     }
6240
6241     return true;
6242   }
6243
6244   return false;
6245 }
6246
6247 SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
6248                                               SelectionDAG &DAG) const {
6249   BuildVectorSDNode *BVN =
6250       dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
6251   SDValue LHS = Op.getOperand(0);
6252   SDLoc dl(Op);
6253   EVT VT = Op.getValueType();
6254
6255   if (!BVN)
6256     return Op;
6257
6258   APInt CnstBits(VT.getSizeInBits(), 0);
6259   APInt UndefBits(VT.getSizeInBits(), 0);
6260   if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
6261     // We only have BIC vector immediate instruction, which is and-not.
6262     CnstBits = ~CnstBits;
6263
6264     // We make use of a little bit of goto ickiness in order to avoid having to
6265     // duplicate the immediate matching logic for the undef toggled case.
6266     bool SecondTry = false;
6267   AttemptModImm:
6268
6269     if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
6270       CnstBits = CnstBits.zextOrTrunc(64);
6271       uint64_t CnstVal = CnstBits.getZExtValue();
6272
6273       if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
6274         CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
6275         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6276         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
6277                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6278                                   DAG.getConstant(0, dl, MVT::i32));
6279         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6280       }
6281
6282       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
6283         CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
6284         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6285         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
6286                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6287                                   DAG.getConstant(8, dl, MVT::i32));
6288         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6289       }
6290
6291       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
6292         CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
6293         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6294         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
6295                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6296                                   DAG.getConstant(16, dl, MVT::i32));
6297         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6298       }
6299
6300       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
6301         CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
6302         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6303         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
6304                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6305                                   DAG.getConstant(24, dl, MVT::i32));
6306         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6307       }
6308
6309       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
6310         CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
6311         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
6312         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
6313                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6314                                   DAG.getConstant(0, dl, MVT::i32));
6315         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6316       }
6317
6318       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
6319         CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
6320         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
6321         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
6322                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6323                                   DAG.getConstant(8, dl, MVT::i32));
6324         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6325       }
6326     }
6327
6328     if (SecondTry)
6329       goto FailedModImm;
6330     SecondTry = true;
6331     CnstBits = ~UndefBits;
6332     goto AttemptModImm;
6333   }
6334
6335 // We can always fall back to a non-immediate AND.
6336 FailedModImm:
6337   return Op;
6338 }
6339
6340 // Specialized code to quickly find if PotentialBVec is a BuildVector that
6341 // consists of only the same constant int value, returned in reference arg
6342 // ConstVal
6343 static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
6344                                      uint64_t &ConstVal) {
6345   BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
6346   if (!Bvec)
6347     return false;
6348   ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
6349   if (!FirstElt)
6350     return false;
6351   EVT VT = Bvec->getValueType(0);
6352   unsigned NumElts = VT.getVectorNumElements();
6353   for (unsigned i = 1; i < NumElts; ++i)
6354     if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
6355       return false;
6356   ConstVal = FirstElt->getZExtValue();
6357   return true;
6358 }
6359
6360 static unsigned getIntrinsicID(const SDNode *N) {
6361   unsigned Opcode = N->getOpcode();
6362   switch (Opcode) {
6363   default:
6364     return Intrinsic::not_intrinsic;
6365   case ISD::INTRINSIC_WO_CHAIN: {
6366     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
6367     if (IID < Intrinsic::num_intrinsics)
6368       return IID;
6369     return Intrinsic::not_intrinsic;
6370   }
6371   }
6372 }
6373
6374 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
6375 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
6376 // BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2.
6377 // Also, logical shift right -> sri, with the same structure.
6378 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
6379   EVT VT = N->getValueType(0);
6380
6381   if (!VT.isVector())
6382     return SDValue();
6383
6384   SDLoc DL(N);
6385
6386   // Is the first op an AND?
6387   const SDValue And = N->getOperand(0);
6388   if (And.getOpcode() != ISD::AND)
6389     return SDValue();
6390
6391   // Is the second op an shl or lshr?
6392   SDValue Shift = N->getOperand(1);
6393   // This will have been turned into: AArch64ISD::VSHL vector, #shift
6394   // or AArch64ISD::VLSHR vector, #shift
6395   unsigned ShiftOpc = Shift.getOpcode();
6396   if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
6397     return SDValue();
6398   bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
6399
6400   // Is the shift amount constant?
6401   ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
6402   if (!C2node)
6403     return SDValue();
6404
6405   // Is the and mask vector all constant?
6406   uint64_t C1;
6407   if (!isAllConstantBuildVector(And.getOperand(1), C1))
6408     return SDValue();
6409
6410   // Is C1 == ~C2, taking into account how much one can shift elements of a
6411   // particular size?
6412   uint64_t C2 = C2node->getZExtValue();
6413   unsigned ElemSizeInBits = VT.getScalarSizeInBits();
6414   if (C2 > ElemSizeInBits)
6415     return SDValue();
6416   unsigned ElemMask = (1 << ElemSizeInBits) - 1;
6417   if ((C1 & ElemMask) != (~C2 & ElemMask))
6418     return SDValue();
6419
6420   SDValue X = And.getOperand(0);
6421   SDValue Y = Shift.getOperand(0);
6422
6423   unsigned Intrin =
6424       IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
6425   SDValue ResultSLI =
6426       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
6427                   DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
6428                   Shift.getOperand(1));
6429
6430   DEBUG(dbgs() << "aarch64-lower: transformed: \n");
6431   DEBUG(N->dump(&DAG));
6432   DEBUG(dbgs() << "into: \n");
6433   DEBUG(ResultSLI->dump(&DAG));
6434
6435   ++NumShiftInserts;
6436   return ResultSLI;
6437 }
6438
6439 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
6440                                              SelectionDAG &DAG) const {
6441   // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
6442   if (EnableAArch64SlrGeneration) {
6443     if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
6444       return Res;
6445   }
6446
6447   BuildVectorSDNode *BVN =
6448       dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
6449   SDValue LHS = Op.getOperand(1);
6450   SDLoc dl(Op);
6451   EVT VT = Op.getValueType();
6452
6453   // OR commutes, so try swapping the operands.
6454   if (!BVN) {
6455     LHS = Op.getOperand(0);
6456     BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
6457   }
6458   if (!BVN)
6459     return Op;
6460
6461   APInt CnstBits(VT.getSizeInBits(), 0);
6462   APInt UndefBits(VT.getSizeInBits(), 0);
6463   if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
6464     // We make use of a little bit of goto ickiness in order to avoid having to
6465     // duplicate the immediate matching logic for the undef toggled case.
6466     bool SecondTry = false;
6467   AttemptModImm:
6468
6469     if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
6470       CnstBits = CnstBits.zextOrTrunc(64);
6471       uint64_t CnstVal = CnstBits.getZExtValue();
6472
6473       if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
6474         CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
6475         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6476         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
6477                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6478                                   DAG.getConstant(0, dl, MVT::i32));
6479         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6480       }
6481
6482       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
6483         CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
6484         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6485         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
6486                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6487                                   DAG.getConstant(8, dl, MVT::i32));
6488         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6489       }
6490
6491       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
6492         CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
6493         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6494         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
6495                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6496                                   DAG.getConstant(16, dl, MVT::i32));
6497         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6498       }
6499
6500       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
6501         CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
6502         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6503         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
6504                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6505                                   DAG.getConstant(24, dl, MVT::i32));
6506         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6507       }
6508
6509       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
6510         CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
6511         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
6512         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
6513                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6514                                   DAG.getConstant(0, dl, MVT::i32));
6515         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6516       }
6517
6518       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
6519         CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
6520         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
6521         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
6522                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6523                                   DAG.getConstant(8, dl, MVT::i32));
6524         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6525       }
6526     }
6527
6528     if (SecondTry)
6529       goto FailedModImm;
6530     SecondTry = true;
6531     CnstBits = UndefBits;
6532     goto AttemptModImm;
6533   }
6534
6535 // We can always fall back to a non-immediate OR.
6536 FailedModImm:
6537   return Op;
6538 }
6539
6540 // Normalize the operands of BUILD_VECTOR. The value of constant operands will
6541 // be truncated to fit element width.
6542 static SDValue NormalizeBuildVector(SDValue Op,
6543                                     SelectionDAG &DAG) {
6544   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
6545   SDLoc dl(Op);
6546   EVT VT = Op.getValueType();
6547   EVT EltTy= VT.getVectorElementType();
6548
6549   if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
6550     return Op;
6551
6552   SmallVector<SDValue, 16> Ops;
6553   for (SDValue Lane : Op->ops()) {
6554     if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
6555       APInt LowBits(EltTy.getSizeInBits(),
6556                     CstLane->getZExtValue());
6557       Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
6558     }
6559     Ops.push_back(Lane);
6560   }
6561   return DAG.getBuildVector(VT, dl, Ops);
6562 }
6563
6564 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
6565                                                  SelectionDAG &DAG) const {
6566   SDLoc dl(Op);
6567   EVT VT = Op.getValueType();
6568   Op = NormalizeBuildVector(Op, DAG);
6569   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
6570
6571   APInt CnstBits(VT.getSizeInBits(), 0);
6572   APInt UndefBits(VT.getSizeInBits(), 0);
6573   if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
6574     // We make use of a little bit of goto ickiness in order to avoid having to
6575     // duplicate the immediate matching logic for the undef toggled case.
6576     bool SecondTry = false;
6577   AttemptModImm:
6578
6579     if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
6580       CnstBits = CnstBits.zextOrTrunc(64);
6581       uint64_t CnstVal = CnstBits.getZExtValue();
6582
6583       // Certain magic vector constants (used to express things like NOT
6584       // and NEG) are passed through unmodified.  This allows codegen patterns
6585       // for these operations to match.  Special-purpose patterns will lower
6586       // these immediates to MOVIs if it proves necessary.
6587       if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
6588         return Op;
6589
6590       // The many faces of MOVI...
6591       if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
6592         CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
6593         if (VT.getSizeInBits() == 128) {
6594           SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
6595                                     DAG.getConstant(CnstVal, dl, MVT::i32));
6596           return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6597         }
6598
6599         // Support the V64 version via subregister insertion.
6600         SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
6601                                   DAG.getConstant(CnstVal, dl, MVT::i32));
6602         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6603       }
6604
6605       if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
6606         CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
6607         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6608         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
6609                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6610                                   DAG.getConstant(0, dl, MVT::i32));
6611         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6612       }
6613
6614       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
6615         CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
6616         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6617         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
6618                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6619                                   DAG.getConstant(8, dl, MVT::i32));
6620         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6621       }
6622
6623       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
6624         CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
6625         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6626         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
6627                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6628                                   DAG.getConstant(16, dl, MVT::i32));
6629         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6630       }
6631
6632       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
6633         CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
6634         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6635         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
6636                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6637                                   DAG.getConstant(24, dl, MVT::i32));
6638         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6639       }
6640
6641       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
6642         CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
6643         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
6644         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
6645                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6646                                   DAG.getConstant(0, dl, MVT::i32));
6647         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6648       }
6649
6650       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
6651         CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
6652         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
6653         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
6654                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6655                                   DAG.getConstant(8, dl, MVT::i32));
6656         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6657       }
6658
6659       if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
6660         CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
6661         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6662         SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
6663                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6664                                   DAG.getConstant(264, dl, MVT::i32));
6665         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6666       }
6667
6668       if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
6669         CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
6670         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6671         SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
6672                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6673                                   DAG.getConstant(272, dl, MVT::i32));
6674         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6675       }
6676
6677       if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
6678         CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
6679         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
6680         SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
6681                                   DAG.getConstant(CnstVal, dl, MVT::i32));
6682         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6683       }
6684
6685       // The few faces of FMOV...
6686       if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
6687         CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
6688         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
6689         SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
6690                                   DAG.getConstant(CnstVal, dl, MVT::i32));
6691         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6692       }
6693
6694       if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
6695           VT.getSizeInBits() == 128) {
6696         CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
6697         SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
6698                                   DAG.getConstant(CnstVal, dl, MVT::i32));
6699         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6700       }
6701
6702       // The many faces of MVNI...
6703       CnstVal = ~CnstVal;
6704       if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
6705         CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
6706         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6707         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
6708                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6709                                   DAG.getConstant(0, dl, MVT::i32));
6710         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6711       }
6712
6713       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
6714         CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
6715         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6716         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
6717                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6718                                   DAG.getConstant(8, dl, MVT::i32));
6719         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6720       }
6721
6722       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
6723         CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
6724         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6725         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
6726                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6727                                   DAG.getConstant(16, dl, MVT::i32));
6728         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6729       }
6730
6731       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
6732         CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
6733         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6734         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
6735                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6736                                   DAG.getConstant(24, dl, MVT::i32));
6737         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6738       }
6739
6740       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
6741         CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
6742         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
6743         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
6744                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6745                                   DAG.getConstant(0, dl, MVT::i32));
6746         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6747       }
6748
6749       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
6750         CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
6751         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
6752         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
6753                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6754                                   DAG.getConstant(8, dl, MVT::i32));
6755         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6756       }
6757
6758       if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
6759         CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
6760         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6761         SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
6762                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6763                                   DAG.getConstant(264, dl, MVT::i32));
6764         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6765       }
6766
6767       if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
6768         CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
6769         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
6770         SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
6771                                   DAG.getConstant(CnstVal, dl, MVT::i32),
6772                                   DAG.getConstant(272, dl, MVT::i32));
6773         return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
6774       }
6775     }
6776
6777     if (SecondTry)
6778       goto FailedModImm;
6779     SecondTry = true;
6780     CnstBits = UndefBits;
6781     goto AttemptModImm;
6782   }
6783 FailedModImm:
6784
6785   // Scan through the operands to find some interesting properties we can
6786   // exploit:
6787   //   1) If only one value is used, we can use a DUP, or
6788   //   2) if only the low element is not undef, we can just insert that, or
6789   //   3) if only one constant value is used (w/ some non-constant lanes),
6790   //      we can splat the constant value into the whole vector then fill
6791   //      in the non-constant lanes.
6792   //   4) FIXME: If different constant values are used, but we can intelligently
6793   //             select the values we'll be overwriting for the non-constant
6794   //             lanes such that we can directly materialize the vector
6795   //             some other way (MOVI, e.g.), we can be sneaky.
6796   unsigned NumElts = VT.getVectorNumElements();
6797   bool isOnlyLowElement = true;
6798   bool usesOnlyOneValue = true;
6799   bool usesOnlyOneConstantValue = true;
6800   bool isConstant = true;
6801   unsigned NumConstantLanes = 0;
6802   SDValue Value;
6803   SDValue ConstantValue;
6804   for (unsigned i = 0; i < NumElts; ++i) {
6805     SDValue V = Op.getOperand(i);
6806     if (V.isUndef())
6807       continue;
6808     if (i > 0)
6809       isOnlyLowElement = false;
6810     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
6811       isConstant = false;
6812
6813     if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
6814       ++NumConstantLanes;
6815       if (!ConstantValue.getNode())
6816         ConstantValue = V;
6817       else if (ConstantValue != V)
6818         usesOnlyOneConstantValue = false;
6819     }
6820
6821     if (!Value.getNode())
6822       Value = V;
6823     else if (V != Value)
6824       usesOnlyOneValue = false;
6825   }
6826
6827   if (!Value.getNode()) {
6828     DEBUG(dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
6829     return DAG.getUNDEF(VT);
6830   }
6831
6832   if (isOnlyLowElement) {
6833     DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
6834                     "SCALAR_TO_VECTOR node\n");
6835     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
6836   }
6837
6838   // Use DUP for non-constant splats. For f32 constant splats, reduce to
6839   // i32 and try again.
6840   if (usesOnlyOneValue) {
6841     if (!isConstant) {
6842       if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6843           Value.getValueType() != VT) {
6844         DEBUG(dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
6845         return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
6846       }
6847
6848       // This is actually a DUPLANExx operation, which keeps everything vectory.
6849
6850       SDValue Lane = Value.getOperand(1);
6851       Value = Value.getOperand(0);
6852       if (Value.getValueSizeInBits() == 64) {
6853         DEBUG(dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
6854                         "widening it\n");
6855         Value = WidenVector(Value, DAG);
6856       }
6857
6858       unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
6859       return DAG.getNode(Opcode, dl, VT, Value, Lane);
6860     }
6861
6862     if (VT.getVectorElementType().isFloatingPoint()) {
6863       SmallVector<SDValue, 8> Ops;
6864       EVT EltTy = VT.getVectorElementType();
6865       assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
6866               "Unsupported floating-point vector type");
6867       DEBUG(dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
6868                       "BITCASTS, and try again\n");
6869       MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
6870       for (unsigned i = 0; i < NumElts; ++i)
6871         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
6872       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
6873       SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
6874       DEBUG(
6875         dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
6876         Val.dump();
6877       );
6878       Val = LowerBUILD_VECTOR(Val, DAG);
6879       if (Val.getNode())
6880         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
6881     }
6882   }
6883
6884   // If there was only one constant value used and for more than one lane,
6885   // start by splatting that value, then replace the non-constant lanes. This
6886   // is better than the default, which will perform a separate initialization
6887   // for each lane.
6888   if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
6889     SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
6890     // Now insert the non-constant lanes.
6891     for (unsigned i = 0; i < NumElts; ++i) {
6892       SDValue V = Op.getOperand(i);
6893       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
6894       if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
6895         // Note that type legalization likely mucked about with the VT of the
6896         // source operand, so we may have to convert it here before inserting.
6897         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
6898       }
6899     }
6900     return Val;
6901   }
6902
6903   // This will generate a load from the constant pool.
6904   if (isConstant) {
6905     DEBUG(dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
6906                     "expansion\n");
6907     return SDValue();
6908   }
6909
6910   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
6911   if (NumElts >= 4) {
6912     if (SDValue shuffle = ReconstructShuffle(Op, DAG))
6913       return shuffle;
6914   }
6915
6916   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
6917   // know the default expansion would otherwise fall back on something even
6918   // worse. For a vector with one or two non-undef values, that's
6919   // scalar_to_vector for the elements followed by a shuffle (provided the
6920   // shuffle is valid for the target) and materialization element by element
6921   // on the stack followed by a load for everything else.
6922   if (!isConstant && !usesOnlyOneValue) {
6923     DEBUG(dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
6924                     "of INSERT_VECTOR_ELT\n");
6925
6926     SDValue Vec = DAG.getUNDEF(VT);
6927     SDValue Op0 = Op.getOperand(0);
6928     unsigned i = 0;
6929
6930     // Use SCALAR_TO_VECTOR for lane zero to
6931     // a) Avoid a RMW dependency on the full vector register, and
6932     // b) Allow the register coalescer to fold away the copy if the
6933     //    value is already in an S or D register, and we're forced to emit an
6934     //    INSERT_SUBREG that we can't fold anywhere.
6935     //
6936     // We also allow types like i8 and i16 which are illegal scalar but legal
6937     // vector element types. After type-legalization the inserted value is
6938     // extended (i32) and it is safe to cast them to the vector type by ignoring
6939     // the upper bits of the lowest lane (e.g. v8i8, v4i16).
6940     if (!Op0.isUndef()) {
6941       DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
6942       Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
6943       ++i;
6944     }
6945     DEBUG(
6946       if (i < NumElts)
6947         dbgs() << "Creating nodes for the other vector elements:\n";
6948     );
6949     for (; i < NumElts; ++i) {
6950       SDValue V = Op.getOperand(i);
6951       if (V.isUndef())
6952         continue;
6953       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
6954       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
6955     }
6956     return Vec;
6957   }
6958
6959   DEBUG(dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
6960                   "better alternative\n");
6961   return SDValue();
6962 }
6963
6964 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
6965                                                       SelectionDAG &DAG) const {
6966   assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
6967
6968   // Check for non-constant or out of range lane.
6969   EVT VT = Op.getOperand(0).getValueType();
6970   ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6971   if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
6972     return SDValue();
6973
6974
6975   // Insertion/extraction are legal for V128 types.
6976   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
6977       VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
6978       VT == MVT::v8f16)
6979     return Op;
6980
6981   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
6982       VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
6983     return SDValue();
6984
6985   // For V64 types, we perform insertion by expanding the value
6986   // to a V128 type and perform the insertion on that.
6987   SDLoc DL(Op);
6988   SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
6989   EVT WideTy = WideVec.getValueType();
6990
6991   SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
6992                              Op.getOperand(1), Op.getOperand(2));
6993   // Re-narrow the resultant vector.
6994   return NarrowVector(Node, DAG);
6995 }
6996
6997 SDValue
6998 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
6999                                                SelectionDAG &DAG) const {
7000   assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
7001
7002   // Check for non-constant or out of range lane.
7003   EVT VT = Op.getOperand(0).getValueType();
7004   ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
7005   if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
7006     return SDValue();
7007
7008
7009   // Insertion/extraction are legal for V128 types.
7010   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
7011       VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
7012       VT == MVT::v8f16)
7013     return Op;
7014
7015   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
7016       VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
7017     return SDValue();
7018
7019   // For V64 types, we perform extraction by expanding the value
7020   // to a V128 type and perform the extraction on that.
7021   SDLoc DL(Op);
7022   SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
7023   EVT WideTy = WideVec.getValueType();
7024
7025   EVT ExtrTy = WideTy.getVectorElementType();
7026   if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
7027     ExtrTy = MVT::i32;
7028
7029   // For extractions, we just return the result directly.
7030   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
7031                      Op.getOperand(1));
7032 }
7033
7034 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
7035                                                       SelectionDAG &DAG) const {
7036   EVT VT = Op.getOperand(0).getValueType();
7037   SDLoc dl(Op);
7038   // Just in case...
7039   if (!VT.isVector())
7040     return SDValue();
7041
7042   ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
7043   if (!Cst)
7044     return SDValue();
7045   unsigned Val = Cst->getZExtValue();
7046
7047   unsigned Size = Op.getValueSizeInBits();
7048
7049   // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
7050   if (Val == 0)
7051     return Op;
7052
7053   // If this is extracting the upper 64-bits of a 128-bit vector, we match
7054   // that directly.
7055   if (Size == 64 && Val * VT.getScalarSizeInBits() == 64)
7056     return Op;
7057
7058   return SDValue();
7059 }
7060
7061 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
7062   if (VT.getVectorNumElements() == 4 &&
7063       (VT.is128BitVector() || VT.is64BitVector())) {
7064     unsigned PFIndexes[4];
7065     for (unsigned i = 0; i != 4; ++i) {
7066       if (M[i] < 0)
7067         PFIndexes[i] = 8;
7068       else
7069         PFIndexes[i] = M[i];
7070     }
7071
7072     // Compute the index in the perfect shuffle table.
7073     unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
7074                             PFIndexes[2] * 9 + PFIndexes[3];
7075     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
7076     unsigned Cost = (PFEntry >> 30);
7077
7078     if (Cost <= 4)
7079       return true;
7080   }
7081
7082   bool DummyBool;
7083   int DummyInt;
7084   unsigned DummyUnsigned;
7085
7086   return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
7087           isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
7088           isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
7089           // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
7090           isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
7091           isZIPMask(M, VT, DummyUnsigned) ||
7092           isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
7093           isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
7094           isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
7095           isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
7096           isConcatMask(M, VT, VT.getSizeInBits() == 128));
7097 }
7098
7099 /// getVShiftImm - Check if this is a valid build_vector for the immediate
7100 /// operand of a vector shift operation, where all the elements of the
7101 /// build_vector must have the same constant integer value.
7102 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
7103   // Ignore bit_converts.
7104   while (Op.getOpcode() == ISD::BITCAST)
7105     Op = Op.getOperand(0);
7106   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
7107   APInt SplatBits, SplatUndef;
7108   unsigned SplatBitSize;
7109   bool HasAnyUndefs;
7110   if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
7111                                     HasAnyUndefs, ElementBits) ||
7112       SplatBitSize > ElementBits)
7113     return false;
7114   Cnt = SplatBits.getSExtValue();
7115   return true;
7116 }
7117
7118 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
7119 /// operand of a vector shift left operation.  That value must be in the range:
7120 ///   0 <= Value < ElementBits for a left shift; or
7121 ///   0 <= Value <= ElementBits for a long left shift.
7122 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
7123   assert(VT.isVector() && "vector shift count is not a vector type");
7124   int64_t ElementBits = VT.getScalarSizeInBits();
7125   if (!getVShiftImm(Op, ElementBits, Cnt))
7126     return false;
7127   return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
7128 }
7129
7130 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
7131 /// operand of a vector shift right operation. The value must be in the range:
7132 ///   1 <= Value <= ElementBits for a right shift; or
7133 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
7134   assert(VT.isVector() && "vector shift count is not a vector type");
7135   int64_t ElementBits = VT.getScalarSizeInBits();
7136   if (!getVShiftImm(Op, ElementBits, Cnt))
7137     return false;
7138   return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
7139 }
7140
7141 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
7142                                                       SelectionDAG &DAG) const {
7143   EVT VT = Op.getValueType();
7144   SDLoc DL(Op);
7145   int64_t Cnt;
7146
7147   if (!Op.getOperand(1).getValueType().isVector())
7148     return Op;
7149   unsigned EltSize = VT.getScalarSizeInBits();
7150
7151   switch (Op.getOpcode()) {
7152   default:
7153     llvm_unreachable("unexpected shift opcode");
7154
7155   case ISD::SHL:
7156     if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
7157       return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
7158                          DAG.getConstant(Cnt, DL, MVT::i32));
7159     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
7160                        DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
7161                                        MVT::i32),
7162                        Op.getOperand(0), Op.getOperand(1));
7163   case ISD::SRA:
7164   case ISD::SRL:
7165     // Right shift immediate
7166     if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
7167       unsigned Opc =
7168           (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
7169       return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
7170                          DAG.getConstant(Cnt, DL, MVT::i32));
7171     }
7172
7173     // Right shift register.  Note, there is not a shift right register
7174     // instruction, but the shift left register instruction takes a signed
7175     // value, where negative numbers specify a right shift.
7176     unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
7177                                                 : Intrinsic::aarch64_neon_ushl;
7178     // negate the shift amount
7179     SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
7180     SDValue NegShiftLeft =
7181         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
7182                     DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
7183                     NegShift);
7184     return NegShiftLeft;
7185   }
7186
7187   return SDValue();
7188 }
7189
7190 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
7191                                     AArch64CC::CondCode CC, bool NoNans, EVT VT,
7192                                     const SDLoc &dl, SelectionDAG &DAG) {
7193   EVT SrcVT = LHS.getValueType();
7194   assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
7195          "function only supposed to emit natural comparisons");
7196
7197   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
7198   APInt CnstBits(VT.getSizeInBits(), 0);
7199   APInt UndefBits(VT.getSizeInBits(), 0);
7200   bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
7201   bool IsZero = IsCnst && (CnstBits == 0);
7202
7203   if (SrcVT.getVectorElementType().isFloatingPoint()) {
7204     switch (CC) {
7205     default:
7206       return SDValue();
7207     case AArch64CC::NE: {
7208       SDValue Fcmeq;
7209       if (IsZero)
7210         Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
7211       else
7212         Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
7213       return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
7214     }
7215     case AArch64CC::EQ:
7216       if (IsZero)
7217         return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
7218       return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
7219     case AArch64CC::GE:
7220       if (IsZero)
7221         return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
7222       return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
7223     case AArch64CC::GT:
7224       if (IsZero)
7225         return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
7226       return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
7227     case AArch64CC::LS:
7228       if (IsZero)
7229         return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
7230       return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
7231     case AArch64CC::LT:
7232       if (!NoNans)
7233         return SDValue();
7234       // If we ignore NaNs then we can use to the MI implementation.
7235       LLVM_FALLTHROUGH;
7236     case AArch64CC::MI:
7237       if (IsZero)
7238         return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
7239       return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
7240     }
7241   }
7242
7243   switch (CC) {
7244   default:
7245     return SDValue();
7246   case AArch64CC::NE: {
7247     SDValue Cmeq;
7248     if (IsZero)
7249       Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
7250     else
7251       Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
7252     return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
7253   }
7254   case AArch64CC::EQ:
7255     if (IsZero)
7256       return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
7257     return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
7258   case AArch64CC::GE:
7259     if (IsZero)
7260       return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
7261     return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
7262   case AArch64CC::GT:
7263     if (IsZero)
7264       return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
7265     return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
7266   case AArch64CC::LE:
7267     if (IsZero)
7268       return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
7269     return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
7270   case AArch64CC::LS:
7271     return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
7272   case AArch64CC::LO:
7273     return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
7274   case AArch64CC::LT:
7275     if (IsZero)
7276       return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
7277     return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
7278   case AArch64CC::HI:
7279     return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
7280   case AArch64CC::HS:
7281     return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
7282   }
7283 }
7284
7285 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
7286                                            SelectionDAG &DAG) const {
7287   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7288   SDValue LHS = Op.getOperand(0);
7289   SDValue RHS = Op.getOperand(1);
7290   EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
7291   SDLoc dl(Op);
7292
7293   if (LHS.getValueType().getVectorElementType().isInteger()) {
7294     assert(LHS.getValueType() == RHS.getValueType());
7295     AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
7296     SDValue Cmp =
7297         EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
7298     return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
7299   }
7300
7301   const bool FullFP16 =
7302     static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
7303
7304   // Make v4f16 (only) fcmp operations utilise vector instructions
7305   // v8f16 support will be a litle more complicated
7306   if (LHS.getValueType().getVectorElementType() == MVT::f16) {
7307     if (!FullFP16 && LHS.getValueType().getVectorNumElements() == 4) {
7308       LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
7309       RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
7310       SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
7311       DAG.ReplaceAllUsesWith(Op, NewSetcc);
7312       CmpVT = MVT::v4i32;
7313     } else
7314       return SDValue();
7315   }
7316
7317   assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
7318          LHS.getValueType().getVectorElementType() == MVT::f64);
7319
7320   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
7321   // clean.  Some of them require two branches to implement.
7322   AArch64CC::CondCode CC1, CC2;
7323   bool ShouldInvert;
7324   changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
7325
7326   bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
7327   SDValue Cmp =
7328       EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
7329   if (!Cmp.getNode())
7330     return SDValue();
7331
7332   if (CC2 != AArch64CC::AL) {
7333     SDValue Cmp2 =
7334         EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
7335     if (!Cmp2.getNode())
7336       return SDValue();
7337
7338     Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
7339   }
7340
7341   Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
7342
7343   if (ShouldInvert)
7344     return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
7345
7346   return Cmp;
7347 }
7348
7349 static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
7350                                   SelectionDAG &DAG) {
7351   SDValue VecOp = ScalarOp.getOperand(0);
7352   auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
7353   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
7354                      DAG.getConstant(0, DL, MVT::i64));
7355 }
7356
7357 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
7358                                               SelectionDAG &DAG) const {
7359   SDLoc dl(Op);
7360   switch (Op.getOpcode()) {
7361   case ISD::VECREDUCE_ADD:
7362     return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
7363   case ISD::VECREDUCE_SMAX:
7364     return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
7365   case ISD::VECREDUCE_SMIN:
7366     return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
7367   case ISD::VECREDUCE_UMAX:
7368     return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
7369   case ISD::VECREDUCE_UMIN:
7370     return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
7371   case ISD::VECREDUCE_FMAX: {
7372     assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
7373     return DAG.getNode(
7374         ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
7375         DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
7376         Op.getOperand(0));
7377   }
7378   case ISD::VECREDUCE_FMIN: {
7379     assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
7380     return DAG.getNode(
7381         ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
7382         DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
7383         Op.getOperand(0));
7384   }
7385   default:
7386     llvm_unreachable("Unhandled reduction");
7387   }
7388 }
7389
7390 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
7391                                                     SelectionDAG &DAG) const {
7392   auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
7393   if (!Subtarget.hasLSE())
7394     return SDValue();
7395
7396   // LSE has an atomic load-add instruction, but not a load-sub.
7397   SDLoc dl(Op);
7398   MVT VT = Op.getSimpleValueType();
7399   SDValue RHS = Op.getOperand(2);
7400   AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
7401   RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
7402   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
7403                        Op.getOperand(0), Op.getOperand(1), RHS,
7404                        AN->getMemOperand());
7405 }
7406
7407 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
7408                                                     SelectionDAG &DAG) const {
7409   auto &Subtarget = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
7410   if (!Subtarget.hasLSE())
7411     return SDValue();
7412
7413   // LSE has an atomic load-clear instruction, but not a load-and.
7414   SDLoc dl(Op);
7415   MVT VT = Op.getSimpleValueType();
7416   SDValue RHS = Op.getOperand(2);
7417   AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
7418   RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
7419   return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
7420                        Op.getOperand(0), Op.getOperand(1), RHS,
7421                        AN->getMemOperand());
7422 }
7423
7424 SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
7425     SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
7426   SDLoc dl(Op);
7427   EVT PtrVT = getPointerTy(DAG.getDataLayout());
7428   SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);
7429
7430   const uint32_t *Mask =
7431       Subtarget->getRegisterInfo()->getWindowsStackProbePreservedMask();
7432
7433   Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
7434                      DAG.getConstant(4, dl, MVT::i64));
7435   Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
7436   Chain =
7437       DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
7438                   Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
7439                   DAG.getRegisterMask(Mask), Chain.getValue(1));
7440   // To match the actual intent better, we should read the output from X15 here
7441   // again (instead of potentially spilling it to the stack), but rereading Size
7442   // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
7443   // here.
7444
7445   Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
7446                      DAG.getConstant(4, dl, MVT::i64));
7447   return Chain;
7448 }
7449
7450 SDValue
7451 AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
7452                                                SelectionDAG &DAG) const {
7453   assert(Subtarget->isTargetWindows() &&
7454          "Only Windows alloca probing supported");
7455   SDLoc dl(Op);
7456   // Get the inputs.
7457   SDNode *Node = Op.getNode();
7458   SDValue Chain = Op.getOperand(0);
7459   SDValue Size = Op.getOperand(1);
7460   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
7461   EVT VT = Node->getValueType(0);
7462
7463   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
7464
7465   Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
7466
7467   SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
7468   Chain = SP.getValue(1);
7469   SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
7470   Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
7471
7472   if (Align) {
7473     SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
7474                      DAG.getConstant(-(uint64_t)Align, dl, VT));
7475     Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
7476   }
7477
7478   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
7479                              DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
7480
7481   SDValue Ops[2] = {SP, Chain};
7482   return DAG.getMergeValues(Ops, dl);
7483 }
7484
7485 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
7486 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
7487 /// specified in the intrinsic calls.
7488 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
7489                                                const CallInst &I,
7490                                                MachineFunction &MF,
7491                                                unsigned Intrinsic) const {
7492   auto &DL = I.getModule()->getDataLayout();
7493   switch (Intrinsic) {
7494   case Intrinsic::aarch64_neon_ld2:
7495   case Intrinsic::aarch64_neon_ld3:
7496   case Intrinsic::aarch64_neon_ld4:
7497   case Intrinsic::aarch64_neon_ld1x2:
7498   case Intrinsic::aarch64_neon_ld1x3:
7499   case Intrinsic::aarch64_neon_ld1x4:
7500   case Intrinsic::aarch64_neon_ld2lane:
7501   case Intrinsic::aarch64_neon_ld3lane:
7502   case Intrinsic::aarch64_neon_ld4lane:
7503   case Intrinsic::aarch64_neon_ld2r:
7504   case Intrinsic::aarch64_neon_ld3r:
7505   case Intrinsic::aarch64_neon_ld4r: {
7506     Info.opc = ISD::INTRINSIC_W_CHAIN;
7507     // Conservatively set memVT to the entire set of vectors loaded.
7508     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
7509     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
7510     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
7511     Info.offset = 0;
7512     Info.align = 0;
7513     // volatile loads with NEON intrinsics not supported
7514     Info.flags = MachineMemOperand::MOLoad;
7515     return true;
7516   }
7517   case Intrinsic::aarch64_neon_st2:
7518   case Intrinsic::aarch64_neon_st3:
7519   case Intrinsic::aarch64_neon_st4:
7520   case Intrinsic::aarch64_neon_st1x2:
7521   case Intrinsic::aarch64_neon_st1x3:
7522   case Intrinsic::aarch64_neon_st1x4:
7523   case Intrinsic::aarch64_neon_st2lane:
7524   case Intrinsic::aarch64_neon_st3lane:
7525   case Intrinsic::aarch64_neon_st4lane: {
7526     Info.opc = ISD::INTRINSIC_VOID;
7527     // Conservatively set memVT to the entire set of vectors stored.
7528     unsigned NumElts = 0;
7529     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
7530       Type *ArgTy = I.getArgOperand(ArgI)->getType();
7531       if (!ArgTy->isVectorTy())
7532         break;
7533       NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
7534     }
7535     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
7536     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
7537     Info.offset = 0;
7538     Info.align = 0;
7539     // volatile stores with NEON intrinsics not supported
7540     Info.flags = MachineMemOperand::MOStore;
7541     return true;
7542   }
7543   case Intrinsic::aarch64_ldaxr:
7544   case Intrinsic::aarch64_ldxr: {
7545     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
7546     Info.opc = ISD::INTRINSIC_W_CHAIN;
7547     Info.memVT = MVT::getVT(PtrTy->getElementType());
7548     Info.ptrVal = I.getArgOperand(0);
7549     Info.offset = 0;
7550     Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
7551     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
7552     return true;
7553   }
7554   case Intrinsic::aarch64_stlxr:
7555   case Intrinsic::aarch64_stxr: {
7556     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
7557     Info.opc = ISD::INTRINSIC_W_CHAIN;
7558     Info.memVT = MVT::getVT(PtrTy->getElementType());
7559     Info.ptrVal = I.getArgOperand(1);
7560     Info.offset = 0;
7561     Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
7562     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
7563     return true;
7564   }
7565   case Intrinsic::aarch64_ldaxp:
7566   case Intrinsic::aarch64_ldxp:
7567     Info.opc = ISD::INTRINSIC_W_CHAIN;
7568     Info.memVT = MVT::i128;
7569     Info.ptrVal = I.getArgOperand(0);
7570     Info.offset = 0;
7571     Info.align = 16;
7572     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
7573     return true;
7574   case Intrinsic::aarch64_stlxp:
7575   case Intrinsic::aarch64_stxp:
7576     Info.opc = ISD::INTRINSIC_W_CHAIN;
7577     Info.memVT = MVT::i128;
7578     Info.ptrVal = I.getArgOperand(2);
7579     Info.offset = 0;
7580     Info.align = 16;
7581     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
7582     return true;
7583   default:
7584     break;
7585   }
7586
7587   return false;
7588 }
7589
7590 // Truncations from 64-bit GPR to 32-bit GPR is free.
7591 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
7592   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
7593     return false;
7594   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
7595   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
7596   return NumBits1 > NumBits2;
7597 }
7598 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
7599   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
7600     return false;
7601   unsigned NumBits1 = VT1.getSizeInBits();
7602   unsigned NumBits2 = VT2.getSizeInBits();
7603   return NumBits1 > NumBits2;
7604 }
7605
7606 /// Check if it is profitable to hoist instruction in then/else to if.
7607 /// Not profitable if I and it's user can form a FMA instruction
7608 /// because we prefer FMSUB/FMADD.
7609 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
7610   if (I->getOpcode() != Instruction::FMul)
7611     return true;
7612
7613   if (!I->hasOneUse())
7614     return true;
7615
7616   Instruction *User = I->user_back();
7617
7618   if (User &&
7619       !(User->getOpcode() == Instruction::FSub ||
7620         User->getOpcode() == Instruction::FAdd))
7621     return true;
7622
7623   const TargetOptions &Options = getTargetMachine().Options;
7624   const DataLayout &DL = I->getModule()->getDataLayout();
7625   EVT VT = getValueType(DL, User->getOperand(0)->getType());
7626
7627   return !(isFMAFasterThanFMulAndFAdd(VT) &&
7628            isOperationLegalOrCustom(ISD::FMA, VT) &&
7629            (Options.AllowFPOpFusion == FPOpFusion::Fast ||
7630             Options.UnsafeFPMath));
7631 }
7632
7633 // All 32-bit GPR operations implicitly zero the high-half of the corresponding
7634 // 64-bit GPR.
7635 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
7636   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
7637     return false;
7638   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
7639   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
7640   return NumBits1 == 32 && NumBits2 == 64;
7641 }
7642 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
7643   if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
7644     return false;
7645   unsigned NumBits1 = VT1.getSizeInBits();
7646   unsigned NumBits2 = VT2.getSizeInBits();
7647   return NumBits1 == 32 && NumBits2 == 64;
7648 }
7649
7650 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
7651   EVT VT1 = Val.getValueType();
7652   if (isZExtFree(VT1, VT2)) {
7653     return true;
7654   }
7655
7656   if (Val.getOpcode() != ISD::LOAD)
7657     return false;
7658
7659   // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
7660   return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
7661           VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
7662           VT1.getSizeInBits() <= 32);
7663 }
7664
7665 bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
7666   if (isa<FPExtInst>(Ext))
7667     return false;
7668
7669   // Vector types are not free.
7670   if (Ext->getType()->isVectorTy())
7671     return false;
7672
7673   for (const Use &U : Ext->uses()) {
7674     // The extension is free if we can fold it with a left shift in an
7675     // addressing mode or an arithmetic operation: add, sub, and cmp.
7676
7677     // Is there a shift?
7678     const Instruction *Instr = cast<Instruction>(U.getUser());
7679
7680     // Is this a constant shift?
7681     switch (Instr->getOpcode()) {
7682     case Instruction::Shl:
7683       if (!isa<ConstantInt>(Instr->getOperand(1)))
7684         return false;
7685       break;
7686     case Instruction::GetElementPtr: {
7687       gep_type_iterator GTI = gep_type_begin(Instr);
7688       auto &DL = Ext->getModule()->getDataLayout();
7689       std::advance(GTI, U.getOperandNo()-1);
7690       Type *IdxTy = GTI.getIndexedType();
7691       // This extension will end up with a shift because of the scaling factor.
7692       // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
7693       // Get the shift amount based on the scaling factor:
7694       // log2(sizeof(IdxTy)) - log2(8).
7695       uint64_t ShiftAmt =
7696           countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3;
7697       // Is the constant foldable in the shift of the addressing mode?
7698       // I.e., shift amount is between 1 and 4 inclusive.
7699       if (ShiftAmt == 0 || ShiftAmt > 4)
7700         return false;
7701       break;
7702     }
7703     case Instruction::Trunc:
7704       // Check if this is a noop.
7705       // trunc(sext ty1 to ty2) to ty1.
7706       if (Instr->getType() == Ext->getOperand(0)->getType())
7707         continue;
7708       LLVM_FALLTHROUGH;
7709     default:
7710       return false;
7711     }
7712
7713     // At this point we can use the bfm family, so this extension is free
7714     // for that use.
7715   }
7716   return true;
7717 }
7718
7719 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
7720                                           unsigned &RequiredAligment) const {
7721   if (!LoadedType.isSimple() ||
7722       (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
7723     return false;
7724   // Cyclone supports unaligned accesses.
7725   RequiredAligment = 0;
7726   unsigned NumBits = LoadedType.getSizeInBits();
7727   return NumBits == 32 || NumBits == 64;
7728 }
7729
7730 /// A helper function for determining the number of interleaved accesses we
7731 /// will generate when lowering accesses of the given type.
7732 unsigned
7733 AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
7734                                                  const DataLayout &DL) const {
7735   return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
7736 }
7737
7738 MachineMemOperand::Flags
7739 AArch64TargetLowering::getMMOFlags(const Instruction &I) const {
7740   if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
7741       I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
7742     return MOStridedAccess;
7743   return MachineMemOperand::MONone;
7744 }
7745
7746 bool AArch64TargetLowering::isLegalInterleavedAccessType(
7747     VectorType *VecTy, const DataLayout &DL) const {
7748
7749   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
7750   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
7751
7752   // Ensure the number of vector elements is greater than 1.
7753   if (VecTy->getNumElements() < 2)
7754     return false;
7755
7756   // Ensure the element type is legal.
7757   if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
7758     return false;
7759
7760   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
7761   // 128 will be split into multiple interleaved accesses.
7762   return VecSize == 64 || VecSize % 128 == 0;
7763 }
7764
7765 /// \brief Lower an interleaved load into a ldN intrinsic.
7766 ///
7767 /// E.g. Lower an interleaved load (Factor = 2):
7768 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
7769 ///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
7770 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
7771 ///
7772 ///      Into:
7773 ///        %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
7774 ///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
7775 ///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
7776 bool AArch64TargetLowering::lowerInterleavedLoad(
7777     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
7778     ArrayRef<unsigned> Indices, unsigned Factor) const {
7779   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
7780          "Invalid interleave factor");
7781   assert(!Shuffles.empty() && "Empty shufflevector input");
7782   assert(Shuffles.size() == Indices.size() &&
7783          "Unmatched number of shufflevectors and indices");
7784
7785   const DataLayout &DL = LI->getModule()->getDataLayout();
7786
7787   VectorType *VecTy = Shuffles[0]->getType();
7788
7789   // Skip if we do not have NEON and skip illegal vector types. We can
7790   // "legalize" wide vector types into multiple interleaved accesses as long as
7791   // the vector types are divisible by 128.
7792   if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
7793     return false;
7794
7795   unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
7796
7797   // A pointer vector can not be the return type of the ldN intrinsics. Need to
7798   // load integer vectors first and then convert to pointer vectors.
7799   Type *EltTy = VecTy->getVectorElementType();
7800   if (EltTy->isPointerTy())
7801     VecTy =
7802         VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
7803
7804   IRBuilder<> Builder(LI);
7805
7806   // The base address of the load.
7807   Value *BaseAddr = LI->getPointerOperand();
7808
7809   if (NumLoads > 1) {
7810     // If we're going to generate more than one load, reset the sub-vector type
7811     // to something legal.
7812     VecTy = VectorType::get(VecTy->getVectorElementType(),
7813                             VecTy->getVectorNumElements() / NumLoads);
7814
7815     // We will compute the pointer operand of each load from the original base
7816     // address using GEPs. Cast the base address to a pointer to the scalar
7817     // element type.
7818     BaseAddr = Builder.CreateBitCast(
7819         BaseAddr, VecTy->getVectorElementType()->getPointerTo(
7820                       LI->getPointerAddressSpace()));
7821   }
7822
7823   Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
7824   Type *Tys[2] = {VecTy, PtrTy};
7825   static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
7826                                             Intrinsic::aarch64_neon_ld3,
7827                                             Intrinsic::aarch64_neon_ld4};
7828   Function *LdNFunc =
7829       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
7830
7831   // Holds sub-vectors extracted from the load intrinsic return values. The
7832   // sub-vectors are associated with the shufflevector instructions they will
7833   // replace.
7834   DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
7835
7836   for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
7837
7838     // If we're generating more than one load, compute the base address of
7839     // subsequent loads as an offset from the previous.
7840     if (LoadCount > 0)
7841       BaseAddr = Builder.CreateConstGEP1_32(
7842           BaseAddr, VecTy->getVectorNumElements() * Factor);
7843
7844     CallInst *LdN = Builder.CreateCall(
7845         LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
7846
7847     // Extract and store the sub-vectors returned by the load intrinsic.
7848     for (unsigned i = 0; i < Shuffles.size(); i++) {
7849       ShuffleVectorInst *SVI = Shuffles[i];
7850       unsigned Index = Indices[i];
7851
7852       Value *SubVec = Builder.CreateExtractValue(LdN, Index);
7853
7854       // Convert the integer vector to pointer vector if the element is pointer.
7855       if (EltTy->isPointerTy())
7856         SubVec = Builder.CreateIntToPtr(
7857             SubVec, VectorType::get(SVI->getType()->getVectorElementType(),
7858                                     VecTy->getVectorNumElements()));
7859       SubVecs[SVI].push_back(SubVec);
7860     }
7861   }
7862
7863   // Replace uses of the shufflevector instructions with the sub-vectors
7864   // returned by the load intrinsic. If a shufflevector instruction is
7865   // associated with more than one sub-vector, those sub-vectors will be
7866   // concatenated into a single wide vector.
7867   for (ShuffleVectorInst *SVI : Shuffles) {
7868     auto &SubVec = SubVecs[SVI];
7869     auto *WideVec =
7870         SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
7871     SVI->replaceAllUsesWith(WideVec);
7872   }
7873
7874   return true;
7875 }
7876
7877 /// \brief Lower an interleaved store into a stN intrinsic.
7878 ///
7879 /// E.g. Lower an interleaved store (Factor = 3):
7880 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
7881 ///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
7882 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
7883 ///
7884 ///      Into:
7885 ///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
7886 ///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
7887 ///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
7888 ///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
7889 ///
7890 /// Note that the new shufflevectors will be removed and we'll only generate one
7891 /// st3 instruction in CodeGen.
7892 ///
7893 /// Example for a more general valid mask (Factor 3). Lower:
7894 ///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
7895 ///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
7896 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
7897 ///
7898 ///      Into:
7899 ///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
7900 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
7901 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
7902 ///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
7903 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
7904                                                   ShuffleVectorInst *SVI,
7905                                                   unsigned Factor) const {
7906   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
7907          "Invalid interleave factor");
7908
7909   VectorType *VecTy = SVI->getType();
7910   assert(VecTy->getVectorNumElements() % Factor == 0 &&
7911          "Invalid interleaved store");
7912
7913   unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
7914   Type *EltTy = VecTy->getVectorElementType();
7915   VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
7916
7917   const DataLayout &DL = SI->getModule()->getDataLayout();
7918
7919   // Skip if we do not have NEON and skip illegal vector types. We can
7920   // "legalize" wide vector types into multiple interleaved accesses as long as
7921   // the vector types are divisible by 128.
7922   if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
7923     return false;
7924
7925   unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
7926
7927   Value *Op0 = SVI->getOperand(0);
7928   Value *Op1 = SVI->getOperand(1);
7929   IRBuilder<> Builder(SI);
7930
7931   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
7932   // vectors to integer vectors.
7933   if (EltTy->isPointerTy()) {
7934     Type *IntTy = DL.getIntPtrType(EltTy);
7935     unsigned NumOpElts =
7936         dyn_cast<VectorType>(Op0->getType())->getVectorNumElements();
7937
7938     // Convert to the corresponding integer vector.
7939     Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
7940     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
7941     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
7942
7943     SubVecTy = VectorType::get(IntTy, LaneLen);
7944   }
7945
7946   // The base address of the store.
7947   Value *BaseAddr = SI->getPointerOperand();
7948
7949   if (NumStores > 1) {
7950     // If we're going to generate more than one store, reset the lane length
7951     // and sub-vector type to something legal.
7952     LaneLen /= NumStores;
7953     SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
7954
7955     // We will compute the pointer operand of each store from the original base
7956     // address using GEPs. Cast the base address to a pointer to the scalar
7957     // element type.
7958     BaseAddr = Builder.CreateBitCast(
7959         BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
7960                       SI->getPointerAddressSpace()));
7961   }
7962
7963   auto Mask = SVI->getShuffleMask();
7964
7965   Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
7966   Type *Tys[2] = {SubVecTy, PtrTy};
7967   static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
7968                                              Intrinsic::aarch64_neon_st3,
7969                                              Intrinsic::aarch64_neon_st4};
7970   Function *StNFunc =
7971       Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
7972
7973   for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
7974
7975     SmallVector<Value *, 5> Ops;
7976
7977     // Split the shufflevector operands into sub vectors for the new stN call.
7978     for (unsigned i = 0; i < Factor; i++) {
7979       unsigned IdxI = StoreCount * LaneLen * Factor + i;
7980       if (Mask[IdxI] >= 0) {
7981         Ops.push_back(Builder.CreateShuffleVector(
7982             Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
7983       } else {
7984         unsigned StartMask = 0;
7985         for (unsigned j = 1; j < LaneLen; j++) {
7986           unsigned IdxJ = StoreCount * LaneLen * Factor + j;
7987           if (Mask[IdxJ * Factor + IdxI] >= 0) {
7988             StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
7989             break;
7990           }
7991         }
7992         // Note: Filling undef gaps with random elements is ok, since
7993         // those elements were being written anyway (with undefs).
7994         // In the case of all undefs we're defaulting to using elems from 0
7995         // Note: StartMask cannot be negative, it's checked in
7996         // isReInterleaveMask
7997         Ops.push_back(Builder.CreateShuffleVector(
7998             Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
7999       }
8000     }
8001
8002     // If we generating more than one store, we compute the base address of
8003     // subsequent stores as an offset from the previous.
8004     if (StoreCount > 0)
8005       BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
8006
8007     Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
8008     Builder.CreateCall(StNFunc, Ops);
8009   }
8010   return true;
8011 }
8012
8013 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
8014                        unsigned AlignCheck) {
8015   return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
8016           (DstAlign == 0 || DstAlign % AlignCheck == 0));
8017 }
8018
8019 EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
8020                                                unsigned SrcAlign, bool IsMemset,
8021                                                bool ZeroMemset,
8022                                                bool MemcpyStrSrc,
8023                                                MachineFunction &MF) const {
8024   // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
8025   // instruction to materialize the v2i64 zero and one store (with restrictive
8026   // addressing mode). Just do two i64 store of zero-registers.
8027   bool Fast;
8028   const Function &F = MF.getFunction();
8029   if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
8030       !F.hasFnAttribute(Attribute::NoImplicitFloat) &&
8031       (memOpAlign(SrcAlign, DstAlign, 16) ||
8032        (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
8033     return MVT::f128;
8034
8035   if (Size >= 8 &&
8036       (memOpAlign(SrcAlign, DstAlign, 8) ||
8037        (allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast)))
8038     return MVT::i64;
8039
8040   if (Size >= 4 &&
8041       (memOpAlign(SrcAlign, DstAlign, 4) ||
8042        (allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast)))
8043     return MVT::i32;
8044
8045   return MVT::Other;
8046 }
8047
8048 // 12-bit optionally shifted immediates are legal for adds.
8049 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
8050   if (Immed == std::numeric_limits<int64_t>::min()) {
8051     DEBUG(dbgs() << "Illegal add imm " << Immed << ": avoid UB for INT64_MIN\n");
8052     return false;
8053   }
8054   // Same encoding for add/sub, just flip the sign.
8055   Immed = std::abs(Immed);
8056   bool IsLegal = ((Immed >> 12) == 0 ||
8057                   ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
8058   DEBUG(dbgs() << "Is " << Immed << " legal add imm: " <<
8059         (IsLegal ? "yes" : "no") << "\n");
8060   return IsLegal;
8061 }
8062
8063 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid
8064 // immediates is the same as for an add or a sub.
8065 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
8066   return isLegalAddImmediate(Immed);
8067 }
8068
8069 /// isLegalAddressingMode - Return true if the addressing mode represented
8070 /// by AM is legal for this target, for a load/store of the specified type.
8071 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
8072                                                   const AddrMode &AM, Type *Ty,
8073                                                   unsigned AS, Instruction *I) const {
8074   // AArch64 has five basic addressing modes:
8075   //  reg
8076   //  reg + 9-bit signed offset
8077   //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
8078   //  reg1 + reg2
8079   //  reg + SIZE_IN_BYTES * reg
8080
8081   // No global is ever allowed as a base.
8082   if (AM.BaseGV)
8083     return false;
8084
8085   // No reg+reg+imm addressing.
8086   if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
8087     return false;
8088
8089   // check reg + imm case:
8090   // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
8091   uint64_t NumBytes = 0;
8092   if (Ty->isSized()) {
8093     uint64_t NumBits = DL.getTypeSizeInBits(Ty);
8094     NumBytes = NumBits / 8;
8095     if (!isPowerOf2_64(NumBits))
8096       NumBytes = 0;
8097   }
8098
8099   if (!AM.Scale) {
8100     int64_t Offset = AM.BaseOffs;
8101
8102     // 9-bit signed offset
8103     if (isInt<9>(Offset))
8104       return true;
8105
8106     // 12-bit unsigned offset
8107     unsigned shift = Log2_64(NumBytes);
8108     if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
8109         // Must be a multiple of NumBytes (NumBytes is a power of 2)
8110         (Offset >> shift) << shift == Offset)
8111       return true;
8112     return false;
8113   }
8114
8115   // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
8116
8117   return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
8118 }
8119
8120 int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
8121                                                 const AddrMode &AM, Type *Ty,
8122                                                 unsigned AS) const {
8123   // Scaling factors are not free at all.
8124   // Operands                     | Rt Latency
8125   // -------------------------------------------
8126   // Rt, [Xn, Xm]                 | 4
8127   // -------------------------------------------
8128   // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
8129   // Rt, [Xn, Wm, <extend> #imm]  |
8130   if (isLegalAddressingMode(DL, AM, Ty, AS))
8131     // Scale represents reg2 * scale, thus account for 1 if
8132     // it is not equal to 0 or 1.
8133     return AM.Scale != 0 && AM.Scale != 1;
8134   return -1;
8135 }
8136
8137 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
8138   VT = VT.getScalarType();
8139
8140   if (!VT.isSimple())
8141     return false;
8142
8143   switch (VT.getSimpleVT().SimpleTy) {
8144   case MVT::f32:
8145   case MVT::f64:
8146     return true;
8147   default:
8148     break;
8149   }
8150
8151   return false;
8152 }
8153
8154 const MCPhysReg *
8155 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
8156   // LR is a callee-save register, but we must treat it as clobbered by any call
8157   // site. Hence we include LR in the scratch registers, which are in turn added
8158   // as implicit-defs for stackmaps and patchpoints.
8159   static const MCPhysReg ScratchRegs[] = {
8160     AArch64::X16, AArch64::X17, AArch64::LR, 0
8161   };
8162   return ScratchRegs;
8163 }
8164
8165 bool
8166 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
8167   EVT VT = N->getValueType(0);
8168     // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
8169     // it with shift to let it be lowered to UBFX.
8170   if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
8171       isa<ConstantSDNode>(N->getOperand(1))) {
8172     uint64_t TruncMask = N->getConstantOperandVal(1);
8173     if (isMask_64(TruncMask) &&
8174       N->getOperand(0).getOpcode() == ISD::SRL &&
8175       isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
8176       return false;
8177   }
8178   return true;
8179 }
8180
8181 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
8182                                                               Type *Ty) const {
8183   assert(Ty->isIntegerTy());
8184
8185   unsigned BitSize = Ty->getPrimitiveSizeInBits();
8186   if (BitSize == 0)
8187     return false;
8188
8189   int64_t Val = Imm.getSExtValue();
8190   if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
8191     return true;
8192
8193   if ((int64_t)Val < 0)
8194     Val = ~Val;
8195   if (BitSize == 32)
8196     Val &= (1LL << 32) - 1;
8197
8198   unsigned LZ = countLeadingZeros((uint64_t)Val);
8199   unsigned Shift = (63 - LZ) / 16;
8200   // MOVZ is free so return true for one or fewer MOVK.
8201   return Shift < 3;
8202 }
8203
8204 /// Turn vector tests of the signbit in the form of:
8205 ///   xor (sra X, elt_size(X)-1), -1
8206 /// into:
8207 ///   cmge X, X, #0
8208 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
8209                                          const AArch64Subtarget *Subtarget) {
8210   EVT VT = N->getValueType(0);
8211   if (!Subtarget->hasNEON() || !VT.isVector())
8212     return SDValue();
8213
8214   // There must be a shift right algebraic before the xor, and the xor must be a
8215   // 'not' operation.
8216   SDValue Shift = N->getOperand(0);
8217   SDValue Ones = N->getOperand(1);
8218   if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
8219       !ISD::isBuildVectorAllOnes(Ones.getNode()))
8220     return SDValue();
8221
8222   // The shift should be smearing the sign bit across each vector element.
8223   auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
8224   EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
8225   if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
8226     return SDValue();
8227
8228   return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
8229 }
8230
8231 // Generate SUBS and CSEL for integer abs.
8232 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
8233   EVT VT = N->getValueType(0);
8234
8235   SDValue N0 = N->getOperand(0);
8236   SDValue N1 = N->getOperand(1);
8237   SDLoc DL(N);
8238
8239   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
8240   // and change it to SUB and CSEL.
8241   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
8242       N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
8243       N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
8244     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
8245       if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
8246         SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
8247                                   N0.getOperand(0));
8248         // Generate SUBS & CSEL.
8249         SDValue Cmp =
8250             DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
8251                         N0.getOperand(0), DAG.getConstant(0, DL, VT));
8252         return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
8253                            DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
8254                            SDValue(Cmp.getNode(), 1));
8255       }
8256   return SDValue();
8257 }
8258
8259 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
8260                                  TargetLowering::DAGCombinerInfo &DCI,
8261                                  const AArch64Subtarget *Subtarget) {
8262   if (DCI.isBeforeLegalizeOps())
8263     return SDValue();
8264
8265   if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
8266     return Cmp;
8267
8268   return performIntegerAbsCombine(N, DAG);
8269 }
8270
8271 SDValue
8272 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
8273                                      SelectionDAG &DAG,
8274                                      std::vector<SDNode *> *Created) const {
8275   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
8276   if (isIntDivCheap(N->getValueType(0), Attr))
8277     return SDValue(N,0); // Lower SDIV as SDIV
8278
8279   // fold (sdiv X, pow2)
8280   EVT VT = N->getValueType(0);
8281   if ((VT != MVT::i32 && VT != MVT::i64) ||
8282       !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
8283     return SDValue();
8284
8285   SDLoc DL(N);
8286   SDValue N0 = N->getOperand(0);
8287   unsigned Lg2 = Divisor.countTrailingZeros();
8288   SDValue Zero = DAG.getConstant(0, DL, VT);
8289   SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
8290
8291   // Add (N0 < 0) ? Pow2 - 1 : 0;
8292   SDValue CCVal;
8293   SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
8294   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
8295   SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
8296
8297   if (Created) {
8298     Created->push_back(Cmp.getNode());
8299     Created->push_back(Add.getNode());
8300     Created->push_back(CSel.getNode());
8301   }
8302
8303   // Divide by pow2.
8304   SDValue SRA =
8305       DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
8306
8307   // If we're dividing by a positive value, we're done.  Otherwise, we must
8308   // negate the result.
8309   if (Divisor.isNonNegative())
8310     return SRA;
8311
8312   if (Created)
8313     Created->push_back(SRA.getNode());
8314   return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
8315 }
8316
8317 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
8318                                  TargetLowering::DAGCombinerInfo &DCI,
8319                                  const AArch64Subtarget *Subtarget) {
8320   if (DCI.isBeforeLegalizeOps())
8321     return SDValue();
8322
8323   // The below optimizations require a constant RHS.
8324   if (!isa<ConstantSDNode>(N->getOperand(1)))
8325     return SDValue();
8326
8327   ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
8328   const APInt &ConstValue = C->getAPIntValue();
8329
8330   // Multiplication of a power of two plus/minus one can be done more
8331   // cheaply as as shift+add/sub. For now, this is true unilaterally. If
8332   // future CPUs have a cheaper MADD instruction, this may need to be
8333   // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
8334   // 64-bit is 5 cycles, so this is always a win.
8335   // More aggressively, some multiplications N0 * C can be lowered to
8336   // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
8337   // e.g. 6=3*2=(2+1)*2.
8338   // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
8339   // which equals to (1+2)*16-(1+2).
8340   SDValue N0 = N->getOperand(0);
8341   // TrailingZeroes is used to test if the mul can be lowered to
8342   // shift+add+shift.
8343   unsigned TrailingZeroes = ConstValue.countTrailingZeros();
8344   if (TrailingZeroes) {
8345     // Conservatively do not lower to shift+add+shift if the mul might be
8346     // folded into smul or umul.
8347     if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
8348                             isZeroExtended(N0.getNode(), DAG)))
8349       return SDValue();
8350     // Conservatively do not lower to shift+add+shift if the mul might be
8351     // folded into madd or msub.
8352     if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
8353                            N->use_begin()->getOpcode() == ISD::SUB))
8354       return SDValue();
8355   }
8356   // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
8357   // and shift+add+shift.
8358   APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
8359
8360   unsigned ShiftAmt, AddSubOpc;
8361   // Is the shifted value the LHS operand of the add/sub?
8362   bool ShiftValUseIsN0 = true;
8363   // Do we need to negate the result?
8364   bool NegateResult = false;
8365
8366   if (ConstValue.isNonNegative()) {
8367     // (mul x, 2^N + 1) => (add (shl x, N), x)
8368     // (mul x, 2^N - 1) => (sub (shl x, N), x)
8369     // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
8370     APInt SCVMinus1 = ShiftedConstValue - 1;
8371     APInt CVPlus1 = ConstValue + 1;
8372     if (SCVMinus1.isPowerOf2()) {
8373       ShiftAmt = SCVMinus1.logBase2();
8374       AddSubOpc = ISD::ADD;
8375     } else if (CVPlus1.isPowerOf2()) {
8376       ShiftAmt = CVPlus1.logBase2();
8377       AddSubOpc = ISD::SUB;
8378     } else
8379       return SDValue();
8380   } else {
8381     // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
8382     // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
8383     APInt CVNegPlus1 = -ConstValue + 1;
8384     APInt CVNegMinus1 = -ConstValue - 1;
8385     if (CVNegPlus1.isPowerOf2()) {
8386       ShiftAmt = CVNegPlus1.logBase2();
8387       AddSubOpc = ISD::SUB;
8388       ShiftValUseIsN0 = false;
8389     } else if (CVNegMinus1.isPowerOf2()) {
8390       ShiftAmt = CVNegMinus1.logBase2();
8391       AddSubOpc = ISD::ADD;
8392       NegateResult = true;
8393     } else
8394       return SDValue();
8395   }
8396
8397   SDLoc DL(N);
8398   EVT VT = N->getValueType(0);
8399   SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
8400                                    DAG.getConstant(ShiftAmt, DL, MVT::i64));
8401
8402   SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
8403   SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
8404   SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
8405   assert(!(NegateResult && TrailingZeroes) &&
8406          "NegateResult and TrailingZeroes cannot both be true for now.");
8407   // Negate the result.
8408   if (NegateResult)
8409     return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
8410   // Shift the result.
8411   if (TrailingZeroes)
8412     return DAG.getNode(ISD::SHL, DL, VT, Res,
8413                        DAG.getConstant(TrailingZeroes, DL, MVT::i64));
8414   return Res;
8415 }
8416
8417 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
8418                                                          SelectionDAG &DAG) {
8419   // Take advantage of vector comparisons producing 0 or -1 in each lane to
8420   // optimize away operation when it's from a constant.
8421   //
8422   // The general transformation is:
8423   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
8424   //       AND(VECTOR_CMP(x,y), constant2)
8425   //    constant2 = UNARYOP(constant)
8426
8427   // Early exit if this isn't a vector operation, the operand of the
8428   // unary operation isn't a bitwise AND, or if the sizes of the operations
8429   // aren't the same.
8430   EVT VT = N->getValueType(0);
8431   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
8432       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
8433       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
8434     return SDValue();
8435
8436   // Now check that the other operand of the AND is a constant. We could
8437   // make the transformation for non-constant splats as well, but it's unclear
8438   // that would be a benefit as it would not eliminate any operations, just
8439   // perform one more step in scalar code before moving to the vector unit.
8440   if (BuildVectorSDNode *BV =
8441           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
8442     // Bail out if the vector isn't a constant.
8443     if (!BV->isConstant())
8444       return SDValue();
8445
8446     // Everything checks out. Build up the new and improved node.
8447     SDLoc DL(N);
8448     EVT IntVT = BV->getValueType(0);
8449     // Create a new constant of the appropriate type for the transformed
8450     // DAG.
8451     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
8452     // The AND node needs bitcasts to/from an integer vector type around it.
8453     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
8454     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
8455                                  N->getOperand(0)->getOperand(0), MaskConst);
8456     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
8457     return Res;
8458   }
8459
8460   return SDValue();
8461 }
8462
8463 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
8464                                      const AArch64Subtarget *Subtarget) {
8465   // First try to optimize away the conversion when it's conditionally from
8466   // a constant. Vectors only.
8467   if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
8468     return Res;
8469
8470   EVT VT = N->getValueType(0);
8471   if (VT != MVT::f32 && VT != MVT::f64)
8472     return SDValue();
8473
8474   // Only optimize when the source and destination types have the same width.
8475   if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
8476     return SDValue();
8477
8478   // If the result of an integer load is only used by an integer-to-float
8479   // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
8480   // This eliminates an "integer-to-vector-move" UOP and improves throughput.
8481   SDValue N0 = N->getOperand(0);
8482   if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
8483       // Do not change the width of a volatile load.
8484       !cast<LoadSDNode>(N0)->isVolatile()) {
8485     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
8486     SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
8487                                LN0->getPointerInfo(), LN0->getAlignment(),
8488                                LN0->getMemOperand()->getFlags());
8489
8490     // Make sure successors of the original load stay after it by updating them
8491     // to use the new Chain.
8492     DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
8493
8494     unsigned Opcode =
8495         (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
8496     return DAG.getNode(Opcode, SDLoc(N), VT, Load);
8497   }
8498
8499   return SDValue();
8500 }
8501
8502 /// Fold a floating-point multiply by power of two into floating-point to
8503 /// fixed-point conversion.
8504 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
8505                                      TargetLowering::DAGCombinerInfo &DCI,
8506                                      const AArch64Subtarget *Subtarget) {
8507   if (!Subtarget->hasNEON())
8508     return SDValue();
8509
8510   SDValue Op = N->getOperand(0);
8511   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
8512       Op.getOpcode() != ISD::FMUL)
8513     return SDValue();
8514
8515   SDValue ConstVec = Op->getOperand(1);
8516   if (!isa<BuildVectorSDNode>(ConstVec))
8517     return SDValue();
8518
8519   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
8520   uint32_t FloatBits = FloatTy.getSizeInBits();
8521   if (FloatBits != 32 && FloatBits != 64)
8522     return SDValue();
8523
8524   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
8525   uint32_t IntBits = IntTy.getSizeInBits();
8526   if (IntBits != 16 && IntBits != 32 && IntBits != 64)
8527     return SDValue();
8528
8529   // Avoid conversions where iN is larger than the float (e.g., float -> i64).
8530   if (IntBits > FloatBits)
8531     return SDValue();
8532
8533   BitVector UndefElements;
8534   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
8535   int32_t Bits = IntBits == 64 ? 64 : 32;
8536   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
8537   if (C == -1 || C == 0 || C > Bits)
8538     return SDValue();
8539
8540   MVT ResTy;
8541   unsigned NumLanes = Op.getValueType().getVectorNumElements();
8542   switch (NumLanes) {
8543   default:
8544     return SDValue();
8545   case 2:
8546     ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
8547     break;
8548   case 4:
8549     ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
8550     break;
8551   }
8552
8553   if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
8554     return SDValue();
8555
8556   assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
8557          "Illegal vector type after legalization");
8558
8559   SDLoc DL(N);
8560   bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
8561   unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
8562                                       : Intrinsic::aarch64_neon_vcvtfp2fxu;
8563   SDValue FixConv =
8564       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
8565                   DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
8566                   Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
8567   // We can handle smaller integers by generating an extra trunc.
8568   if (IntBits < FloatBits)
8569     FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
8570
8571   return FixConv;
8572 }
8573
8574 /// Fold a floating-point divide by power of two into fixed-point to
8575 /// floating-point conversion.
8576 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
8577                                   TargetLowering::DAGCombinerInfo &DCI,
8578                                   const AArch64Subtarget *Subtarget) {
8579   if (!Subtarget->hasNEON())
8580     return SDValue();
8581
8582   SDValue Op = N->getOperand(0);
8583   unsigned Opc = Op->getOpcode();
8584   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
8585       !Op.getOperand(0).getValueType().isSimple() ||
8586       (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
8587     return SDValue();
8588
8589   SDValue ConstVec = N->getOperand(1);
8590   if (!isa<BuildVectorSDNode>(ConstVec))
8591     return SDValue();
8592
8593   MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
8594   int32_t IntBits = IntTy.getSizeInBits();
8595   if (IntBits != 16 && IntBits != 32 && IntBits != 64)
8596     return SDValue();
8597
8598   MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
8599   int32_t FloatBits = FloatTy.getSizeInBits();
8600   if (FloatBits != 32 && FloatBits != 64)
8601     return SDValue();
8602
8603   // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
8604   if (IntBits > FloatBits)
8605     return SDValue();
8606
8607   BitVector UndefElements;
8608   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
8609   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
8610   if (C == -1 || C == 0 || C > FloatBits)
8611     return SDValue();
8612
8613   MVT ResTy;
8614   unsigned NumLanes = Op.getValueType().getVectorNumElements();
8615   switch (NumLanes) {
8616   default:
8617     return SDValue();
8618   case 2:
8619     ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
8620     break;
8621   case 4:
8622     ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
8623     break;
8624   }
8625
8626   if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
8627     return SDValue();
8628
8629   SDLoc DL(N);
8630   SDValue ConvInput = Op.getOperand(0);
8631   bool IsSigned = Opc == ISD::SINT_TO_FP;
8632   if (IntBits < FloatBits)
8633     ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
8634                             ResTy, ConvInput);
8635
8636   unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
8637                                       : Intrinsic::aarch64_neon_vcvtfxu2fp;
8638   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
8639                      DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
8640                      DAG.getConstant(C, DL, MVT::i32));
8641 }
8642
8643 /// An EXTR instruction is made up of two shifts, ORed together. This helper
8644 /// searches for and classifies those shifts.
8645 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
8646                          bool &FromHi) {
8647   if (N.getOpcode() == ISD::SHL)
8648     FromHi = false;
8649   else if (N.getOpcode() == ISD::SRL)
8650     FromHi = true;
8651   else
8652     return false;
8653
8654   if (!isa<ConstantSDNode>(N.getOperand(1)))
8655     return false;
8656
8657   ShiftAmount = N->getConstantOperandVal(1);
8658   Src = N->getOperand(0);
8659   return true;
8660 }
8661
8662 /// EXTR instruction extracts a contiguous chunk of bits from two existing
8663 /// registers viewed as a high/low pair. This function looks for the pattern:
8664 /// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
8665 /// with an EXTR. Can't quite be done in TableGen because the two immediates
8666 /// aren't independent.
8667 static SDValue tryCombineToEXTR(SDNode *N,
8668                                 TargetLowering::DAGCombinerInfo &DCI) {
8669   SelectionDAG &DAG = DCI.DAG;
8670   SDLoc DL(N);
8671   EVT VT = N->getValueType(0);
8672
8673   assert(N->getOpcode() == ISD::OR && "Unexpected root");
8674
8675   if (VT != MVT::i32 && VT != MVT::i64)
8676     return SDValue();
8677
8678   SDValue LHS;
8679   uint32_t ShiftLHS = 0;
8680   bool LHSFromHi = false;
8681   if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
8682     return SDValue();
8683
8684   SDValue RHS;
8685   uint32_t ShiftRHS = 0;
8686   bool RHSFromHi = false;
8687   if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
8688     return SDValue();
8689
8690   // If they're both trying to come from the high part of the register, they're
8691   // not really an EXTR.
8692   if (LHSFromHi == RHSFromHi)
8693     return SDValue();
8694
8695   if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
8696     return SDValue();
8697
8698   if (LHSFromHi) {
8699     std::swap(LHS, RHS);
8700     std::swap(ShiftLHS, ShiftRHS);
8701   }
8702
8703   return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
8704                      DAG.getConstant(ShiftRHS, DL, MVT::i64));
8705 }
8706
8707 static SDValue tryCombineToBSL(SDNode *N,
8708                                 TargetLowering::DAGCombinerInfo &DCI) {
8709   EVT VT = N->getValueType(0);
8710   SelectionDAG &DAG = DCI.DAG;
8711   SDLoc DL(N);
8712
8713   if (!VT.isVector())
8714     return SDValue();
8715
8716   SDValue N0 = N->getOperand(0);
8717   if (N0.getOpcode() != ISD::AND)
8718     return SDValue();
8719
8720   SDValue N1 = N->getOperand(1);
8721   if (N1.getOpcode() != ISD::AND)
8722     return SDValue();
8723
8724   // We only have to look for constant vectors here since the general, variable
8725   // case can be handled in TableGen.
8726   unsigned Bits = VT.getScalarSizeInBits();
8727   uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
8728   for (int i = 1; i >= 0; --i)
8729     for (int j = 1; j >= 0; --j) {
8730       BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
8731       BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
8732       if (!BVN0 || !BVN1)
8733         continue;
8734
8735       bool FoundMatch = true;
8736       for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
8737         ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
8738         ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
8739         if (!CN0 || !CN1 ||
8740             CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
8741           FoundMatch = false;
8742           break;
8743         }
8744       }
8745
8746       if (FoundMatch)
8747         return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
8748                            N0->getOperand(1 - i), N1->getOperand(1 - j));
8749     }
8750
8751   return SDValue();
8752 }
8753
8754 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
8755                                 const AArch64Subtarget *Subtarget) {
8756   // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
8757   SelectionDAG &DAG = DCI.DAG;
8758   EVT VT = N->getValueType(0);
8759
8760   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
8761     return SDValue();
8762
8763   if (SDValue Res = tryCombineToEXTR(N, DCI))
8764     return Res;
8765
8766   if (SDValue Res = tryCombineToBSL(N, DCI))
8767     return Res;
8768
8769   return SDValue();
8770 }
8771
8772 static SDValue performSRLCombine(SDNode *N,
8773                                  TargetLowering::DAGCombinerInfo &DCI) {
8774   SelectionDAG &DAG = DCI.DAG;
8775   EVT VT = N->getValueType(0);
8776   if (VT != MVT::i32 && VT != MVT::i64)
8777     return SDValue();
8778
8779   // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
8780   // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
8781   // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
8782   SDValue N0 = N->getOperand(0);
8783   if (N0.getOpcode() == ISD::BSWAP) {
8784     SDLoc DL(N);
8785     SDValue N1 = N->getOperand(1);
8786     SDValue N00 = N0.getOperand(0);
8787     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
8788       uint64_t ShiftAmt = C->getZExtValue();
8789       if (VT == MVT::i32 && ShiftAmt == 16 &&
8790           DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16)))
8791         return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
8792       if (VT == MVT::i64 && ShiftAmt == 32 &&
8793           DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32)))
8794         return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
8795     }
8796   }
8797   return SDValue();
8798 }
8799
8800 static SDValue performBitcastCombine(SDNode *N,
8801                                      TargetLowering::DAGCombinerInfo &DCI,
8802                                      SelectionDAG &DAG) {
8803   // Wait 'til after everything is legalized to try this. That way we have
8804   // legal vector types and such.
8805   if (DCI.isBeforeLegalizeOps())
8806     return SDValue();
8807
8808   // Remove extraneous bitcasts around an extract_subvector.
8809   // For example,
8810   //    (v4i16 (bitconvert
8811   //             (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
8812   //  becomes
8813   //    (extract_subvector ((v8i16 ...), (i64 4)))
8814
8815   // Only interested in 64-bit vectors as the ultimate result.
8816   EVT VT = N->getValueType(0);
8817   if (!VT.isVector())
8818     return SDValue();
8819   if (VT.getSimpleVT().getSizeInBits() != 64)
8820     return SDValue();
8821   // Is the operand an extract_subvector starting at the beginning or halfway
8822   // point of the vector? A low half may also come through as an
8823   // EXTRACT_SUBREG, so look for that, too.
8824   SDValue Op0 = N->getOperand(0);
8825   if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
8826       !(Op0->isMachineOpcode() &&
8827         Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
8828     return SDValue();
8829   uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
8830   if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
8831     if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
8832       return SDValue();
8833   } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
8834     if (idx != AArch64::dsub)
8835       return SDValue();
8836     // The dsub reference is equivalent to a lane zero subvector reference.
8837     idx = 0;
8838   }
8839   // Look through the bitcast of the input to the extract.
8840   if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
8841     return SDValue();
8842   SDValue Source = Op0->getOperand(0)->getOperand(0);
8843   // If the source type has twice the number of elements as our destination
8844   // type, we know this is an extract of the high or low half of the vector.
8845   EVT SVT = Source->getValueType(0);
8846   if (!SVT.isVector() ||
8847       SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
8848     return SDValue();
8849
8850   DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
8851
8852   // Create the simplified form to just extract the low or high half of the
8853   // vector directly rather than bothering with the bitcasts.
8854   SDLoc dl(N);
8855   unsigned NumElements = VT.getVectorNumElements();
8856   if (idx) {
8857     SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64);
8858     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
8859   } else {
8860     SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32);
8861     return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
8862                                       Source, SubReg),
8863                    0);
8864   }
8865 }
8866
8867 static SDValue performConcatVectorsCombine(SDNode *N,
8868                                            TargetLowering::DAGCombinerInfo &DCI,
8869                                            SelectionDAG &DAG) {
8870   SDLoc dl(N);
8871   EVT VT = N->getValueType(0);
8872   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
8873
8874   // Optimize concat_vectors of truncated vectors, where the intermediate
8875   // type is illegal, to avoid said illegality,  e.g.,
8876   //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
8877   //                          (v2i16 (truncate (v2i64)))))
8878   // ->
8879   //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
8880   //                                    (v4i32 (bitcast (v2i64))),
8881   //                                    <0, 2, 4, 6>)))
8882   // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
8883   // on both input and result type, so we might generate worse code.
8884   // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
8885   if (N->getNumOperands() == 2 &&
8886       N0->getOpcode() == ISD::TRUNCATE &&
8887       N1->getOpcode() == ISD::TRUNCATE) {
8888     SDValue N00 = N0->getOperand(0);
8889     SDValue N10 = N1->getOperand(0);
8890     EVT N00VT = N00.getValueType();
8891
8892     if (N00VT == N10.getValueType() &&
8893         (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
8894         N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
8895       MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
8896       SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
8897       for (size_t i = 0; i < Mask.size(); ++i)
8898         Mask[i] = i * 2;
8899       return DAG.getNode(ISD::TRUNCATE, dl, VT,
8900                          DAG.getVectorShuffle(
8901                              MidVT, dl,
8902                              DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
8903                              DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
8904     }
8905   }
8906
8907   // Wait 'til after everything is legalized to try this. That way we have
8908   // legal vector types and such.
8909   if (DCI.isBeforeLegalizeOps())
8910     return SDValue();
8911
8912   // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
8913   // splat. The indexed instructions are going to be expecting a DUPLANE64, so
8914   // canonicalise to that.
8915   if (N0 == N1 && VT.getVectorNumElements() == 2) {
8916     assert(VT.getScalarSizeInBits() == 64);
8917     return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
8918                        DAG.getConstant(0, dl, MVT::i64));
8919   }
8920
8921   // Canonicalise concat_vectors so that the right-hand vector has as few
8922   // bit-casts as possible before its real operation. The primary matching
8923   // destination for these operations will be the narrowing "2" instructions,
8924   // which depend on the operation being performed on this right-hand vector.
8925   // For example,
8926   //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
8927   // becomes
8928   //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
8929
8930   if (N1->getOpcode() != ISD::BITCAST)
8931     return SDValue();
8932   SDValue RHS = N1->getOperand(0);
8933   MVT RHSTy = RHS.getValueType().getSimpleVT();
8934   // If the RHS is not a vector, this is not the pattern we're looking for.
8935   if (!RHSTy.isVector())
8936     return SDValue();
8937
8938   DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
8939
8940   MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
8941                                   RHSTy.getVectorNumElements() * 2);
8942   return DAG.getNode(ISD::BITCAST, dl, VT,
8943                      DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
8944                                  DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
8945                                  RHS));
8946 }
8947
8948 static SDValue tryCombineFixedPointConvert(SDNode *N,
8949                                            TargetLowering::DAGCombinerInfo &DCI,
8950                                            SelectionDAG &DAG) {
8951   // Wait until after everything is legalized to try this. That way we have
8952   // legal vector types and such.
8953   if (DCI.isBeforeLegalizeOps())
8954     return SDValue();
8955   // Transform a scalar conversion of a value from a lane extract into a
8956   // lane extract of a vector conversion. E.g., from foo1 to foo2:
8957   // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
8958   // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
8959   //
8960   // The second form interacts better with instruction selection and the
8961   // register allocator to avoid cross-class register copies that aren't
8962   // coalescable due to a lane reference.
8963
8964   // Check the operand and see if it originates from a lane extract.
8965   SDValue Op1 = N->getOperand(1);
8966   if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
8967     // Yep, no additional predication needed. Perform the transform.
8968     SDValue IID = N->getOperand(0);
8969     SDValue Shift = N->getOperand(2);
8970     SDValue Vec = Op1.getOperand(0);
8971     SDValue Lane = Op1.getOperand(1);
8972     EVT ResTy = N->getValueType(0);
8973     EVT VecResTy;
8974     SDLoc DL(N);
8975
8976     // The vector width should be 128 bits by the time we get here, even
8977     // if it started as 64 bits (the extract_vector handling will have
8978     // done so).
8979     assert(Vec.getValueSizeInBits() == 128 &&
8980            "unexpected vector size on extract_vector_elt!");
8981     if (Vec.getValueType() == MVT::v4i32)
8982       VecResTy = MVT::v4f32;
8983     else if (Vec.getValueType() == MVT::v2i64)
8984       VecResTy = MVT::v2f64;
8985     else
8986       llvm_unreachable("unexpected vector type!");
8987
8988     SDValue Convert =
8989         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
8990     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
8991   }
8992   return SDValue();
8993 }
8994
8995 // AArch64 high-vector "long" operations are formed by performing the non-high
8996 // version on an extract_subvector of each operand which gets the high half:
8997 //
8998 //  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
8999 //
9000 // However, there are cases which don't have an extract_high explicitly, but
9001 // have another operation that can be made compatible with one for free. For
9002 // example:
9003 //
9004 //  (dupv64 scalar) --> (extract_high (dup128 scalar))
9005 //
9006 // This routine does the actual conversion of such DUPs, once outer routines
9007 // have determined that everything else is in order.
9008 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
9009 // similarly here.
9010 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
9011   switch (N.getOpcode()) {
9012   case AArch64ISD::DUP:
9013   case AArch64ISD::DUPLANE8:
9014   case AArch64ISD::DUPLANE16:
9015   case AArch64ISD::DUPLANE32:
9016   case AArch64ISD::DUPLANE64:
9017   case AArch64ISD::MOVI:
9018   case AArch64ISD::MOVIshift:
9019   case AArch64ISD::MOVIedit:
9020   case AArch64ISD::MOVImsl:
9021   case AArch64ISD::MVNIshift:
9022   case AArch64ISD::MVNImsl:
9023     break;
9024   default:
9025     // FMOV could be supported, but isn't very useful, as it would only occur
9026     // if you passed a bitcast' floating point immediate to an eligible long
9027     // integer op (addl, smull, ...).
9028     return SDValue();
9029   }
9030
9031   MVT NarrowTy = N.getSimpleValueType();
9032   if (!NarrowTy.is64BitVector())
9033     return SDValue();
9034
9035   MVT ElementTy = NarrowTy.getVectorElementType();
9036   unsigned NumElems = NarrowTy.getVectorNumElements();
9037   MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
9038
9039   SDLoc dl(N);
9040   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
9041                      DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
9042                      DAG.getConstant(NumElems, dl, MVT::i64));
9043 }
9044
9045 static bool isEssentiallyExtractSubvector(SDValue N) {
9046   if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
9047     return true;
9048
9049   return N.getOpcode() == ISD::BITCAST &&
9050          N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
9051 }
9052
9053 /// \brief Helper structure to keep track of ISD::SET_CC operands.
9054 struct GenericSetCCInfo {
9055   const SDValue *Opnd0;
9056   const SDValue *Opnd1;
9057   ISD::CondCode CC;
9058 };
9059
9060 /// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
9061 struct AArch64SetCCInfo {
9062   const SDValue *Cmp;
9063   AArch64CC::CondCode CC;
9064 };
9065
9066 /// \brief Helper structure to keep track of SetCC information.
9067 union SetCCInfo {
9068   GenericSetCCInfo Generic;
9069   AArch64SetCCInfo AArch64;
9070 };
9071
9072 /// \brief Helper structure to be able to read SetCC information.  If set to
9073 /// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
9074 /// GenericSetCCInfo.
9075 struct SetCCInfoAndKind {
9076   SetCCInfo Info;
9077   bool IsAArch64;
9078 };
9079
9080 /// \brief Check whether or not \p Op is a SET_CC operation, either a generic or
9081 /// an
9082 /// AArch64 lowered one.
9083 /// \p SetCCInfo is filled accordingly.
9084 /// \post SetCCInfo is meanginfull only when this function returns true.
9085 /// \return True when Op is a kind of SET_CC operation.
9086 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
9087   // If this is a setcc, this is straight forward.
9088   if (Op.getOpcode() == ISD::SETCC) {
9089     SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
9090     SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
9091     SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
9092     SetCCInfo.IsAArch64 = false;
9093     return true;
9094   }
9095   // Otherwise, check if this is a matching csel instruction.
9096   // In other words:
9097   // - csel 1, 0, cc
9098   // - csel 0, 1, !cc
9099   if (Op.getOpcode() != AArch64ISD::CSEL)
9100     return false;
9101   // Set the information about the operands.
9102   // TODO: we want the operands of the Cmp not the csel
9103   SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
9104   SetCCInfo.IsAArch64 = true;
9105   SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
9106       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
9107
9108   // Check that the operands matches the constraints:
9109   // (1) Both operands must be constants.
9110   // (2) One must be 1 and the other must be 0.
9111   ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
9112   ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
9113
9114   // Check (1).
9115   if (!TValue || !FValue)
9116     return false;
9117
9118   // Check (2).
9119   if (!TValue->isOne()) {
9120     // Update the comparison when we are interested in !cc.
9121     std::swap(TValue, FValue);
9122     SetCCInfo.Info.AArch64.CC =
9123         AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
9124   }
9125   return TValue->isOne() && FValue->isNullValue();
9126 }
9127
9128 // Returns true if Op is setcc or zext of setcc.
9129 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
9130   if (isSetCC(Op, Info))
9131     return true;
9132   return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
9133     isSetCC(Op->getOperand(0), Info));
9134 }
9135
9136 // The folding we want to perform is:
9137 // (add x, [zext] (setcc cc ...) )
9138 //   -->
9139 // (csel x, (add x, 1), !cc ...)
9140 //
9141 // The latter will get matched to a CSINC instruction.
9142 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
9143   assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
9144   SDValue LHS = Op->getOperand(0);
9145   SDValue RHS = Op->getOperand(1);
9146   SetCCInfoAndKind InfoAndKind;
9147
9148   // If neither operand is a SET_CC, give up.
9149   if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
9150     std::swap(LHS, RHS);
9151     if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
9152       return SDValue();
9153   }
9154
9155   // FIXME: This could be generatized to work for FP comparisons.
9156   EVT CmpVT = InfoAndKind.IsAArch64
9157                   ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
9158                   : InfoAndKind.Info.Generic.Opnd0->getValueType();
9159   if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
9160     return SDValue();
9161
9162   SDValue CCVal;
9163   SDValue Cmp;
9164   SDLoc dl(Op);
9165   if (InfoAndKind.IsAArch64) {
9166     CCVal = DAG.getConstant(
9167         AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
9168         MVT::i32);
9169     Cmp = *InfoAndKind.Info.AArch64.Cmp;
9170   } else
9171     Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
9172                       *InfoAndKind.Info.Generic.Opnd1,
9173                       ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
9174                       CCVal, DAG, dl);
9175
9176   EVT VT = Op->getValueType(0);
9177   LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
9178   return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
9179 }
9180
9181 // The basic add/sub long vector instructions have variants with "2" on the end
9182 // which act on the high-half of their inputs. They are normally matched by
9183 // patterns like:
9184 //
9185 // (add (zeroext (extract_high LHS)),
9186 //      (zeroext (extract_high RHS)))
9187 // -> uaddl2 vD, vN, vM
9188 //
9189 // However, if one of the extracts is something like a duplicate, this
9190 // instruction can still be used profitably. This function puts the DAG into a
9191 // more appropriate form for those patterns to trigger.
9192 static SDValue performAddSubLongCombine(SDNode *N,
9193                                         TargetLowering::DAGCombinerInfo &DCI,
9194                                         SelectionDAG &DAG) {
9195   if (DCI.isBeforeLegalizeOps())
9196     return SDValue();
9197
9198   MVT VT = N->getSimpleValueType(0);
9199   if (!VT.is128BitVector()) {
9200     if (N->getOpcode() == ISD::ADD)
9201       return performSetccAddFolding(N, DAG);
9202     return SDValue();
9203   }
9204
9205   // Make sure both branches are extended in the same way.
9206   SDValue LHS = N->getOperand(0);
9207   SDValue RHS = N->getOperand(1);
9208   if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
9209        LHS.getOpcode() != ISD::SIGN_EXTEND) ||
9210       LHS.getOpcode() != RHS.getOpcode())
9211     return SDValue();
9212
9213   unsigned ExtType = LHS.getOpcode();
9214
9215   // It's not worth doing if at least one of the inputs isn't already an
9216   // extract, but we don't know which it'll be so we have to try both.
9217   if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
9218     RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
9219     if (!RHS.getNode())
9220       return SDValue();
9221
9222     RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
9223   } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
9224     LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
9225     if (!LHS.getNode())
9226       return SDValue();
9227
9228     LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
9229   }
9230
9231   return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
9232 }
9233
9234 // Massage DAGs which we can use the high-half "long" operations on into
9235 // something isel will recognize better. E.g.
9236 //
9237 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
9238 //   (aarch64_neon_umull (extract_high (v2i64 vec)))
9239 //                     (extract_high (v2i64 (dup128 scalar)))))
9240 //
9241 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
9242                                        TargetLowering::DAGCombinerInfo &DCI,
9243                                        SelectionDAG &DAG) {
9244   if (DCI.isBeforeLegalizeOps())
9245     return SDValue();
9246
9247   SDValue LHS = N->getOperand(1);
9248   SDValue RHS = N->getOperand(2);
9249   assert(LHS.getValueType().is64BitVector() &&
9250          RHS.getValueType().is64BitVector() &&
9251          "unexpected shape for long operation");
9252
9253   // Either node could be a DUP, but it's not worth doing both of them (you'd
9254   // just as well use the non-high version) so look for a corresponding extract
9255   // operation on the other "wing".
9256   if (isEssentiallyExtractSubvector(LHS)) {
9257     RHS = tryExtendDUPToExtractHigh(RHS, DAG);
9258     if (!RHS.getNode())
9259       return SDValue();
9260   } else if (isEssentiallyExtractSubvector(RHS)) {
9261     LHS = tryExtendDUPToExtractHigh(LHS, DAG);
9262     if (!LHS.getNode())
9263       return SDValue();
9264   }
9265
9266   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
9267                      N->getOperand(0), LHS, RHS);
9268 }
9269
9270 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
9271   MVT ElemTy = N->getSimpleValueType(0).getScalarType();
9272   unsigned ElemBits = ElemTy.getSizeInBits();
9273
9274   int64_t ShiftAmount;
9275   if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
9276     APInt SplatValue, SplatUndef;
9277     unsigned SplatBitSize;
9278     bool HasAnyUndefs;
9279     if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
9280                               HasAnyUndefs, ElemBits) ||
9281         SplatBitSize != ElemBits)
9282       return SDValue();
9283
9284     ShiftAmount = SplatValue.getSExtValue();
9285   } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
9286     ShiftAmount = CVN->getSExtValue();
9287   } else
9288     return SDValue();
9289
9290   unsigned Opcode;
9291   bool IsRightShift;
9292   switch (IID) {
9293   default:
9294     llvm_unreachable("Unknown shift intrinsic");
9295   case Intrinsic::aarch64_neon_sqshl:
9296     Opcode = AArch64ISD::SQSHL_I;
9297     IsRightShift = false;
9298     break;
9299   case Intrinsic::aarch64_neon_uqshl:
9300     Opcode = AArch64ISD::UQSHL_I;
9301     IsRightShift = false;
9302     break;
9303   case Intrinsic::aarch64_neon_srshl:
9304     Opcode = AArch64ISD::SRSHR_I;
9305     IsRightShift = true;
9306     break;
9307   case Intrinsic::aarch64_neon_urshl:
9308     Opcode = AArch64ISD::URSHR_I;
9309     IsRightShift = true;
9310     break;
9311   case Intrinsic::aarch64_neon_sqshlu:
9312     Opcode = AArch64ISD::SQSHLU_I;
9313     IsRightShift = false;
9314     break;
9315   }
9316
9317   if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
9318     SDLoc dl(N);
9319     return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
9320                        DAG.getConstant(-ShiftAmount, dl, MVT::i32));
9321   } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
9322     SDLoc dl(N);
9323     return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
9324                        DAG.getConstant(ShiftAmount, dl, MVT::i32));
9325   }
9326
9327   return SDValue();
9328 }
9329
9330 // The CRC32[BH] instructions ignore the high bits of their data operand. Since
9331 // the intrinsics must be legal and take an i32, this means there's almost
9332 // certainly going to be a zext in the DAG which we can eliminate.
9333 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
9334   SDValue AndN = N->getOperand(2);
9335   if (AndN.getOpcode() != ISD::AND)
9336     return SDValue();
9337
9338   ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
9339   if (!CMask || CMask->getZExtValue() != Mask)
9340     return SDValue();
9341
9342   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
9343                      N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
9344 }
9345
9346 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
9347                                            SelectionDAG &DAG) {
9348   SDLoc dl(N);
9349   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
9350                      DAG.getNode(Opc, dl,
9351                                  N->getOperand(1).getSimpleValueType(),
9352                                  N->getOperand(1)),
9353                      DAG.getConstant(0, dl, MVT::i64));
9354 }
9355
9356 static SDValue performIntrinsicCombine(SDNode *N,
9357                                        TargetLowering::DAGCombinerInfo &DCI,
9358                                        const AArch64Subtarget *Subtarget) {
9359   SelectionDAG &DAG = DCI.DAG;
9360   unsigned IID = getIntrinsicID(N);
9361   switch (IID) {
9362   default:
9363     break;
9364   case Intrinsic::aarch64_neon_vcvtfxs2fp:
9365   case Intrinsic::aarch64_neon_vcvtfxu2fp:
9366     return tryCombineFixedPointConvert(N, DCI, DAG);
9367   case Intrinsic::aarch64_neon_saddv:
9368     return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
9369   case Intrinsic::aarch64_neon_uaddv:
9370     return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
9371   case Intrinsic::aarch64_neon_sminv:
9372     return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
9373   case Intrinsic::aarch64_neon_uminv:
9374     return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
9375   case Intrinsic::aarch64_neon_smaxv:
9376     return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
9377   case Intrinsic::aarch64_neon_umaxv:
9378     return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
9379   case Intrinsic::aarch64_neon_fmax:
9380     return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0),
9381                        N->getOperand(1), N->getOperand(2));
9382   case Intrinsic::aarch64_neon_fmin:
9383     return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0),
9384                        N->getOperand(1), N->getOperand(2));
9385   case Intrinsic::aarch64_neon_fmaxnm:
9386     return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
9387                        N->getOperand(1), N->getOperand(2));
9388   case Intrinsic::aarch64_neon_fminnm:
9389     return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
9390                        N->getOperand(1), N->getOperand(2));
9391   case Intrinsic::aarch64_neon_smull:
9392   case Intrinsic::aarch64_neon_umull:
9393   case Intrinsic::aarch64_neon_pmull:
9394   case Intrinsic::aarch64_neon_sqdmull:
9395     return tryCombineLongOpWithDup(IID, N, DCI, DAG);
9396   case Intrinsic::aarch64_neon_sqshl:
9397   case Intrinsic::aarch64_neon_uqshl:
9398   case Intrinsic::aarch64_neon_sqshlu:
9399   case Intrinsic::aarch64_neon_srshl:
9400   case Intrinsic::aarch64_neon_urshl:
9401     return tryCombineShiftImm(IID, N, DAG);
9402   case Intrinsic::aarch64_crc32b:
9403   case Intrinsic::aarch64_crc32cb:
9404     return tryCombineCRC32(0xff, N, DAG);
9405   case Intrinsic::aarch64_crc32h:
9406   case Intrinsic::aarch64_crc32ch:
9407     return tryCombineCRC32(0xffff, N, DAG);
9408   }
9409   return SDValue();
9410 }
9411
9412 static SDValue performExtendCombine(SDNode *N,
9413                                     TargetLowering::DAGCombinerInfo &DCI,
9414                                     SelectionDAG &DAG) {
9415   // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
9416   // we can convert that DUP into another extract_high (of a bigger DUP), which
9417   // helps the backend to decide that an sabdl2 would be useful, saving a real
9418   // extract_high operation.
9419   if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
9420       N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
9421     SDNode *ABDNode = N->getOperand(0).getNode();
9422     unsigned IID = getIntrinsicID(ABDNode);
9423     if (IID == Intrinsic::aarch64_neon_sabd ||
9424         IID == Intrinsic::aarch64_neon_uabd) {
9425       SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
9426       if (!NewABD.getNode())
9427         return SDValue();
9428
9429       return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
9430                          NewABD);
9431     }
9432   }
9433
9434   // This is effectively a custom type legalization for AArch64.
9435   //
9436   // Type legalization will split an extend of a small, legal, type to a larger
9437   // illegal type by first splitting the destination type, often creating
9438   // illegal source types, which then get legalized in isel-confusing ways,
9439   // leading to really terrible codegen. E.g.,
9440   //   %result = v8i32 sext v8i8 %value
9441   // becomes
9442   //   %losrc = extract_subreg %value, ...
9443   //   %hisrc = extract_subreg %value, ...
9444   //   %lo = v4i32 sext v4i8 %losrc
9445   //   %hi = v4i32 sext v4i8 %hisrc
9446   // Things go rapidly downhill from there.
9447   //
9448   // For AArch64, the [sz]ext vector instructions can only go up one element
9449   // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
9450   // take two instructions.
9451   //
9452   // This implies that the most efficient way to do the extend from v8i8
9453   // to two v4i32 values is to first extend the v8i8 to v8i16, then do
9454   // the normal splitting to happen for the v8i16->v8i32.
9455
9456   // This is pre-legalization to catch some cases where the default
9457   // type legalization will create ill-tempered code.
9458   if (!DCI.isBeforeLegalizeOps())
9459     return SDValue();
9460
9461   // We're only interested in cleaning things up for non-legal vector types
9462   // here. If both the source and destination are legal, things will just
9463   // work naturally without any fiddling.
9464   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9465   EVT ResVT = N->getValueType(0);
9466   if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
9467     return SDValue();
9468   // If the vector type isn't a simple VT, it's beyond the scope of what
9469   // we're  worried about here. Let legalization do its thing and hope for
9470   // the best.
9471   SDValue Src = N->getOperand(0);
9472   EVT SrcVT = Src->getValueType(0);
9473   if (!ResVT.isSimple() || !SrcVT.isSimple())
9474     return SDValue();
9475
9476   // If the source VT is a 64-bit vector, we can play games and get the
9477   // better results we want.
9478   if (SrcVT.getSizeInBits() != 64)
9479     return SDValue();
9480
9481   unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
9482   unsigned ElementCount = SrcVT.getVectorNumElements();
9483   SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
9484   SDLoc DL(N);
9485   Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
9486
9487   // Now split the rest of the operation into two halves, each with a 64
9488   // bit source.
9489   EVT LoVT, HiVT;
9490   SDValue Lo, Hi;
9491   unsigned NumElements = ResVT.getVectorNumElements();
9492   assert(!(NumElements & 1) && "Splitting vector, but not in half!");
9493   LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
9494                                  ResVT.getVectorElementType(), NumElements / 2);
9495
9496   EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
9497                                LoVT.getVectorNumElements());
9498   Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
9499                    DAG.getConstant(0, DL, MVT::i64));
9500   Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
9501                    DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64));
9502   Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
9503   Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
9504
9505   // Now combine the parts back together so we still have a single result
9506   // like the combiner expects.
9507   return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
9508 }
9509
9510 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
9511                                SDValue SplatVal, unsigned NumVecElts) {
9512   unsigned OrigAlignment = St.getAlignment();
9513   unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
9514
9515   // Create scalar stores. This is at least as good as the code sequence for a
9516   // split unaligned store which is a dup.s, ext.b, and two stores.
9517   // Most of the time the three stores should be replaced by store pair
9518   // instructions (stp).
9519   SDLoc DL(&St);
9520   SDValue BasePtr = St.getBasePtr();
9521   uint64_t BaseOffset = 0;
9522
9523   const MachinePointerInfo &PtrInfo = St.getPointerInfo();
9524   SDValue NewST1 =
9525       DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
9526                    OrigAlignment, St.getMemOperand()->getFlags());
9527
9528   // As this in ISel, we will not merge this add which may degrade results.
9529   if (BasePtr->getOpcode() == ISD::ADD &&
9530       isa<ConstantSDNode>(BasePtr->getOperand(1))) {
9531     BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
9532     BasePtr = BasePtr->getOperand(0);
9533   }
9534
9535   unsigned Offset = EltOffset;
9536   while (--NumVecElts) {
9537     unsigned Alignment = MinAlign(OrigAlignment, Offset);
9538     SDValue OffsetPtr =
9539         DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
9540                     DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
9541     NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
9542                           PtrInfo.getWithOffset(Offset), Alignment,
9543                           St.getMemOperand()->getFlags());
9544     Offset += EltOffset;
9545   }
9546   return NewST1;
9547 }
9548
9549 /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.  The
9550 /// load store optimizer pass will merge them to store pair stores.  This should
9551 /// be better than a movi to create the vector zero followed by a vector store
9552 /// if the zero constant is not re-used, since one instructions and one register
9553 /// live range will be removed.
9554 ///
9555 /// For example, the final generated code should be:
9556 ///
9557 ///   stp xzr, xzr, [x0]
9558 ///
9559 /// instead of:
9560 ///
9561 ///   movi v0.2d, #0
9562 ///   str q0, [x0]
9563 ///
9564 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
9565   SDValue StVal = St.getValue();
9566   EVT VT = StVal.getValueType();
9567
9568   // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
9569   // 2, 3 or 4 i32 elements.
9570   int NumVecElts = VT.getVectorNumElements();
9571   if (!(((NumVecElts == 2 || NumVecElts == 3) &&
9572          VT.getVectorElementType().getSizeInBits() == 64) ||
9573         ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
9574          VT.getVectorElementType().getSizeInBits() == 32)))
9575     return SDValue();
9576
9577   if (StVal.getOpcode() != ISD::BUILD_VECTOR)
9578     return SDValue();
9579
9580   // If the zero constant has more than one use then the vector store could be
9581   // better since the constant mov will be amortized and stp q instructions
9582   // should be able to be formed.
9583   if (!StVal.hasOneUse())
9584     return SDValue();
9585
9586   // If the immediate offset of the address operand is too large for the stp
9587   // instruction, then bail out.
9588   if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
9589     int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
9590     if (Offset < -512 || Offset > 504)
9591       return SDValue();
9592   }
9593
9594   for (int I = 0; I < NumVecElts; ++I) {
9595     SDValue EltVal = StVal.getOperand(I);
9596     if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
9597       return SDValue();
9598   }
9599
9600   // Use a CopyFromReg WZR/XZR here to prevent
9601   // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
9602   SDLoc DL(&St);
9603   unsigned ZeroReg;
9604   EVT ZeroVT;
9605   if (VT.getVectorElementType().getSizeInBits() == 32) {
9606     ZeroReg = AArch64::WZR;
9607     ZeroVT = MVT::i32;
9608   } else {
9609     ZeroReg = AArch64::XZR;
9610     ZeroVT = MVT::i64;
9611   }
9612   SDValue SplatVal =
9613       DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
9614   return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
9615 }
9616
9617 /// Replace a splat of a scalar to a vector store by scalar stores of the scalar
9618 /// value. The load store optimizer pass will merge them to store pair stores.
9619 /// This has better performance than a splat of the scalar followed by a split
9620 /// vector store. Even if the stores are not merged it is four stores vs a dup,
9621 /// followed by an ext.b and two stores.
9622 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
9623   SDValue StVal = St.getValue();
9624   EVT VT = StVal.getValueType();
9625
9626   // Don't replace floating point stores, they possibly won't be transformed to
9627   // stp because of the store pair suppress pass.
9628   if (VT.isFloatingPoint())
9629     return SDValue();
9630
9631   // We can express a splat as store pair(s) for 2 or 4 elements.
9632   unsigned NumVecElts = VT.getVectorNumElements();
9633   if (NumVecElts != 4 && NumVecElts != 2)
9634     return SDValue();
9635
9636   // Check that this is a splat.
9637   // Make sure that each of the relevant vector element locations are inserted
9638   // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
9639   std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
9640   SDValue SplatVal;
9641   for (unsigned I = 0; I < NumVecElts; ++I) {
9642     // Check for insert vector elements.
9643     if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
9644       return SDValue();
9645
9646     // Check that same value is inserted at each vector element.
9647     if (I == 0)
9648       SplatVal = StVal.getOperand(1);
9649     else if (StVal.getOperand(1) != SplatVal)
9650       return SDValue();
9651
9652     // Check insert element index.
9653     ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
9654     if (!CIndex)
9655       return SDValue();
9656     uint64_t IndexVal = CIndex->getZExtValue();
9657     if (IndexVal >= NumVecElts)
9658       return SDValue();
9659     IndexNotInserted.reset(IndexVal);
9660
9661     StVal = StVal.getOperand(0);
9662   }
9663   // Check that all vector element locations were inserted to.
9664   if (IndexNotInserted.any())
9665       return SDValue();
9666
9667   return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
9668 }
9669
9670 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
9671                            SelectionDAG &DAG,
9672                            const AArch64Subtarget *Subtarget) {
9673
9674   StoreSDNode *S = cast<StoreSDNode>(N);
9675   if (S->isVolatile() || S->isIndexed())
9676     return SDValue();
9677
9678   SDValue StVal = S->getValue();
9679   EVT VT = StVal.getValueType();
9680   if (!VT.isVector())
9681     return SDValue();
9682
9683   // If we get a splat of zeros, convert this vector store to a store of
9684   // scalars. They will be merged into store pairs of xzr thereby removing one
9685   // instruction and one register.
9686   if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
9687     return ReplacedZeroSplat;
9688
9689   // FIXME: The logic for deciding if an unaligned store should be split should
9690   // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
9691   // a call to that function here.
9692
9693   if (!Subtarget->isMisaligned128StoreSlow())
9694     return SDValue();
9695
9696   // Don't split at -Oz.
9697   if (DAG.getMachineFunction().getFunction().optForMinSize())
9698     return SDValue();
9699
9700   // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
9701   // those up regresses performance on micro-benchmarks and olden/bh.
9702   if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
9703     return SDValue();
9704
9705   // Split unaligned 16B stores. They are terrible for performance.
9706   // Don't split stores with alignment of 1 or 2. Code that uses clang vector
9707   // extensions can use this to mark that it does not want splitting to happen
9708   // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
9709   // eliminating alignment hazards is only 1 in 8 for alignment of 2.
9710   if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
9711       S->getAlignment() <= 2)
9712     return SDValue();
9713
9714   // If we get a splat of a scalar convert this vector store to a store of
9715   // scalars. They will be merged into store pairs thereby removing two
9716   // instructions.
9717   if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
9718     return ReplacedSplat;
9719
9720   SDLoc DL(S);
9721   unsigned NumElts = VT.getVectorNumElements() / 2;
9722   // Split VT into two.
9723   EVT HalfVT =
9724       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
9725   SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
9726                                    DAG.getConstant(0, DL, MVT::i64));
9727   SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
9728                                    DAG.getConstant(NumElts, DL, MVT::i64));
9729   SDValue BasePtr = S->getBasePtr();
9730   SDValue NewST1 =
9731       DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
9732                    S->getAlignment(), S->getMemOperand()->getFlags());
9733   SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
9734                                   DAG.getConstant(8, DL, MVT::i64));
9735   return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
9736                       S->getPointerInfo(), S->getAlignment(),
9737                       S->getMemOperand()->getFlags());
9738 }
9739
9740 /// Target-specific DAG combine function for post-increment LD1 (lane) and
9741 /// post-increment LD1R.
9742 static SDValue performPostLD1Combine(SDNode *N,
9743                                      TargetLowering::DAGCombinerInfo &DCI,
9744                                      bool IsLaneOp) {
9745   if (DCI.isBeforeLegalizeOps())
9746     return SDValue();
9747
9748   SelectionDAG &DAG = DCI.DAG;
9749   EVT VT = N->getValueType(0);
9750
9751   unsigned LoadIdx = IsLaneOp ? 1 : 0;
9752   SDNode *LD = N->getOperand(LoadIdx).getNode();
9753   // If it is not LOAD, can not do such combine.
9754   if (LD->getOpcode() != ISD::LOAD)
9755     return SDValue();
9756
9757   LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
9758   EVT MemVT = LoadSDN->getMemoryVT();
9759   // Check if memory operand is the same type as the vector element.
9760   if (MemVT != VT.getVectorElementType())
9761     return SDValue();
9762
9763   // Check if there are other uses. If so, do not combine as it will introduce
9764   // an extra load.
9765   for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
9766        ++UI) {
9767     if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
9768       continue;
9769     if (*UI != N)
9770       return SDValue();
9771   }
9772
9773   SDValue Addr = LD->getOperand(1);
9774   SDValue Vector = N->getOperand(0);
9775   // Search for a use of the address operand that is an increment.
9776   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
9777        Addr.getNode()->use_end(); UI != UE; ++UI) {
9778     SDNode *User = *UI;
9779     if (User->getOpcode() != ISD::ADD
9780         || UI.getUse().getResNo() != Addr.getResNo())
9781       continue;
9782
9783     // Check that the add is independent of the load.  Otherwise, folding it
9784     // would create a cycle.
9785     if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
9786       continue;
9787     // Also check that add is not used in the vector operand.  This would also
9788     // create a cycle.
9789     if (User->isPredecessorOf(Vector.getNode()))
9790       continue;
9791
9792     // If the increment is a constant, it must match the memory ref size.
9793     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
9794     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
9795       uint32_t IncVal = CInc->getZExtValue();
9796       unsigned NumBytes = VT.getScalarSizeInBits() / 8;
9797       if (IncVal != NumBytes)
9798         continue;
9799       Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
9800     }
9801
9802     // Finally, check that the vector doesn't depend on the load.
9803     // Again, this would create a cycle.
9804     // The load depending on the vector is fine, as that's the case for the
9805     // LD1*post we'll eventually generate anyway.
9806     if (LoadSDN->isPredecessorOf(Vector.getNode()))
9807       continue;
9808
9809     SmallVector<SDValue, 8> Ops;
9810     Ops.push_back(LD->getOperand(0));  // Chain
9811     if (IsLaneOp) {
9812       Ops.push_back(Vector);           // The vector to be inserted
9813       Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
9814     }
9815     Ops.push_back(Addr);
9816     Ops.push_back(Inc);
9817
9818     EVT Tys[3] = { VT, MVT::i64, MVT::Other };
9819     SDVTList SDTys = DAG.getVTList(Tys);
9820     unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
9821     SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
9822                                            MemVT,
9823                                            LoadSDN->getMemOperand());
9824
9825     // Update the uses.
9826     SDValue NewResults[] = {
9827         SDValue(LD, 0),            // The result of load
9828         SDValue(UpdN.getNode(), 2) // Chain
9829     };
9830     DCI.CombineTo(LD, NewResults);
9831     DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
9832     DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register
9833
9834     break;
9835   }
9836   return SDValue();
9837 }
9838
9839 /// Simplify ``Addr`` given that the top byte of it is ignored by HW during
9840 /// address translation.
9841 static bool performTBISimplification(SDValue Addr,
9842                                      TargetLowering::DAGCombinerInfo &DCI,
9843                                      SelectionDAG &DAG) {
9844   APInt DemandedMask = APInt::getLowBitsSet(64, 56);
9845   KnownBits Known;
9846   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
9847                                         !DCI.isBeforeLegalizeOps());
9848   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9849   if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
9850     DCI.CommitTargetLoweringOpt(TLO);
9851     return true;
9852   }
9853   return false;
9854 }
9855
9856 static SDValue performSTORECombine(SDNode *N,
9857                                    TargetLowering::DAGCombinerInfo &DCI,
9858                                    SelectionDAG &DAG,
9859                                    const AArch64Subtarget *Subtarget) {
9860   if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
9861     return Split;
9862
9863   if (Subtarget->supportsAddressTopByteIgnored() &&
9864       performTBISimplification(N->getOperand(2), DCI, DAG))
9865     return SDValue(N, 0);
9866
9867   return SDValue();
9868 }
9869
9870
9871 /// Target-specific DAG combine function for NEON load/store intrinsics
9872 /// to merge base address updates.
9873 static SDValue performNEONPostLDSTCombine(SDNode *N,
9874                                           TargetLowering::DAGCombinerInfo &DCI,
9875                                           SelectionDAG &DAG) {
9876   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9877     return SDValue();
9878
9879   unsigned AddrOpIdx = N->getNumOperands() - 1;
9880   SDValue Addr = N->getOperand(AddrOpIdx);
9881
9882   // Search for a use of the address operand that is an increment.
9883   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
9884        UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
9885     SDNode *User = *UI;
9886     if (User->getOpcode() != ISD::ADD ||
9887         UI.getUse().getResNo() != Addr.getResNo())
9888       continue;
9889
9890     // Check that the add is independent of the load/store.  Otherwise, folding
9891     // it would create a cycle.
9892     if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
9893       continue;
9894
9895     // Find the new opcode for the updating load/store.
9896     bool IsStore = false;
9897     bool IsLaneOp = false;
9898     bool IsDupOp = false;
9899     unsigned NewOpc = 0;
9900     unsigned NumVecs = 0;
9901     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
9902     switch (IntNo) {
9903     default: llvm_unreachable("unexpected intrinsic for Neon base update");
9904     case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
9905       NumVecs = 2; break;
9906     case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
9907       NumVecs = 3; break;
9908     case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
9909       NumVecs = 4; break;
9910     case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
9911       NumVecs = 2; IsStore = true; break;
9912     case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
9913       NumVecs = 3; IsStore = true; break;
9914     case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
9915       NumVecs = 4; IsStore = true; break;
9916     case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
9917       NumVecs = 2; break;
9918     case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
9919       NumVecs = 3; break;
9920     case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
9921       NumVecs = 4; break;
9922     case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
9923       NumVecs = 2; IsStore = true; break;
9924     case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
9925       NumVecs = 3; IsStore = true; break;
9926     case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
9927       NumVecs = 4; IsStore = true; break;
9928     case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
9929       NumVecs = 2; IsDupOp = true; break;
9930     case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
9931       NumVecs = 3; IsDupOp = true; break;
9932     case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
9933       NumVecs = 4; IsDupOp = true; break;
9934     case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
9935       NumVecs = 2; IsLaneOp = true; break;
9936     case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
9937       NumVecs = 3; IsLaneOp = true; break;
9938     case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
9939       NumVecs = 4; IsLaneOp = true; break;
9940     case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
9941       NumVecs = 2; IsStore = true; IsLaneOp = true; break;
9942     case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
9943       NumVecs = 3; IsStore = true; IsLaneOp = true; break;
9944     case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
9945       NumVecs = 4; IsStore = true; IsLaneOp = true; break;
9946     }
9947
9948     EVT VecTy;
9949     if (IsStore)
9950       VecTy = N->getOperand(2).getValueType();
9951     else
9952       VecTy = N->getValueType(0);
9953
9954     // If the increment is a constant, it must match the memory ref size.
9955     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
9956     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
9957       uint32_t IncVal = CInc->getZExtValue();
9958       unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
9959       if (IsLaneOp || IsDupOp)
9960         NumBytes /= VecTy.getVectorNumElements();
9961       if (IncVal != NumBytes)
9962         continue;
9963       Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
9964     }
9965     SmallVector<SDValue, 8> Ops;
9966     Ops.push_back(N->getOperand(0)); // Incoming chain
9967     // Load lane and store have vector list as input.
9968     if (IsLaneOp || IsStore)
9969       for (unsigned i = 2; i < AddrOpIdx; ++i)
9970         Ops.push_back(N->getOperand(i));
9971     Ops.push_back(Addr); // Base register
9972     Ops.push_back(Inc);
9973
9974     // Return Types.
9975     EVT Tys[6];
9976     unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
9977     unsigned n;
9978     for (n = 0; n < NumResultVecs; ++n)
9979       Tys[n] = VecTy;
9980     Tys[n++] = MVT::i64;  // Type of write back register
9981     Tys[n] = MVT::Other;  // Type of the chain
9982     SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
9983
9984     MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
9985     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
9986                                            MemInt->getMemoryVT(),
9987                                            MemInt->getMemOperand());
9988
9989     // Update the uses.
9990     std::vector<SDValue> NewResults;
9991     for (unsigned i = 0; i < NumResultVecs; ++i) {
9992       NewResults.push_back(SDValue(UpdN.getNode(), i));
9993     }
9994     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
9995     DCI.CombineTo(N, NewResults);
9996     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
9997
9998     break;
9999   }
10000   return SDValue();
10001 }
10002
10003 // Checks to see if the value is the prescribed width and returns information
10004 // about its extension mode.
10005 static
10006 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
10007   ExtType = ISD::NON_EXTLOAD;
10008   switch(V.getNode()->getOpcode()) {
10009   default:
10010     return false;
10011   case ISD::LOAD: {
10012     LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
10013     if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
10014        || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
10015       ExtType = LoadNode->getExtensionType();
10016       return true;
10017     }
10018     return false;
10019   }
10020   case ISD::AssertSext: {
10021     VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
10022     if ((TypeNode->getVT() == MVT::i8 && width == 8)
10023        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
10024       ExtType = ISD::SEXTLOAD;
10025       return true;
10026     }
10027     return false;
10028   }
10029   case ISD::AssertZext: {
10030     VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
10031     if ((TypeNode->getVT() == MVT::i8 && width == 8)
10032        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
10033       ExtType = ISD::ZEXTLOAD;
10034       return true;
10035     }
10036     return false;
10037   }
10038   case ISD::Constant:
10039   case ISD::TargetConstant: {
10040     return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
10041            1LL << (width - 1);
10042   }
10043   }
10044
10045   return true;
10046 }
10047
10048 // This function does a whole lot of voodoo to determine if the tests are
10049 // equivalent without and with a mask. Essentially what happens is that given a
10050 // DAG resembling:
10051 //
10052 //  +-------------+ +-------------+ +-------------+ +-------------+
10053 //  |    Input    | | AddConstant | | CompConstant| |     CC      |
10054 //  +-------------+ +-------------+ +-------------+ +-------------+
10055 //           |           |           |               |
10056 //           V           V           |    +----------+
10057 //          +-------------+  +----+  |    |
10058 //          |     ADD     |  |0xff|  |    |
10059 //          +-------------+  +----+  |    |
10060 //                  |           |    |    |
10061 //                  V           V    |    |
10062 //                 +-------------+   |    |
10063 //                 |     AND     |   |    |
10064 //                 +-------------+   |    |
10065 //                      |            |    |
10066 //                      +-----+      |    |
10067 //                            |      |    |
10068 //                            V      V    V
10069 //                           +-------------+
10070 //                           |     CMP     |
10071 //                           +-------------+
10072 //
10073 // The AND node may be safely removed for some combinations of inputs. In
10074 // particular we need to take into account the extension type of the Input,
10075 // the exact values of AddConstant, CompConstant, and CC, along with the nominal
10076 // width of the input (this can work for any width inputs, the above graph is
10077 // specific to 8 bits.
10078 //
10079 // The specific equations were worked out by generating output tables for each
10080 // AArch64CC value in terms of and AddConstant (w1), CompConstant(w2). The
10081 // problem was simplified by working with 4 bit inputs, which means we only
10082 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
10083 // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
10084 // patterns present in both extensions (0,7). For every distinct set of
10085 // AddConstant and CompConstants bit patterns we can consider the masked and
10086 // unmasked versions to be equivalent if the result of this function is true for
10087 // all 16 distinct bit patterns of for the current extension type of Input (w0).
10088 //
10089 //   sub      w8, w0, w1
10090 //   and      w10, w8, #0x0f
10091 //   cmp      w8, w2
10092 //   cset     w9, AArch64CC
10093 //   cmp      w10, w2
10094 //   cset     w11, AArch64CC
10095 //   cmp      w9, w11
10096 //   cset     w0, eq
10097 //   ret
10098 //
10099 // Since the above function shows when the outputs are equivalent it defines
10100 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
10101 // would be expensive to run during compiles. The equations below were written
10102 // in a test harness that confirmed they gave equivalent outputs to the above
10103 // for all inputs function, so they can be used determine if the removal is
10104 // legal instead.
10105 //
10106 // isEquivalentMaskless() is the code for testing if the AND can be removed
10107 // factored out of the DAG recognition as the DAG can take several forms.
10108
10109 static bool isEquivalentMaskless(unsigned CC, unsigned width,
10110                                  ISD::LoadExtType ExtType, int AddConstant,
10111                                  int CompConstant) {
10112   // By being careful about our equations and only writing the in term
10113   // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
10114   // make them generally applicable to all bit widths.
10115   int MaxUInt = (1 << width);
10116
10117   // For the purposes of these comparisons sign extending the type is
10118   // equivalent to zero extending the add and displacing it by half the integer
10119   // width. Provided we are careful and make sure our equations are valid over
10120   // the whole range we can just adjust the input and avoid writing equations
10121   // for sign extended inputs.
10122   if (ExtType == ISD::SEXTLOAD)
10123     AddConstant -= (1 << (width-1));
10124
10125   switch(CC) {
10126   case AArch64CC::LE:
10127   case AArch64CC::GT:
10128     if ((AddConstant == 0) ||
10129         (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
10130         (AddConstant >= 0 && CompConstant < 0) ||
10131         (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
10132       return true;
10133     break;
10134   case AArch64CC::LT:
10135   case AArch64CC::GE:
10136     if ((AddConstant == 0) ||
10137         (AddConstant >= 0 && CompConstant <= 0) ||
10138         (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
10139       return true;
10140     break;
10141   case AArch64CC::HI:
10142   case AArch64CC::LS:
10143     if ((AddConstant >= 0 && CompConstant < 0) ||
10144        (AddConstant <= 0 && CompConstant >= -1 &&
10145         CompConstant < AddConstant + MaxUInt))
10146       return true;
10147    break;
10148   case AArch64CC::PL:
10149   case AArch64CC::MI:
10150     if ((AddConstant == 0) ||
10151         (AddConstant > 0 && CompConstant <= 0) ||
10152         (AddConstant < 0 && CompConstant <= AddConstant))
10153       return true;
10154     break;
10155   case AArch64CC::LO:
10156   case AArch64CC::HS:
10157     if ((AddConstant >= 0 && CompConstant <= 0) ||
10158         (AddConstant <= 0 && CompConstant >= 0 &&
10159          CompConstant <= AddConstant + MaxUInt))
10160       return true;
10161     break;
10162   case AArch64CC::EQ:
10163   case AArch64CC::NE:
10164     if ((AddConstant > 0 && CompConstant < 0) ||
10165         (AddConstant < 0 && CompConstant >= 0 &&
10166          CompConstant < AddConstant + MaxUInt) ||
10167         (AddConstant >= 0 && CompConstant >= 0 &&
10168          CompConstant >= AddConstant) ||
10169         (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
10170       return true;
10171     break;
10172   case AArch64CC::VS:
10173   case AArch64CC::VC:
10174   case AArch64CC::AL:
10175   case AArch64CC::NV:
10176     return true;
10177   case AArch64CC::Invalid:
10178     break;
10179   }
10180
10181   return false;
10182 }
10183
10184 static
10185 SDValue performCONDCombine(SDNode *N,
10186                            TargetLowering::DAGCombinerInfo &DCI,
10187                            SelectionDAG &DAG, unsigned CCIndex,
10188                            unsigned CmpIndex) {
10189   unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
10190   SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
10191   unsigned CondOpcode = SubsNode->getOpcode();
10192
10193   if (CondOpcode != AArch64ISD::SUBS)
10194     return SDValue();
10195
10196   // There is a SUBS feeding this condition. Is it fed by a mask we can
10197   // use?
10198
10199   SDNode *AndNode = SubsNode->getOperand(0).getNode();
10200   unsigned MaskBits = 0;
10201
10202   if (AndNode->getOpcode() != ISD::AND)
10203     return SDValue();
10204
10205   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
10206     uint32_t CNV = CN->getZExtValue();
10207     if (CNV == 255)
10208       MaskBits = 8;
10209     else if (CNV == 65535)
10210       MaskBits = 16;
10211   }
10212
10213   if (!MaskBits)
10214     return SDValue();
10215
10216   SDValue AddValue = AndNode->getOperand(0);
10217
10218   if (AddValue.getOpcode() != ISD::ADD)
10219     return SDValue();
10220
10221   // The basic dag structure is correct, grab the inputs and validate them.
10222
10223   SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
10224   SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
10225   SDValue SubsInputValue = SubsNode->getOperand(1);
10226
10227   // The mask is present and the provenance of all the values is a smaller type,
10228   // lets see if the mask is superfluous.
10229
10230   if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
10231       !isa<ConstantSDNode>(SubsInputValue.getNode()))
10232     return SDValue();
10233
10234   ISD::LoadExtType ExtType;
10235
10236   if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
10237       !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
10238       !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
10239     return SDValue();
10240
10241   if(!isEquivalentMaskless(CC, MaskBits, ExtType,
10242                 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
10243                 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
10244     return SDValue();
10245
10246   // The AND is not necessary, remove it.
10247
10248   SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
10249                                SubsNode->getValueType(1));
10250   SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
10251
10252   SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
10253   DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
10254
10255   return SDValue(N, 0);
10256 }
10257
10258 // Optimize compare with zero and branch.
10259 static SDValue performBRCONDCombine(SDNode *N,
10260                                     TargetLowering::DAGCombinerInfo &DCI,
10261                                     SelectionDAG &DAG) {
10262   if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
10263     N = NV.getNode();
10264   SDValue Chain = N->getOperand(0);
10265   SDValue Dest = N->getOperand(1);
10266   SDValue CCVal = N->getOperand(2);
10267   SDValue Cmp = N->getOperand(3);
10268
10269   assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
10270   unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
10271   if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
10272     return SDValue();
10273
10274   unsigned CmpOpc = Cmp.getOpcode();
10275   if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
10276     return SDValue();
10277
10278   // Only attempt folding if there is only one use of the flag and no use of the
10279   // value.
10280   if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
10281     return SDValue();
10282
10283   SDValue LHS = Cmp.getOperand(0);
10284   SDValue RHS = Cmp.getOperand(1);
10285
10286   assert(LHS.getValueType() == RHS.getValueType() &&
10287          "Expected the value type to be the same for both operands!");
10288   if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
10289     return SDValue();
10290
10291   if (isNullConstant(LHS))
10292     std::swap(LHS, RHS);
10293
10294   if (!isNullConstant(RHS))
10295     return SDValue();
10296
10297   if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
10298       LHS.getOpcode() == ISD::SRL)
10299     return SDValue();
10300
10301   // Fold the compare into the branch instruction.
10302   SDValue BR;
10303   if (CC == AArch64CC::EQ)
10304     BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
10305   else
10306     BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
10307
10308   // Do not add new nodes to DAG combiner worklist.
10309   DCI.CombineTo(N, BR, false);
10310
10311   return SDValue();
10312 }
10313
10314 // Optimize some simple tbz/tbnz cases.  Returns the new operand and bit to test
10315 // as well as whether the test should be inverted.  This code is required to
10316 // catch these cases (as opposed to standard dag combines) because
10317 // AArch64ISD::TBZ is matched during legalization.
10318 static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
10319                                  SelectionDAG &DAG) {
10320
10321   if (!Op->hasOneUse())
10322     return Op;
10323
10324   // We don't handle undef/constant-fold cases below, as they should have
10325   // already been taken care of (e.g. and of 0, test of undefined shifted bits,
10326   // etc.)
10327
10328   // (tbz (trunc x), b) -> (tbz x, b)
10329   // This case is just here to enable more of the below cases to be caught.
10330   if (Op->getOpcode() == ISD::TRUNCATE &&
10331       Bit < Op->getValueType(0).getSizeInBits()) {
10332     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
10333   }
10334
10335   if (Op->getNumOperands() != 2)
10336     return Op;
10337
10338   auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
10339   if (!C)
10340     return Op;
10341
10342   switch (Op->getOpcode()) {
10343   default:
10344     return Op;
10345
10346   // (tbz (and x, m), b) -> (tbz x, b)
10347   case ISD::AND:
10348     if ((C->getZExtValue() >> Bit) & 1)
10349       return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
10350     return Op;
10351
10352   // (tbz (shl x, c), b) -> (tbz x, b-c)
10353   case ISD::SHL:
10354     if (C->getZExtValue() <= Bit &&
10355         (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
10356       Bit = Bit - C->getZExtValue();
10357       return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
10358     }
10359     return Op;
10360
10361   // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
10362   case ISD::SRA:
10363     Bit = Bit + C->getZExtValue();
10364     if (Bit >= Op->getValueType(0).getSizeInBits())
10365       Bit = Op->getValueType(0).getSizeInBits() - 1;
10366     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
10367
10368   // (tbz (srl x, c), b) -> (tbz x, b+c)
10369   case ISD::SRL:
10370     if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
10371       Bit = Bit + C->getZExtValue();
10372       return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
10373     }
10374     return Op;
10375
10376   // (tbz (xor x, -1), b) -> (tbnz x, b)
10377   case ISD::XOR:
10378     if ((C->getZExtValue() >> Bit) & 1)
10379       Invert = !Invert;
10380     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
10381   }
10382 }
10383
10384 // Optimize test single bit zero/non-zero and branch.
10385 static SDValue performTBZCombine(SDNode *N,
10386                                  TargetLowering::DAGCombinerInfo &DCI,
10387                                  SelectionDAG &DAG) {
10388   unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
10389   bool Invert = false;
10390   SDValue TestSrc = N->getOperand(1);
10391   SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
10392
10393   if (TestSrc == NewTestSrc)
10394     return SDValue();
10395
10396   unsigned NewOpc = N->getOpcode();
10397   if (Invert) {
10398     if (NewOpc == AArch64ISD::TBZ)
10399       NewOpc = AArch64ISD::TBNZ;
10400     else {
10401       assert(NewOpc == AArch64ISD::TBNZ);
10402       NewOpc = AArch64ISD::TBZ;
10403     }
10404   }
10405
10406   SDLoc DL(N);
10407   return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
10408                      DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
10409 }
10410
10411 // vselect (v1i1 setcc) ->
10412 //     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
10413 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
10414 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
10415 // such VSELECT.
10416 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
10417   SDValue N0 = N->getOperand(0);
10418   EVT CCVT = N0.getValueType();
10419
10420   if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
10421       CCVT.getVectorElementType() != MVT::i1)
10422     return SDValue();
10423
10424   EVT ResVT = N->getValueType(0);
10425   EVT CmpVT = N0.getOperand(0).getValueType();
10426   // Only combine when the result type is of the same size as the compared
10427   // operands.
10428   if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
10429     return SDValue();
10430
10431   SDValue IfTrue = N->getOperand(1);
10432   SDValue IfFalse = N->getOperand(2);
10433   SDValue SetCC =
10434       DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
10435                    N0.getOperand(0), N0.getOperand(1),
10436                    cast<CondCodeSDNode>(N0.getOperand(2))->get());
10437   return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
10438                      IfTrue, IfFalse);
10439 }
10440
10441 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
10442 /// the compare-mask instructions rather than going via NZCV, even if LHS and
10443 /// RHS are really scalar. This replaces any scalar setcc in the above pattern
10444 /// with a vector one followed by a DUP shuffle on the result.
10445 static SDValue performSelectCombine(SDNode *N,
10446                                     TargetLowering::DAGCombinerInfo &DCI) {
10447   SelectionDAG &DAG = DCI.DAG;
10448   SDValue N0 = N->getOperand(0);
10449   EVT ResVT = N->getValueType(0);
10450
10451   if (N0.getOpcode() != ISD::SETCC)
10452     return SDValue();
10453
10454   // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
10455   // scalar SetCCResultType. We also don't expect vectors, because we assume
10456   // that selects fed by vector SETCCs are canonicalized to VSELECT.
10457   assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
10458          "Scalar-SETCC feeding SELECT has unexpected result type!");
10459
10460   // If NumMaskElts == 0, the comparison is larger than select result. The
10461   // largest real NEON comparison is 64-bits per lane, which means the result is
10462   // at most 32-bits and an illegal vector. Just bail out for now.
10463   EVT SrcVT = N0.getOperand(0).getValueType();
10464
10465   // Don't try to do this optimization when the setcc itself has i1 operands.
10466   // There are no legal vectors of i1, so this would be pointless.
10467   if (SrcVT == MVT::i1)
10468     return SDValue();
10469
10470   int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
10471   if (!ResVT.isVector() || NumMaskElts == 0)
10472     return SDValue();
10473
10474   SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
10475   EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
10476
10477   // Also bail out if the vector CCVT isn't the same size as ResVT.
10478   // This can happen if the SETCC operand size doesn't divide the ResVT size
10479   // (e.g., f64 vs v3f32).
10480   if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
10481     return SDValue();
10482
10483   // Make sure we didn't create illegal types, if we're not supposed to.
10484   assert(DCI.isBeforeLegalize() ||
10485          DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
10486
10487   // First perform a vector comparison, where lane 0 is the one we're interested
10488   // in.
10489   SDLoc DL(N0);
10490   SDValue LHS =
10491       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
10492   SDValue RHS =
10493       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
10494   SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
10495
10496   // Now duplicate the comparison mask we want across all other lanes.
10497   SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
10498   SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
10499   Mask = DAG.getNode(ISD::BITCAST, DL,
10500                      ResVT.changeVectorElementTypeToInteger(), Mask);
10501
10502   return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
10503 }
10504
10505 /// Get rid of unnecessary NVCASTs (that don't change the type).
10506 static SDValue performNVCASTCombine(SDNode *N) {
10507   if (N->getValueType(0) == N->getOperand(0).getValueType())
10508     return N->getOperand(0);
10509
10510   return SDValue();
10511 }
10512
10513 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
10514                                                  DAGCombinerInfo &DCI) const {
10515   SelectionDAG &DAG = DCI.DAG;
10516   switch (N->getOpcode()) {
10517   default:
10518     DEBUG(dbgs() << "Custom combining: skipping\n");
10519     break;
10520   case ISD::ADD:
10521   case ISD::SUB:
10522     return performAddSubLongCombine(N, DCI, DAG);
10523   case ISD::XOR:
10524     return performXorCombine(N, DAG, DCI, Subtarget);
10525   case ISD::MUL:
10526     return performMulCombine(N, DAG, DCI, Subtarget);
10527   case ISD::SINT_TO_FP:
10528   case ISD::UINT_TO_FP:
10529     return performIntToFpCombine(N, DAG, Subtarget);
10530   case ISD::FP_TO_SINT:
10531   case ISD::FP_TO_UINT:
10532     return performFpToIntCombine(N, DAG, DCI, Subtarget);
10533   case ISD::FDIV:
10534     return performFDivCombine(N, DAG, DCI, Subtarget);
10535   case ISD::OR:
10536     return performORCombine(N, DCI, Subtarget);
10537   case ISD::SRL:
10538     return performSRLCombine(N, DCI);
10539   case ISD::INTRINSIC_WO_CHAIN:
10540     return performIntrinsicCombine(N, DCI, Subtarget);
10541   case ISD::ANY_EXTEND:
10542   case ISD::ZERO_EXTEND:
10543   case ISD::SIGN_EXTEND:
10544     return performExtendCombine(N, DCI, DAG);
10545   case ISD::BITCAST:
10546     return performBitcastCombine(N, DCI, DAG);
10547   case ISD::CONCAT_VECTORS:
10548     return performConcatVectorsCombine(N, DCI, DAG);
10549   case ISD::SELECT:
10550     return performSelectCombine(N, DCI);
10551   case ISD::VSELECT:
10552     return performVSelectCombine(N, DCI.DAG);
10553   case ISD::LOAD:
10554     if (performTBISimplification(N->getOperand(1), DCI, DAG))
10555       return SDValue(N, 0);
10556     break;
10557   case ISD::STORE:
10558     return performSTORECombine(N, DCI, DAG, Subtarget);
10559   case AArch64ISD::BRCOND:
10560     return performBRCONDCombine(N, DCI, DAG);
10561   case AArch64ISD::TBNZ:
10562   case AArch64ISD::TBZ:
10563     return performTBZCombine(N, DCI, DAG);
10564   case AArch64ISD::CSEL:
10565     return performCONDCombine(N, DCI, DAG, 2, 3);
10566   case AArch64ISD::DUP:
10567     return performPostLD1Combine(N, DCI, false);
10568   case AArch64ISD::NVCAST:
10569     return performNVCASTCombine(N);
10570   case ISD::INSERT_VECTOR_ELT:
10571     return performPostLD1Combine(N, DCI, true);
10572   case ISD::INTRINSIC_VOID:
10573   case ISD::INTRINSIC_W_CHAIN:
10574     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
10575     case Intrinsic::aarch64_neon_ld2:
10576     case Intrinsic::aarch64_neon_ld3:
10577     case Intrinsic::aarch64_neon_ld4:
10578     case Intrinsic::aarch64_neon_ld1x2:
10579     case Intrinsic::aarch64_neon_ld1x3:
10580     case Intrinsic::aarch64_neon_ld1x4:
10581     case Intrinsic::aarch64_neon_ld2lane:
10582     case Intrinsic::aarch64_neon_ld3lane:
10583     case Intrinsic::aarch64_neon_ld4lane:
10584     case Intrinsic::aarch64_neon_ld2r:
10585     case Intrinsic::aarch64_neon_ld3r:
10586     case Intrinsic::aarch64_neon_ld4r:
10587     case Intrinsic::aarch64_neon_st2:
10588     case Intrinsic::aarch64_neon_st3:
10589     case Intrinsic::aarch64_neon_st4:
10590     case Intrinsic::aarch64_neon_st1x2:
10591     case Intrinsic::aarch64_neon_st1x3:
10592     case Intrinsic::aarch64_neon_st1x4:
10593     case Intrinsic::aarch64_neon_st2lane:
10594     case Intrinsic::aarch64_neon_st3lane:
10595     case Intrinsic::aarch64_neon_st4lane:
10596       return performNEONPostLDSTCombine(N, DCI, DAG);
10597     default:
10598       break;
10599     }
10600   }
10601   return SDValue();
10602 }
10603
10604 // Check if the return value is used as only a return value, as otherwise
10605 // we can't perform a tail-call. In particular, we need to check for
10606 // target ISD nodes that are returns and any other "odd" constructs
10607 // that the generic analysis code won't necessarily catch.
10608 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
10609                                                SDValue &Chain) const {
10610   if (N->getNumValues() != 1)
10611     return false;
10612   if (!N->hasNUsesOfValue(1, 0))
10613     return false;
10614
10615   SDValue TCChain = Chain;
10616   SDNode *Copy = *N->use_begin();
10617   if (Copy->getOpcode() == ISD::CopyToReg) {
10618     // If the copy has a glue operand, we conservatively assume it isn't safe to
10619     // perform a tail call.
10620     if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
10621         MVT::Glue)
10622       return false;
10623     TCChain = Copy->getOperand(0);
10624   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
10625     return false;
10626
10627   bool HasRet = false;
10628   for (SDNode *Node : Copy->uses()) {
10629     if (Node->getOpcode() != AArch64ISD::RET_FLAG)
10630       return false;
10631     HasRet = true;
10632   }
10633
10634   if (!HasRet)
10635     return false;
10636
10637   Chain = TCChain;
10638   return true;
10639 }
10640
10641 // Return whether the an instruction can potentially be optimized to a tail
10642 // call. This will cause the optimizers to attempt to move, or duplicate,
10643 // return instructions to help enable tail call optimizations for this
10644 // instruction.
10645 bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
10646   return CI->isTailCall();
10647 }
10648
10649 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
10650                                                    SDValue &Offset,
10651                                                    ISD::MemIndexedMode &AM,
10652                                                    bool &IsInc,
10653                                                    SelectionDAG &DAG) const {
10654   if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
10655     return false;
10656
10657   Base = Op->getOperand(0);
10658   // All of the indexed addressing mode instructions take a signed
10659   // 9 bit immediate offset.
10660   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
10661     int64_t RHSC = RHS->getSExtValue();
10662     if (Op->getOpcode() == ISD::SUB)
10663       RHSC = -(uint64_t)RHSC;
10664     if (!isInt<9>(RHSC))
10665       return false;
10666     IsInc = (Op->getOpcode() == ISD::ADD);
10667     Offset = Op->getOperand(1);
10668     return true;
10669   }
10670   return false;
10671 }
10672
10673 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
10674                                                       SDValue &Offset,
10675                                                       ISD::MemIndexedMode &AM,
10676                                                       SelectionDAG &DAG) const {
10677   EVT VT;
10678   SDValue Ptr;
10679   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
10680     VT = LD->getMemoryVT();
10681     Ptr = LD->getBasePtr();
10682   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
10683     VT = ST->getMemoryVT();
10684     Ptr = ST->getBasePtr();
10685   } else
10686     return false;
10687
10688   bool IsInc;
10689   if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
10690     return false;
10691   AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
10692   return true;
10693 }
10694
10695 bool AArch64TargetLowering::getPostIndexedAddressParts(
10696     SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
10697     ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
10698   EVT VT;
10699   SDValue Ptr;
10700   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
10701     VT = LD->getMemoryVT();
10702     Ptr = LD->getBasePtr();
10703   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
10704     VT = ST->getMemoryVT();
10705     Ptr = ST->getBasePtr();
10706   } else
10707     return false;
10708
10709   bool IsInc;
10710   if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
10711     return false;
10712   // Post-indexing updates the base, so it's not a valid transform
10713   // if that's not the same as the load's pointer.
10714   if (Ptr != Base)
10715     return false;
10716   AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
10717   return true;
10718 }
10719
10720 static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
10721                                   SelectionDAG &DAG) {
10722   SDLoc DL(N);
10723   SDValue Op = N->getOperand(0);
10724
10725   if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
10726     return;
10727
10728   Op = SDValue(
10729       DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
10730                          DAG.getUNDEF(MVT::i32), Op,
10731                          DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
10732       0);
10733   Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
10734   Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
10735 }
10736
10737 static void ReplaceReductionResults(SDNode *N,
10738                                     SmallVectorImpl<SDValue> &Results,
10739                                     SelectionDAG &DAG, unsigned InterOp,
10740                                     unsigned AcrossOp) {
10741   EVT LoVT, HiVT;
10742   SDValue Lo, Hi;
10743   SDLoc dl(N);
10744   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
10745   std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
10746   SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
10747   SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
10748   Results.push_back(SplitVal);
10749 }
10750
10751 static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
10752   SDLoc DL(N);
10753   SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
10754   SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
10755                            DAG.getNode(ISD::SRL, DL, MVT::i128, N,
10756                                        DAG.getConstant(64, DL, MVT::i64)));
10757   return std::make_pair(Lo, Hi);
10758 }
10759
10760 // Create an even/odd pair of X registers holding integer value V.
10761 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
10762   SDLoc dl(V.getNode());
10763   SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
10764   SDValue VHi = DAG.getAnyExtOrTrunc(
10765       DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
10766       dl, MVT::i64);
10767   if (DAG.getDataLayout().isBigEndian())
10768     std::swap (VLo, VHi);
10769   SDValue RegClass =
10770       DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
10771   SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
10772   SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
10773   const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
10774   return SDValue(
10775       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10776 }
10777
10778 static void ReplaceCMP_SWAP_128Results(SDNode *N,
10779                                        SmallVectorImpl<SDValue> &Results,
10780                                        SelectionDAG &DAG,
10781                                        const AArch64Subtarget *Subtarget) {
10782   assert(N->getValueType(0) == MVT::i128 &&
10783          "AtomicCmpSwap on types less than 128 should be legal");
10784
10785   if (Subtarget->hasLSE()) {
10786     // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
10787     // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
10788     SDValue Ops[] = {
10789         createGPRPairNode(DAG, N->getOperand(2)), // Compare value
10790         createGPRPairNode(DAG, N->getOperand(3)), // Store value
10791         N->getOperand(1), // Ptr
10792         N->getOperand(0), // Chain in
10793     };
10794
10795     MachineFunction &MF = DAG.getMachineFunction();
10796     MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
10797     MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
10798
10799     unsigned Opcode;
10800     switch (MemOp[0]->getOrdering()) {
10801     case AtomicOrdering::Monotonic:
10802       Opcode = AArch64::CASPX;
10803       break;
10804     case AtomicOrdering::Acquire:
10805       Opcode = AArch64::CASPAX;
10806       break;
10807     case AtomicOrdering::Release:
10808       Opcode = AArch64::CASPLX;
10809       break;
10810     case AtomicOrdering::AcquireRelease:
10811     case AtomicOrdering::SequentiallyConsistent:
10812       Opcode = AArch64::CASPALX;
10813       break;
10814     default:
10815       llvm_unreachable("Unexpected ordering!");
10816     }
10817
10818     MachineSDNode *CmpSwap = DAG.getMachineNode(
10819         Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
10820     CmpSwap->setMemRefs(MemOp, MemOp + 1);
10821
10822     unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
10823     if (DAG.getDataLayout().isBigEndian())
10824       std::swap(SubReg1, SubReg2);
10825     Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
10826                                                  SDValue(CmpSwap, 0)));
10827     Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
10828                                                  SDValue(CmpSwap, 0)));
10829     Results.push_back(SDValue(CmpSwap, 1)); // Chain out
10830     return;
10831   }
10832
10833   auto Desired = splitInt128(N->getOperand(2), DAG);
10834   auto New = splitInt128(N->getOperand(3), DAG);
10835   SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
10836                    New.first,        New.second,    N->getOperand(0)};
10837   SDNode *CmpSwap = DAG.getMachineNode(
10838       AArch64::CMP_SWAP_128, SDLoc(N),
10839       DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
10840
10841   MachineFunction &MF = DAG.getMachineFunction();
10842   MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
10843   MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
10844   cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
10845
10846   Results.push_back(SDValue(CmpSwap, 0));
10847   Results.push_back(SDValue(CmpSwap, 1));
10848   Results.push_back(SDValue(CmpSwap, 3));
10849 }
10850
10851 void AArch64TargetLowering::ReplaceNodeResults(
10852     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
10853   switch (N->getOpcode()) {
10854   default:
10855     llvm_unreachable("Don't know how to custom expand this");
10856   case ISD::BITCAST:
10857     ReplaceBITCASTResults(N, Results, DAG);
10858     return;
10859   case ISD::VECREDUCE_ADD:
10860   case ISD::VECREDUCE_SMAX:
10861   case ISD::VECREDUCE_SMIN:
10862   case ISD::VECREDUCE_UMAX:
10863   case ISD::VECREDUCE_UMIN:
10864     Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
10865     return;
10866
10867   case AArch64ISD::SADDV:
10868     ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
10869     return;
10870   case AArch64ISD::UADDV:
10871     ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
10872     return;
10873   case AArch64ISD::SMINV:
10874     ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
10875     return;
10876   case AArch64ISD::UMINV:
10877     ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
10878     return;
10879   case AArch64ISD::SMAXV:
10880     ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
10881     return;
10882   case AArch64ISD::UMAXV:
10883     ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
10884     return;
10885   case ISD::FP_TO_UINT:
10886   case ISD::FP_TO_SINT:
10887     assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
10888     // Let normal code take care of it by not adding anything to Results.
10889     return;
10890   case ISD::ATOMIC_CMP_SWAP:
10891     ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
10892     return;
10893   }
10894 }
10895
10896 bool AArch64TargetLowering::useLoadStackGuardNode() const {
10897   if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
10898     return TargetLowering::useLoadStackGuardNode();
10899   return true;
10900 }
10901
10902 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
10903   // Combine multiple FDIVs with the same divisor into multiple FMULs by the
10904   // reciprocal if there are three or more FDIVs.
10905   return 3;
10906 }
10907
10908 TargetLoweringBase::LegalizeTypeAction
10909 AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
10910   MVT SVT = VT.getSimpleVT();
10911   // During type legalization, we prefer to widen v1i8, v1i16, v1i32  to v8i8,
10912   // v4i16, v2i32 instead of to promote.
10913   if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32
10914       || SVT == MVT::v1f32)
10915     return TypeWidenVector;
10916
10917   return TargetLoweringBase::getPreferredVectorAction(VT);
10918 }
10919
10920 // Loads and stores less than 128-bits are already atomic; ones above that
10921 // are doomed anyway, so defer to the default libcall and blame the OS when
10922 // things go wrong.
10923 bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
10924   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
10925   return Size == 128;
10926 }
10927
10928 // Loads and stores less than 128-bits are already atomic; ones above that
10929 // are doomed anyway, so defer to the default libcall and blame the OS when
10930 // things go wrong.
10931 TargetLowering::AtomicExpansionKind
10932 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
10933   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
10934   return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
10935 }
10936
10937 // For the real atomic operations, we have ldxr/stxr up to 128 bits,
10938 TargetLowering::AtomicExpansionKind
10939 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
10940   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
10941   if (Size > 128) return AtomicExpansionKind::None;
10942   // Nand not supported in LSE.
10943   if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC;
10944   // Leave 128 bits to LLSC.
10945   return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC;
10946 }
10947
10948 bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
10949     AtomicCmpXchgInst *AI) const {
10950   // If subtarget has LSE, leave cmpxchg intact for codegen.
10951   if (Subtarget->hasLSE()) return false;
10952   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
10953   // implement cmpxchg without spilling. If the address being exchanged is also
10954   // on the stack and close enough to the spill slot, this can lead to a
10955   // situation where the monitor always gets cleared and the atomic operation
10956   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
10957   return getTargetMachine().getOptLevel() != 0;
10958 }
10959
10960 Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
10961                                              AtomicOrdering Ord) const {
10962   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
10963   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
10964   bool IsAcquire = isAcquireOrStronger(Ord);
10965
10966   // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
10967   // intrinsic must return {i64, i64} and we have to recombine them into a
10968   // single i128 here.
10969   if (ValTy->getPrimitiveSizeInBits() == 128) {
10970     Intrinsic::ID Int =
10971         IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
10972     Function *Ldxr = Intrinsic::getDeclaration(M, Int);
10973
10974     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
10975     Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
10976
10977     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
10978     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
10979     Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
10980     Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
10981     return Builder.CreateOr(
10982         Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
10983   }
10984
10985   Type *Tys[] = { Addr->getType() };
10986   Intrinsic::ID Int =
10987       IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
10988   Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
10989
10990   return Builder.CreateTruncOrBitCast(
10991       Builder.CreateCall(Ldxr, Addr),
10992       cast<PointerType>(Addr->getType())->getElementType());
10993 }
10994
10995 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
10996     IRBuilder<> &Builder) const {
10997   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
10998   Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
10999 }
11000
11001 Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
11002                                                    Value *Val, Value *Addr,
11003                                                    AtomicOrdering Ord) const {
11004   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
11005   bool IsRelease = isReleaseOrStronger(Ord);
11006
11007   // Since the intrinsics must have legal type, the i128 intrinsics take two
11008   // parameters: "i64, i64". We must marshal Val into the appropriate form
11009   // before the call.
11010   if (Val->getType()->getPrimitiveSizeInBits() == 128) {
11011     Intrinsic::ID Int =
11012         IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
11013     Function *Stxr = Intrinsic::getDeclaration(M, Int);
11014     Type *Int64Ty = Type::getInt64Ty(M->getContext());
11015
11016     Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
11017     Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
11018     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
11019     return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
11020   }
11021
11022   Intrinsic::ID Int =
11023       IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
11024   Type *Tys[] = { Addr->getType() };
11025   Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
11026
11027   return Builder.CreateCall(Stxr,
11028                             {Builder.CreateZExtOrBitCast(
11029                                  Val, Stxr->getFunctionType()->getParamType(0)),
11030                              Addr});
11031 }
11032
11033 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
11034     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
11035   return Ty->isArrayTy();
11036 }
11037
11038 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
11039                                                             EVT) const {
11040   return false;
11041 }
11042
11043 static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
11044   Module *M = IRB.GetInsertBlock()->getParent()->getParent();
11045   Function *ThreadPointerFunc =
11046       Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
11047   return IRB.CreatePointerCast(
11048       IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), Offset),
11049       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
11050 }
11051
11052 Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
11053   // Android provides a fixed TLS slot for the stack cookie. See the definition
11054   // of TLS_SLOT_STACK_GUARD in
11055   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
11056   if (Subtarget->isTargetAndroid())
11057     return UseTlsOffset(IRB, 0x28);
11058
11059   // Fuchsia is similar.
11060   // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
11061   if (Subtarget->isTargetFuchsia())
11062     return UseTlsOffset(IRB, -0x10);
11063
11064   return TargetLowering::getIRStackGuard(IRB);
11065 }
11066
11067 Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
11068   // Android provides a fixed TLS slot for the SafeStack pointer. See the
11069   // definition of TLS_SLOT_SAFESTACK in
11070   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
11071   if (Subtarget->isTargetAndroid())
11072     return UseTlsOffset(IRB, 0x48);
11073
11074   // Fuchsia is similar.
11075   // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
11076   if (Subtarget->isTargetFuchsia())
11077     return UseTlsOffset(IRB, -0x8);
11078
11079   return TargetLowering::getSafeStackPointerLocation(IRB);
11080 }
11081
11082 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
11083     const Instruction &AndI) const {
11084   // Only sink 'and' mask to cmp use block if it is masking a single bit, since
11085   // this is likely to be fold the and/cmp/br into a single tbz instruction.  It
11086   // may be beneficial to sink in other cases, but we would have to check that
11087   // the cmp would not get folded into the br to form a cbz for these to be
11088   // beneficial.
11089   ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
11090   if (!Mask)
11091     return false;
11092   return Mask->getValue().isPowerOf2();
11093 }
11094
11095 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
11096   // Update IsSplitCSR in AArch64unctionInfo.
11097   AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
11098   AFI->setIsSplitCSR(true);
11099 }
11100
11101 void AArch64TargetLowering::insertCopiesSplitCSR(
11102     MachineBasicBlock *Entry,
11103     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
11104   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
11105   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
11106   if (!IStart)
11107     return;
11108
11109   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11110   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
11111   MachineBasicBlock::iterator MBBI = Entry->begin();
11112   for (const MCPhysReg *I = IStart; *I; ++I) {
11113     const TargetRegisterClass *RC = nullptr;
11114     if (AArch64::GPR64RegClass.contains(*I))
11115       RC = &AArch64::GPR64RegClass;
11116     else if (AArch64::FPR64RegClass.contains(*I))
11117       RC = &AArch64::FPR64RegClass;
11118     else
11119       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
11120
11121     unsigned NewVR = MRI->createVirtualRegister(RC);
11122     // Create copy from CSR to a virtual register.
11123     // FIXME: this currently does not emit CFI pseudo-instructions, it works
11124     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
11125     // nounwind. If we want to generalize this later, we may need to emit
11126     // CFI pseudo-instructions.
11127     assert(Entry->getParent()->getFunction().hasFnAttribute(
11128                Attribute::NoUnwind) &&
11129            "Function should be nounwind in insertCopiesSplitCSR!");
11130     Entry->addLiveIn(*I);
11131     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
11132         .addReg(*I);
11133
11134     // Insert the copy-back instructions right before the terminator.
11135     for (auto *Exit : Exits)
11136       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
11137               TII->get(TargetOpcode::COPY), *I)
11138           .addReg(NewVR);
11139   }
11140 }
11141
11142 bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
11143   // Integer division on AArch64 is expensive. However, when aggressively
11144   // optimizing for code size, we prefer to use a div instruction, as it is
11145   // usually smaller than the alternative sequence.
11146   // The exception to this is vector division. Since AArch64 doesn't have vector
11147   // integer division, leaving the division as-is is a loss even in terms of
11148   // size, because it will have to be scalarized, while the alternative code
11149   // sequence can be performed in vector form.
11150   bool OptSize =
11151       Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
11152   return OptSize && !VT.isVector();
11153 }
11154
11155 bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
11156   return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
11157 }
11158
11159 unsigned
11160 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
11161   if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
11162     return getPointerTy(DL).getSizeInBits();
11163
11164   return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
11165 }
11166
11167 void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
11168   MF.getFrameInfo().computeMaxCallFrameSize(MF);
11169   TargetLoweringBase::finalizeLowering(MF);
11170 }