[Arm64EC] Refer to dllimport'ed functions correctly.
[llvm-project.git] / llvm / lib / Target / AArch64 / AArch64ISelLowering.cpp
blob: 943a489f0fa280b4f5a230f935a4e02ccb2056ce
1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
11 //===----------------------------------------------------------------------===//
13 #include "AArch64ISelLowering.h"
14 #include "AArch64CallingConvention.h"
15 #include "AArch64ExpandImm.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "MCTargetDesc/AArch64AddressingModes.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/Statistic.h"
29 #include "llvm/ADT/StringRef.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
32 #include "llvm/Analysis/LoopInfo.h"
33 #include "llvm/Analysis/MemoryLocation.h"
34 #include "llvm/Analysis/ObjCARCUtil.h"
35 #include "llvm/Analysis/TargetTransformInfo.h"
36 #include "llvm/Analysis/VectorUtils.h"
37 #include "llvm/CodeGen/Analysis.h"
38 #include "llvm/CodeGen/CallingConvLower.h"
39 #include "llvm/CodeGen/ISDOpcodes.h"
40 #include "llvm/CodeGen/MachineBasicBlock.h"
41 #include "llvm/CodeGen/MachineFrameInfo.h"
42 #include "llvm/CodeGen/MachineFunction.h"
43 #include "llvm/CodeGen/MachineInstr.h"
44 #include "llvm/CodeGen/MachineInstrBuilder.h"
45 #include "llvm/CodeGen/MachineMemOperand.h"
46 #include "llvm/CodeGen/MachineRegisterInfo.h"
47 #include "llvm/CodeGen/RuntimeLibcalls.h"
48 #include "llvm/CodeGen/SelectionDAG.h"
49 #include "llvm/CodeGen/SelectionDAGNodes.h"
50 #include "llvm/CodeGen/TargetCallingConv.h"
51 #include "llvm/CodeGen/TargetInstrInfo.h"
52 #include "llvm/CodeGen/ValueTypes.h"
53 #include "llvm/IR/Attributes.h"
54 #include "llvm/IR/Constants.h"
55 #include "llvm/IR/DataLayout.h"
56 #include "llvm/IR/DebugLoc.h"
57 #include "llvm/IR/DerivedTypes.h"
58 #include "llvm/IR/Function.h"
59 #include "llvm/IR/GetElementPtrTypeIterator.h"
60 #include "llvm/IR/GlobalValue.h"
61 #include "llvm/IR/IRBuilder.h"
62 #include "llvm/IR/Instruction.h"
63 #include "llvm/IR/Instructions.h"
64 #include "llvm/IR/IntrinsicInst.h"
65 #include "llvm/IR/Intrinsics.h"
66 #include "llvm/IR/IntrinsicsAArch64.h"
67 #include "llvm/IR/Module.h"
68 #include "llvm/IR/OperandTraits.h"
69 #include "llvm/IR/PatternMatch.h"
70 #include "llvm/IR/Type.h"
71 #include "llvm/IR/Use.h"
72 #include "llvm/IR/Value.h"
73 #include "llvm/MC/MCRegisterInfo.h"
74 #include "llvm/Support/Casting.h"
75 #include "llvm/Support/CodeGen.h"
76 #include "llvm/Support/CommandLine.h"
77 #include "llvm/Support/Compiler.h"
78 #include "llvm/Support/Debug.h"
79 #include "llvm/Support/ErrorHandling.h"
80 #include "llvm/Support/InstructionCost.h"
81 #include "llvm/Support/KnownBits.h"
82 #include "llvm/Support/MachineValueType.h"
83 #include "llvm/Support/MathExtras.h"
84 #include "llvm/Support/raw_ostream.h"
85 #include "llvm/Target/TargetMachine.h"
86 #include "llvm/Target/TargetOptions.h"
87 #include <algorithm>
88 #include <bitset>
89 #include <cassert>
90 #include <cctype>
91 #include <cstdint>
92 #include <cstdlib>
93 #include <iterator>
94 #include <limits>
95 #include <tuple>
96 #include <utility>
97 #include <vector>
99 using namespace llvm;
100 using namespace llvm::PatternMatch;
102 #define DEBUG_TYPE "aarch64-lower"
104 STATISTIC(NumTailCalls, "Number of tail calls");
105 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
106 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
108 // FIXME: The necessary dtprel relocations don't seem to be supported
109 // well in the GNU bfd and gold linkers at the moment. Therefore, by
110 // default, for now, fall back to GeneralDynamic code generation.
111 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
112 "aarch64-elf-ldtls-generation", cl::Hidden,
113 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
114 cl::init(false));
116 static cl::opt<bool>
117 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
118 cl::desc("Enable AArch64 logical imm instruction "
119 "optimization"),
120 cl::init(true));
122 // Temporary option added for the purpose of testing functionality added
123 // to DAGCombiner.cpp in D92230. It is expected that this can be removed
124 // in the future, once both implementations are based on MGATHER rather
125 // than the GLD1 nodes added for the SVE gather load intrinsics.
126 static cl::opt<bool>
127 EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
128 cl::desc("Combine extends of AArch64 masked "
129 "gather intrinsics"),
130 cl::init(true));
132 /// Value type used for condition codes.
133 static const MVT MVT_CC = MVT::i32;
135 static inline EVT getPackedSVEVectorVT(EVT VT) {
136 switch (VT.getSimpleVT().SimpleTy) {
137 default:
138 llvm_unreachable("unexpected element type for vector");
139 case MVT::i8:
140 return MVT::nxv16i8;
141 case MVT::i16:
142 return MVT::nxv8i16;
143 case MVT::i32:
144 return MVT::nxv4i32;
145 case MVT::i64:
146 return MVT::nxv2i64;
147 case MVT::f16:
148 return MVT::nxv8f16;
149 case MVT::f32:
150 return MVT::nxv4f32;
151 case MVT::f64:
152 return MVT::nxv2f64;
153 case MVT::bf16:
154 return MVT::nxv8bf16;
158 // NOTE: Currently there's only a need to return integer vector types. If this
159 // changes then just add an extra "type" parameter.
160 static inline EVT getPackedSVEVectorVT(ElementCount EC) {
161 switch (EC.getKnownMinValue()) {
162 default:
163 llvm_unreachable("unexpected element count for vector");
164 case 16:
165 return MVT::nxv16i8;
166 case 8:
167 return MVT::nxv8i16;
168 case 4:
169 return MVT::nxv4i32;
170 case 2:
171 return MVT::nxv2i64;
175 static inline EVT getPromotedVTForPredicate(EVT VT) {
176 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
177 "Expected scalable predicate vector type!");
178 switch (VT.getVectorMinNumElements()) {
179 default:
180 llvm_unreachable("unexpected element count for vector");
181 case 2:
182 return MVT::nxv2i64;
183 case 4:
184 return MVT::nxv4i32;
185 case 8:
186 return MVT::nxv8i16;
187 case 16:
188 return MVT::nxv16i8;
192 /// Returns true if VT's elements occupy the lowest bit positions of its
193 /// associated register class without any intervening space.
195 /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
196 /// same register class, but only nxv8f16 can be treated as a packed vector.
197 static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
198 assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
199 "Expected legal vector type!");
200 return VT.isFixedLengthVector() ||
201 VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
204 // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
205 // predicate and end with a passthru value matching the result type.
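// For example, AArch64ISD::FNEG_MERGE_PASSTHRU takes (Pg, Op, Passthru): active
// lanes produce fneg(Op) and inactive lanes are taken unchanged from Passthru.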
206 static bool isMergePassthruOpcode(unsigned Opc) {
207 switch (Opc) {
208 default:
209 return false;
210 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
211 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
212 case AArch64ISD::REVH_MERGE_PASSTHRU:
213 case AArch64ISD::REVW_MERGE_PASSTHRU:
214 case AArch64ISD::REVD_MERGE_PASSTHRU:
215 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
216 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
217 case AArch64ISD::DUP_MERGE_PASSTHRU:
218 case AArch64ISD::ABS_MERGE_PASSTHRU:
219 case AArch64ISD::NEG_MERGE_PASSTHRU:
220 case AArch64ISD::FNEG_MERGE_PASSTHRU:
221 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
222 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
223 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
224 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
225 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
226 case AArch64ISD::FRINT_MERGE_PASSTHRU:
227 case AArch64ISD::FROUND_MERGE_PASSTHRU:
228 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
229 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
230 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
231 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
232 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
233 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
234 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
235 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
236 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
237 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
238 case AArch64ISD::FABS_MERGE_PASSTHRU:
239 return true;
243 // Returns true if inactive lanes are known to be zeroed by construction.
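// For example, an SVE compare only produces true bits for lanes that are active
// in its governing predicate, so every inactive lane of its result is zero.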
244 static bool isZeroingInactiveLanes(SDValue Op) {
245 switch (Op.getOpcode()) {
246 default:
247 // We guarantee i1 splat_vectors to zero the other lanes by
248 // implementing it with ptrue and possibly a punpklo for nxv1i1.
249 if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
250 return true;
251 return false;
252 case AArch64ISD::PTRUE:
253 case AArch64ISD::SETCC_MERGE_ZERO:
254 return true;
255 case ISD::INTRINSIC_WO_CHAIN:
256 switch (Op.getConstantOperandVal(0)) {
257 default:
258 return false;
259 case Intrinsic::aarch64_sve_ptrue:
260 case Intrinsic::aarch64_sve_pnext:
261 case Intrinsic::aarch64_sve_cmpeq:
262 case Intrinsic::aarch64_sve_cmpne:
263 case Intrinsic::aarch64_sve_cmpge:
264 case Intrinsic::aarch64_sve_cmpgt:
265 case Intrinsic::aarch64_sve_cmphs:
266 case Intrinsic::aarch64_sve_cmphi:
267 case Intrinsic::aarch64_sve_cmpeq_wide:
268 case Intrinsic::aarch64_sve_cmpne_wide:
269 case Intrinsic::aarch64_sve_cmpge_wide:
270 case Intrinsic::aarch64_sve_cmpgt_wide:
271 case Intrinsic::aarch64_sve_cmplt_wide:
272 case Intrinsic::aarch64_sve_cmple_wide:
273 case Intrinsic::aarch64_sve_cmphs_wide:
274 case Intrinsic::aarch64_sve_cmphi_wide:
275 case Intrinsic::aarch64_sve_cmplo_wide:
276 case Intrinsic::aarch64_sve_cmpls_wide:
277 case Intrinsic::aarch64_sve_fcmpeq:
278 case Intrinsic::aarch64_sve_fcmpne:
279 case Intrinsic::aarch64_sve_fcmpge:
280 case Intrinsic::aarch64_sve_fcmpgt:
281 case Intrinsic::aarch64_sve_fcmpuo:
282 return true;
287 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
288 const AArch64Subtarget &STI)
289 : TargetLowering(TM), Subtarget(&STI) {
290 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
291 // we have to make something up. Arbitrarily, choose ZeroOrOne.
292 setBooleanContents(ZeroOrOneBooleanContent);
293 // When comparing vectors the result sets the different elements in the
294 // vector to all-one or all-zero.
295 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
297 // Set up the register classes.
298 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
299 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
301 if (Subtarget->hasLS64()) {
302 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
303 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
304 setOperationAction(ISD::STORE, MVT::i64x8, Custom);
307 if (Subtarget->hasFPARMv8()) {
308 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
309 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
310 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
311 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
312 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
315 if (Subtarget->hasNEON()) {
316 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
317 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
318 // Someone set us up the NEON.
319 addDRTypeForNEON(MVT::v2f32);
320 addDRTypeForNEON(MVT::v8i8);
321 addDRTypeForNEON(MVT::v4i16);
322 addDRTypeForNEON(MVT::v2i32);
323 addDRTypeForNEON(MVT::v1i64);
324 addDRTypeForNEON(MVT::v1f64);
325 addDRTypeForNEON(MVT::v4f16);
326 if (Subtarget->hasBF16())
327 addDRTypeForNEON(MVT::v4bf16);
329 addQRTypeForNEON(MVT::v4f32);
330 addQRTypeForNEON(MVT::v2f64);
331 addQRTypeForNEON(MVT::v16i8);
332 addQRTypeForNEON(MVT::v8i16);
333 addQRTypeForNEON(MVT::v4i32);
334 addQRTypeForNEON(MVT::v2i64);
335 addQRTypeForNEON(MVT::v8f16);
336 if (Subtarget->hasBF16())
337 addQRTypeForNEON(MVT::v8bf16);
340 if (Subtarget->hasSVE() || Subtarget->hasSME()) {
341 // Add legal sve predicate types
342 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
343 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
344 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
345 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
346 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
348 // Add legal sve data types
349 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
350 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
351 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
352 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
354 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
355 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
356 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
357 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
358 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
359 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
361 if (Subtarget->hasBF16()) {
362 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
363 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
364 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
367 if (Subtarget->useSVEForFixedLengthVectors()) {
368 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
369 if (useSVEForFixedLengthVectorVT(VT))
370 addRegisterClass(VT, &AArch64::ZPRRegClass);
372 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
373 if (useSVEForFixedLengthVectorVT(VT))
374 addRegisterClass(VT, &AArch64::ZPRRegClass);
378 // Compute derived properties from the register classes
379 computeRegisterProperties(Subtarget->getRegisterInfo());
381 // Provide all sorts of operation actions
382 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
383 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
384 setOperationAction(ISD::SETCC, MVT::i32, Custom);
385 setOperationAction(ISD::SETCC, MVT::i64, Custom);
386 setOperationAction(ISD::SETCC, MVT::f16, Custom);
387 setOperationAction(ISD::SETCC, MVT::f32, Custom);
388 setOperationAction(ISD::SETCC, MVT::f64, Custom);
389 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
390 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
391 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
392 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
393 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
394 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
395 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
396 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
397 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
398 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
399 setOperationAction(ISD::BR_CC, MVT::i64, Custom);
400 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
401 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
402 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
403 setOperationAction(ISD::SELECT, MVT::i32, Custom);
404 setOperationAction(ISD::SELECT, MVT::i64, Custom);
405 setOperationAction(ISD::SELECT, MVT::f16, Custom);
406 setOperationAction(ISD::SELECT, MVT::bf16, Custom);
407 setOperationAction(ISD::SELECT, MVT::f32, Custom);
408 setOperationAction(ISD::SELECT, MVT::f64, Custom);
409 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
410 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
411 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
412 setOperationAction(ISD::SELECT_CC, MVT::bf16, Expand);
413 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
414 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
415 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
416 setOperationAction(ISD::JumpTable, MVT::i64, Custom);
417 setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom);
419 setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
420 setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
421 setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
423 setOperationAction(ISD::FREM, MVT::f32, Expand);
424 setOperationAction(ISD::FREM, MVT::f64, Expand);
425 setOperationAction(ISD::FREM, MVT::f80, Expand);
427 setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
429 // Custom lowering hooks are needed for XOR
430 // to fold it into CSINC/CSINV.
431 setOperationAction(ISD::XOR, MVT::i32, Custom);
432 setOperationAction(ISD::XOR, MVT::i64, Custom);
434 // Virtually no operation on f128 is legal, but LLVM can't expand them when
435 // there's a valid register class, so we need custom operations in most cases.
436 setOperationAction(ISD::FABS, MVT::f128, Expand);
437 setOperationAction(ISD::FADD, MVT::f128, LibCall);
438 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
439 setOperationAction(ISD::FCOS, MVT::f128, Expand);
440 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
441 setOperationAction(ISD::FMA, MVT::f128, Expand);
442 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
443 setOperationAction(ISD::FNEG, MVT::f128, Expand);
444 setOperationAction(ISD::FPOW, MVT::f128, Expand);
445 setOperationAction(ISD::FREM, MVT::f128, Expand);
446 setOperationAction(ISD::FRINT, MVT::f128, Expand);
447 setOperationAction(ISD::FSIN, MVT::f128, Expand);
448 setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
449 setOperationAction(ISD::FSQRT, MVT::f128, Expand);
450 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
451 setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
452 setOperationAction(ISD::SETCC, MVT::f128, Custom);
453 setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
454 setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
455 setOperationAction(ISD::BR_CC, MVT::f128, Custom);
456 setOperationAction(ISD::SELECT, MVT::f128, Custom);
457 setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
458 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
459 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
460 // aren't handled.
462 // Lowering for many of the conversions is actually specified by the non-f128
463 // type. The LowerXXX function will be trivial when f128 isn't involved.
464 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
465 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
466 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
467 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
468 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
469 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
470 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
471 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
472 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
473 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
474 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
475 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
476 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
477 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
478 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
479 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
480 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
481 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
482 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
483 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
484 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
485 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
486 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
487 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
488 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
489 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
490 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
491 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
492 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
493 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
495 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
496 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
497 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
498 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
500 // Variable arguments.
501 setOperationAction(ISD::VASTART, MVT::Other, Custom);
502 setOperationAction(ISD::VAARG, MVT::Other, Custom);
503 setOperationAction(ISD::VACOPY, MVT::Other, Custom);
504 setOperationAction(ISD::VAEND, MVT::Other, Expand);
506 // Variable-sized objects.
507 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
508 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
510 if (Subtarget->isTargetWindows())
511 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
512 else
513 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
515 // Constant pool entries
516 setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
518 // BlockAddress
519 setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
521 // AArch64 lacks both left-rotate and popcount instructions.
522 setOperationAction(ISD::ROTL, MVT::i32, Expand);
523 setOperationAction(ISD::ROTL, MVT::i64, Expand);
524 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
525 setOperationAction(ISD::ROTL, VT, Expand);
526 setOperationAction(ISD::ROTR, VT, Expand);
529 // AArch64 doesn't have i32 MULH{S|U}.
530 setOperationAction(ISD::MULHU, MVT::i32, Expand);
531 setOperationAction(ISD::MULHS, MVT::i32, Expand);
533 // AArch64 doesn't have {U|S}MUL_LOHI.
534 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
535 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
537 setOperationAction(ISD::CTPOP, MVT::i32, Custom);
538 setOperationAction(ISD::CTPOP, MVT::i64, Custom);
539 setOperationAction(ISD::CTPOP, MVT::i128, Custom);
541 setOperationAction(ISD::PARITY, MVT::i64, Custom);
542 setOperationAction(ISD::PARITY, MVT::i128, Custom);
544 setOperationAction(ISD::ABS, MVT::i32, Custom);
545 setOperationAction(ISD::ABS, MVT::i64, Custom);
547 setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
548 setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
549 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
550 setOperationAction(ISD::SDIVREM, VT, Expand);
551 setOperationAction(ISD::UDIVREM, VT, Expand);
553 setOperationAction(ISD::SREM, MVT::i32, Expand);
554 setOperationAction(ISD::SREM, MVT::i64, Expand);
555 setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
556 setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
557 setOperationAction(ISD::UREM, MVT::i32, Expand);
558 setOperationAction(ISD::UREM, MVT::i64, Expand);
560 // Custom lower Add/Sub/Mul with overflow.
561 setOperationAction(ISD::SADDO, MVT::i32, Custom);
562 setOperationAction(ISD::SADDO, MVT::i64, Custom);
563 setOperationAction(ISD::UADDO, MVT::i32, Custom);
564 setOperationAction(ISD::UADDO, MVT::i64, Custom);
565 setOperationAction(ISD::SSUBO, MVT::i32, Custom);
566 setOperationAction(ISD::SSUBO, MVT::i64, Custom);
567 setOperationAction(ISD::USUBO, MVT::i32, Custom);
568 setOperationAction(ISD::USUBO, MVT::i64, Custom);
569 setOperationAction(ISD::SMULO, MVT::i32, Custom);
570 setOperationAction(ISD::SMULO, MVT::i64, Custom);
571 setOperationAction(ISD::UMULO, MVT::i32, Custom);
572 setOperationAction(ISD::UMULO, MVT::i64, Custom);
574 setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
575 setOperationAction(ISD::ADDCARRY, MVT::i64, Custom);
576 setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
577 setOperationAction(ISD::SUBCARRY, MVT::i64, Custom);
578 setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom);
579 setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom);
580 setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom);
581 setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom);
583 setOperationAction(ISD::FSIN, MVT::f32, Expand);
584 setOperationAction(ISD::FSIN, MVT::f64, Expand);
585 setOperationAction(ISD::FCOS, MVT::f32, Expand);
586 setOperationAction(ISD::FCOS, MVT::f64, Expand);
587 setOperationAction(ISD::FPOW, MVT::f32, Expand);
588 setOperationAction(ISD::FPOW, MVT::f64, Expand);
589 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
590 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
591 if (Subtarget->hasFullFP16())
592 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
593 else
594 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
596 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
597 ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
598 ISD::FEXP, ISD::FEXP2, ISD::FLOG,
599 ISD::FLOG2, ISD::FLOG10, ISD::STRICT_FREM,
600 ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS,
601 ISD::STRICT_FSIN, ISD::STRICT_FEXP, ISD::STRICT_FEXP2,
602 ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
603 setOperationAction(Op, MVT::f16, Promote);
604 setOperationAction(Op, MVT::v4f16, Expand);
605 setOperationAction(Op, MVT::v8f16, Expand);
608 if (!Subtarget->hasFullFP16()) {
609 for (auto Op :
610 {ISD::SETCC, ISD::SELECT_CC,
611 ISD::BR_CC, ISD::FADD, ISD::FSUB,
612 ISD::FMUL, ISD::FDIV, ISD::FMA,
613 ISD::FNEG, ISD::FABS, ISD::FCEIL,
614 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
615 ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN,
616 ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM,
617 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD,
618 ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV,
619 ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR,
620 ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT,
621 ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
622 ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
623 ISD::STRICT_FMAXIMUM})
624 setOperationAction(Op, MVT::f16, Promote);
626 // Round-to-integer operations need custom lowering for fp16, as Promote
627 // doesn't work because the result type is integer.
628 for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT,
629 ISD::STRICT_LLRINT})
630 setOperationAction(Op, MVT::f16, Custom);
632 // promote v4f16 to v4f32 when that is known to be safe.
633 setOperationPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
634 setOperationPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
635 setOperationPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
636 setOperationPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
638 setOperationAction(ISD::FABS, MVT::v4f16, Expand);
639 setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
640 setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
641 setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Expand);
642 setOperationAction(ISD::FMA, MVT::v4f16, Expand);
643 setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
644 setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
645 setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
646 setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
647 setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
648 setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
649 setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
650 setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
651 setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
652 setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
653 setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
655 setOperationAction(ISD::FABS, MVT::v8f16, Expand);
656 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
657 setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
658 setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
659 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
660 setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
661 setOperationAction(ISD::FMA, MVT::v8f16, Expand);
662 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
663 setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
664 setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
665 setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
666 setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Expand);
667 setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
668 setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
669 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
670 setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
671 setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
672 setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
673 setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
674 setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
675 setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
678 // AArch64 has implementations of a lot of rounding-like FP operations.
679 for (auto Op :
680 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
681 ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
682 ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
683 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
684 ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
685 ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT,
686 ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
687 ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
688 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND,
689 ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) {
690 for (MVT Ty : {MVT::f32, MVT::f64})
691 setOperationAction(Op, Ty, Legal);
692 if (Subtarget->hasFullFP16())
693 setOperationAction(Op, MVT::f16, Legal);
696 // Basic strict FP operations are legal
697 for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
698 ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) {
699 for (MVT Ty : {MVT::f32, MVT::f64})
700 setOperationAction(Op, Ty, Legal);
701 if (Subtarget->hasFullFP16())
702 setOperationAction(Op, MVT::f16, Legal);
705 // Strict conversion to a larger type is legal
706 for (auto VT : {MVT::f32, MVT::f64})
707 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
709 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
711 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
712 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
714 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
715 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
716 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
717 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
718 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
720 // Generate outline atomics library calls only if LSE was not specified for
721 // the subtarget.
722 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
723 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
724 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
725 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
726 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
727 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
728 setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
729 setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
730 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
731 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
732 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
733 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
734 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
735 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
736 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
737 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
738 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
739 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
740 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
741 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
742 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
743 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
744 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
745 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
746 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
747 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
748 #define LCALLNAMES(A, B, N) \
749 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
750 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
751 setLibcallName(A##N##_REL, #B #N "_rel"); \
752 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
753 #define LCALLNAME4(A, B) \
754 LCALLNAMES(A, B, 1) \
755 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
756 #define LCALLNAME5(A, B) \
757 LCALLNAMES(A, B, 1) \
758 LCALLNAMES(A, B, 2) \
759 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
760 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
761 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
762 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
763 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
764 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
765 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
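// As an illustration, LCALLNAMES(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd, 4)
// expands to setLibcallName(RTLIB::OUTLINE_ATOMIC_LDADD4_RELAX,
// "__aarch64_ldadd4_relax") plus the _acq, _rel and _acq_rel variants, so each
// LCALLNAME4 above registers sixteen __aarch64_* outline-atomic helper names.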
766 #undef LCALLNAMES
767 #undef LCALLNAME4
768 #undef LCALLNAME5
771 // 128-bit loads and stores can be done without expanding
772 setOperationAction(ISD::LOAD, MVT::i128, Custom);
773 setOperationAction(ISD::STORE, MVT::i128, Custom);
775 // Aligned 128-bit loads and stores are single-copy atomic according to the
776 // v8.4a spec.
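// With LSE2 an aligned atomic i128 load or store can therefore be selected as a
// plain LDP/STP rather than a CAS loop or a library call.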
777 if (Subtarget->hasLSE2()) {
778 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
779 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
782 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
783 // custom lowering, as there are no un-paired non-temporal stores and
784 // legalization will break up 256 bit inputs.
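// For example, a v8i32 non-temporal store is emitted as a single STNP of two
// q registers instead of being split up by type legalization.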
785 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
786 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
787 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
788 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
789 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
790 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
791 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
793 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
794 // custom lowering, as there are no un-paired non-temporal loads and
795 // legalization will break up 256 bit inputs.
796 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
797 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
798 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
799 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
800 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
801 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
802 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
804 // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
805 // This requires the Performance Monitors extension.
806 if (Subtarget->hasPerfMon())
807 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
809 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
810 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
811 // Issue __sincos_stret if available.
812 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
813 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
814 } else {
815 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
816 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
819 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
820 // MSVCRT doesn't have powi; fall back to pow
821 setLibcallName(RTLIB::POWI_F32, nullptr);
822 setLibcallName(RTLIB::POWI_F64, nullptr);
825 // Make floating-point constants legal for the large code model, so they don't
826 // become loads from the constant pool.
827 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
828 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
829 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
832 // AArch64 does not have floating-point extending loads, i1 sign-extending
833 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
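// e.g. an extending load from f32 to f64 is expanded into an ordinary f32 load
// followed by a separate FP_EXTEND (fcvt).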
834 for (MVT VT : MVT::fp_valuetypes()) {
835 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
836 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
837 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
838 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
840 for (MVT VT : MVT::integer_valuetypes())
841 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
843 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
844 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
845 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
846 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
847 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
848 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
849 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
851 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
852 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
853 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
855 // Indexed loads and stores are supported.
856 for (unsigned im = (unsigned)ISD::PRE_INC;
857 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
858 setIndexedLoadAction(im, MVT::i8, Legal);
859 setIndexedLoadAction(im, MVT::i16, Legal);
860 setIndexedLoadAction(im, MVT::i32, Legal);
861 setIndexedLoadAction(im, MVT::i64, Legal);
862 setIndexedLoadAction(im, MVT::f64, Legal);
863 setIndexedLoadAction(im, MVT::f32, Legal);
864 setIndexedLoadAction(im, MVT::f16, Legal);
865 setIndexedLoadAction(im, MVT::bf16, Legal);
866 setIndexedStoreAction(im, MVT::i8, Legal);
867 setIndexedStoreAction(im, MVT::i16, Legal);
868 setIndexedStoreAction(im, MVT::i32, Legal);
869 setIndexedStoreAction(im, MVT::i64, Legal);
870 setIndexedStoreAction(im, MVT::f64, Legal);
871 setIndexedStoreAction(im, MVT::f32, Legal);
872 setIndexedStoreAction(im, MVT::f16, Legal);
873 setIndexedStoreAction(im, MVT::bf16, Legal);
876 // Trap.
877 setOperationAction(ISD::TRAP, MVT::Other, Legal);
878 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
879 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
881 // We combine OR nodes for bitfield operations.
882 setTargetDAGCombine(ISD::OR);
883 // Try to create BICs for vector ANDs.
884 setTargetDAGCombine(ISD::AND);
886 // Vector add and sub nodes may conceal a high-half opportunity.
887 // Also, try to fold ADD into CSINC/CSINV.
888 setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
889 ISD::UINT_TO_FP});
891 setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
892 ISD::FP_TO_UINT_SAT, ISD::FDIV});
894 // Try and combine setcc with csel
895 setTargetDAGCombine(ISD::SETCC);
897 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
899 setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
900 ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
901 ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
902 ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR});
903 setTargetDAGCombine(ISD::LOAD);
905 setTargetDAGCombine(ISD::MSTORE);
907 setTargetDAGCombine(ISD::MUL);
909 setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});
911 setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
912 ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
913 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
915 setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER});
917 setTargetDAGCombine(ISD::FP_EXTEND);
919 setTargetDAGCombine(ISD::GlobalAddress);
921 // In case of strict alignment, avoid an excessive number of byte wide stores.
922 MaxStoresPerMemsetOptSize = 8;
923 MaxStoresPerMemset =
924 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
926 MaxGluedStoresPerMemcpy = 4;
927 MaxStoresPerMemcpyOptSize = 4;
928 MaxStoresPerMemcpy =
929 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
931 MaxStoresPerMemmoveOptSize = 4;
932 MaxStoresPerMemmove = 4;
934 MaxLoadsPerMemcmpOptSize = 4;
935 MaxLoadsPerMemcmp =
936 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
938 setStackPointerRegisterToSaveRestore(AArch64::SP);
940 setSchedulingPreference(Sched::Hybrid);
942 EnableExtLdPromotion = true;
944 // Set required alignment.
945 setMinFunctionAlignment(Align(4));
946 // Set preferred alignments.
947 setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
948 setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
949 setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));
951 // Only change the limit for entries in a jump table if specified by
952 // the subtarget, but not at the command line.
953 unsigned MaxJT = STI.getMaximumJumpTableSize();
954 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
955 setMaximumJumpTableSize(MaxJT);
957 setHasExtractBitsInsn(true);
959 setMaxDivRemBitWidthSupported(128);
961 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
963 if (Subtarget->hasNEON()) {
964 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
965 // silliness like this:
966 for (auto Op :
967 {ISD::SELECT, ISD::SELECT_CC, ISD::SETCC,
968 ISD::BR_CC, ISD::FADD, ISD::FSUB,
969 ISD::FMUL, ISD::FDIV, ISD::FMA,
970 ISD::FNEG, ISD::FABS, ISD::FCEIL,
971 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
972 ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN,
973 ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM,
974 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD,
975 ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV,
976 ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR,
977 ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT,
978 ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
979 ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
980 ISD::STRICT_FMAXIMUM})
981 setOperationAction(Op, MVT::v1f64, Expand);
983 for (auto Op :
984 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
985 ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
986 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT,
987 ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND})
988 setOperationAction(Op, MVT::v1i64, Expand);
990 // AArch64 doesn't have direct vector -> f32 conversion instructions for
991 // elements smaller than i32, so promote the input to i32 first.
992 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
993 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
995 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
996 // nor a direct i32 -> f16 vector conversion. Set these to Custom so the
997 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
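// e.g. v4i32 -> v4f16 is lowered as an scvtf/ucvtf to v4f32 followed by an
// fcvtn down to v4f16.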
998 for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
999 ISD::STRICT_UINT_TO_FP})
1000 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1001 setOperationAction(Op, VT, Custom);
1003 if (Subtarget->hasFullFP16()) {
1004 setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
1006 setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom);
1007 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
1008 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom);
1009 setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
1010 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
1011 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
1012 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
1013 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
1014 } else {
1015 // when AArch64 doesn't have fullfp16 support, promote the input
1016 // to i32 first.
1017 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1018 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1019 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1020 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1021 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1022 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1023 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1024 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
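// e.g. a v4i16 input is first extended to v4i32 and converted from there.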
1027 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1028 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1029 setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
1030 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
1031 setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
1032 setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
1033 setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1034 setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
1035 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1036 setOperationAction(ISD::UMAX, VT, Custom);
1037 setOperationAction(ISD::SMAX, VT, Custom);
1038 setOperationAction(ISD::UMIN, VT, Custom);
1039 setOperationAction(ISD::SMIN, VT, Custom);
1042 // AArch64 doesn't have MUL.2d:
1043 setOperationAction(ISD::MUL, MVT::v2i64, Expand);
1044 // Custom handling for some quad-vector types to detect MULL.
1045 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1046 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1047 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
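// e.g. a v2i64 multiply whose operands are sign-extended from v2i32 can be
// selected as a single smull instead of being scalarized.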
1049 // Saturates
1050 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1051 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1052 setOperationAction(ISD::SADDSAT, VT, Legal);
1053 setOperationAction(ISD::UADDSAT, VT, Legal);
1054 setOperationAction(ISD::SSUBSAT, VT, Legal);
1055 setOperationAction(ISD::USUBSAT, VT, Legal);
1058 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1059 MVT::v4i32}) {
1060 setOperationAction(ISD::AVGFLOORS, VT, Legal);
1061 setOperationAction(ISD::AVGFLOORU, VT, Legal);
1062 setOperationAction(ISD::AVGCEILS, VT, Legal);
1063 setOperationAction(ISD::AVGCEILU, VT, Legal);
1064 setOperationAction(ISD::ABDS, VT, Legal);
1065 setOperationAction(ISD::ABDU, VT, Legal);
1068 // Vector reductions
1069 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1070 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1071 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1072 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1073 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1075 setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1078 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1079 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1080 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1081 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1082 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1083 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1084 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1086 setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1088 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
1089 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1090 // Likewise, narrowing and extending vector loads/stores aren't handled
1091 // directly.
1092 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1093 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
1095 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1096 setOperationAction(ISD::MULHS, VT, Legal);
1097 setOperationAction(ISD::MULHU, VT, Legal);
1098 } else {
1099 setOperationAction(ISD::MULHS, VT, Expand);
1100 setOperationAction(ISD::MULHU, VT, Expand);
1102 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1103 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1105 setOperationAction(ISD::BSWAP, VT, Expand);
1106 setOperationAction(ISD::CTTZ, VT, Expand);
1108 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1109 setTruncStoreAction(VT, InnerVT, Expand);
1110 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1111 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1112 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1116 // AArch64 has implementations of a lot of rounding-like FP operations.
1117 for (auto Op :
1118 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1119 ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR,
1120 ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT,
1121 ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) {
1122 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1123 setOperationAction(Op, Ty, Legal);
1124 if (Subtarget->hasFullFP16())
1125 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1126 setOperationAction(Op, Ty, Legal);
1129 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1131 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1132 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1133 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1134 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1135 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1136 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1138 // ADDP custom lowering
1139 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1140 setOperationAction(ISD::ADD, VT, Custom);
1141 // FADDP custom lowering
1142 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1143 setOperationAction(ISD::FADD, VT, Custom);
1146 if (Subtarget->hasSME()) {
1147 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1150 // FIXME: Move lowering for more nodes here if those are common between
1151 // SVE and SME.
1152 if (Subtarget->hasSVE() || Subtarget->hasSME()) {
1153 for (auto VT :
1154 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1155 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1156 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1160 if (Subtarget->hasSME())
1161 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1163 if (Subtarget->hasSVE()) {
1164 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1165 setOperationAction(ISD::BITREVERSE, VT, Custom);
1166 setOperationAction(ISD::BSWAP, VT, Custom);
1167 setOperationAction(ISD::CTLZ, VT, Custom);
1168 setOperationAction(ISD::CTPOP, VT, Custom);
1169 setOperationAction(ISD::CTTZ, VT, Custom);
1170 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1171 setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1172 setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1173 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1174 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1175 setOperationAction(ISD::MGATHER, VT, Custom);
1176 setOperationAction(ISD::MSCATTER, VT, Custom);
1177 setOperationAction(ISD::MLOAD, VT, Custom);
1178 setOperationAction(ISD::MUL, VT, Custom);
1179 setOperationAction(ISD::MULHS, VT, Custom);
1180 setOperationAction(ISD::MULHU, VT, Custom);
1181 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1182 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1183 setOperationAction(ISD::SELECT, VT, Custom);
1184 setOperationAction(ISD::SETCC, VT, Custom);
1185 setOperationAction(ISD::SDIV, VT, Custom);
1186 setOperationAction(ISD::UDIV, VT, Custom);
1187 setOperationAction(ISD::SMIN, VT, Custom);
1188 setOperationAction(ISD::UMIN, VT, Custom);
1189 setOperationAction(ISD::SMAX, VT, Custom);
1190 setOperationAction(ISD::UMAX, VT, Custom);
1191 setOperationAction(ISD::SHL, VT, Custom);
1192 setOperationAction(ISD::SRL, VT, Custom);
1193 setOperationAction(ISD::SRA, VT, Custom);
1194 setOperationAction(ISD::ABS, VT, Custom);
1195 setOperationAction(ISD::ABDS, VT, Custom);
1196 setOperationAction(ISD::ABDU, VT, Custom);
1197 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1198 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1199 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1200 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1201 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1202 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1203 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1204 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1205 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1207 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1208 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1209 setOperationAction(ISD::SELECT_CC, VT, Expand);
1210 setOperationAction(ISD::ROTL, VT, Expand);
1211 setOperationAction(ISD::ROTR, VT, Expand);
1213 setOperationAction(ISD::SADDSAT, VT, Legal);
1214 setOperationAction(ISD::UADDSAT, VT, Legal);
1215 setOperationAction(ISD::SSUBSAT, VT, Legal);
1216 setOperationAction(ISD::USUBSAT, VT, Legal);
1217 setOperationAction(ISD::UREM, VT, Expand);
1218 setOperationAction(ISD::SREM, VT, Expand);
1219 setOperationAction(ISD::SDIVREM, VT, Expand);
1220 setOperationAction(ISD::UDIVREM, VT, Expand);
1223 // Illegal unpacked integer vector types.
1224 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1225 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1226 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1229 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1230 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1231 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1232 setOperationAction(ISD::BITCAST, VT, Custom);
1234 for (auto VT :
1235 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1236 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1237 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
1239 for (auto VT :
1240 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1241 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1242 setOperationAction(ISD::SELECT, VT, Custom);
1243 setOperationAction(ISD::SETCC, VT, Custom);
1244 setOperationAction(ISD::TRUNCATE, VT, Custom);
1245 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1246 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1247 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1249 setOperationAction(ISD::SELECT_CC, VT, Expand);
1250 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1251 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1253 // There are no legal MVT::nxv16f## based types.
1254 if (VT != MVT::nxv16i1) {
1255 setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1256 setOperationAction(ISD::UINT_TO_FP, VT, Custom);
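// (an fp conversion from nxv16i1 would need an nxv16 floating-point result
// type, for which there is no legal SVE container)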
1260 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1261 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1262 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1263 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1264 setOperationAction(ISD::MLOAD, VT, Custom);
1265 setOperationAction(ISD::MSTORE, VT, Custom);
1266 setOperationAction(ISD::MGATHER, VT, Custom);
1267 setOperationAction(ISD::MSCATTER, VT, Custom);
1270 // Firstly, exclude all scalable vector extending loads/truncating stores,
1271 // covering both integer and floating-point scalable vectors.
1272 for (MVT VT : MVT::scalable_vector_valuetypes()) {
1273 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1274 setTruncStoreAction(VT, InnerVT, Expand);
1275 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1276 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1277 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1281 // Then, selectively enable those which we directly support.
1282 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1283 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1284 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1285 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1286 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1287 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1288 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1289 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1290 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1291 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1292 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1293 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1294 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1297 // SVE supports truncating stores of 64 and 128-bit vectors
1298 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1299 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1300 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1301 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1302 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1304 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1305 MVT::nxv4f32, MVT::nxv2f64}) {
1306 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1307 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1308 setOperationAction(ISD::MGATHER, VT, Custom);
1309 setOperationAction(ISD::MSCATTER, VT, Custom);
1310 setOperationAction(ISD::MLOAD, VT, Custom);
1311 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1312 setOperationAction(ISD::SELECT, VT, Custom);
1313 setOperationAction(ISD::FADD, VT, Custom);
1314 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1315 setOperationAction(ISD::FDIV, VT, Custom);
1316 setOperationAction(ISD::FMA, VT, Custom);
1317 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1318 setOperationAction(ISD::FMAXNUM, VT, Custom);
1319 setOperationAction(ISD::FMINIMUM, VT, Custom);
1320 setOperationAction(ISD::FMINNUM, VT, Custom);
1321 setOperationAction(ISD::FMUL, VT, Custom);
1322 setOperationAction(ISD::FNEG, VT, Custom);
1323 setOperationAction(ISD::FSUB, VT, Custom);
1324 setOperationAction(ISD::FCEIL, VT, Custom);
1325 setOperationAction(ISD::FFLOOR, VT, Custom);
1326 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1327 setOperationAction(ISD::FRINT, VT, Custom);
1328 setOperationAction(ISD::FROUND, VT, Custom);
1329 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1330 setOperationAction(ISD::FTRUNC, VT, Custom);
1331 setOperationAction(ISD::FSQRT, VT, Custom);
1332 setOperationAction(ISD::FABS, VT, Custom);
1333 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1334 setOperationAction(ISD::FP_ROUND, VT, Custom);
1335 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1336 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1337 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1338 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1339 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1341 setOperationAction(ISD::SELECT_CC, VT, Expand);
1342 setOperationAction(ISD::FREM, VT, Expand);
1343 setOperationAction(ISD::FPOW, VT, Expand);
1344 setOperationAction(ISD::FPOWI, VT, Expand);
1345 setOperationAction(ISD::FCOS, VT, Expand);
1346 setOperationAction(ISD::FSIN, VT, Expand);
1347 setOperationAction(ISD::FSINCOS, VT, Expand);
1348 setOperationAction(ISD::FEXP, VT, Expand);
1349 setOperationAction(ISD::FEXP2, VT, Expand);
1350 setOperationAction(ISD::FLOG, VT, Expand);
1351 setOperationAction(ISD::FLOG2, VT, Expand);
1352 setOperationAction(ISD::FLOG10, VT, Expand);
1354 setCondCodeAction(ISD::SETO, VT, Expand);
1355 setCondCodeAction(ISD::SETOLT, VT, Expand);
1356 setCondCodeAction(ISD::SETLT, VT, Expand);
1357 setCondCodeAction(ISD::SETOLE, VT, Expand);
1358 setCondCodeAction(ISD::SETLE, VT, Expand);
1359 setCondCodeAction(ISD::SETULT, VT, Expand);
1360 setCondCodeAction(ISD::SETULE, VT, Expand);
1361 setCondCodeAction(ISD::SETUGE, VT, Expand);
1362 setCondCodeAction(ISD::SETUGT, VT, Expand);
1363 setCondCodeAction(ISD::SETUEQ, VT, Expand);
1364 setCondCodeAction(ISD::SETONE, VT, Expand);
1367 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1368 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1369 setOperationAction(ISD::MGATHER, VT, Custom);
1370 setOperationAction(ISD::MSCATTER, VT, Custom);
1371 setOperationAction(ISD::MLOAD, VT, Custom);
1372 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1373 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1376 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
1377 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
1379 // NEON doesn't support integer divides, but SVE does
1380 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1381 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1382 setOperationAction(ISD::SDIV, VT, Custom);
1383 setOperationAction(ISD::UDIV, VT, Custom);
1386 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1387 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1388 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1390 // NEON doesn't support across-vector reductions, but SVE does.
1391 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1392 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1394 // NOTE: Currently this has to happen after computeRegisterProperties rather
1395 // than the preferred option of combining it with the addRegisterClass call.
1396 if (Subtarget->useSVEForFixedLengthVectors()) {
1397 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
1398 if (useSVEForFixedLengthVectorVT(VT))
1399 addTypeForFixedLengthSVE(VT);
1400 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
1401 if (useSVEForFixedLengthVectorVT(VT))
1402 addTypeForFixedLengthSVE(VT);
1404 // A 64-bit result can come from an input wider than NEON supports.
1405 for (auto VT : {MVT::v8i8, MVT::v4i16})
1406 setOperationAction(ISD::TRUNCATE, VT, Custom);
1407 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
1409 // A 128-bit result implies an input wider than NEON supports.
1410 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1411 setOperationAction(ISD::TRUNCATE, VT, Custom);
1412 for (auto VT : {MVT::v8f16, MVT::v4f32})
1413 setOperationAction(ISD::FP_ROUND, VT, Custom);
1415 // These operations are not supported on NEON but SVE can do them.
1416 setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1417 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1418 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1419 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1420 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1421 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1422 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1423 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1424 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1425 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1426 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1427 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1428 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1429 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1430 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1431 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1432 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1433 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1434 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1435 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1437 // Int operations with no NEON support.
1438 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1439 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1440 setOperationAction(ISD::BITREVERSE, VT, Custom);
1441 setOperationAction(ISD::CTTZ, VT, Custom);
1442 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1443 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1444 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1448 // Use SVE for vectors with more than 2 elements.
1449 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1450 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1453 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1454 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1455 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1456 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
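// Editorial note (an assumption about the generic legalizer, not from the
// original source): promoting VECTOR_SPLICE for predicate types means the
// splice is expected to be performed in the matching integer vector type
// (e.g. an nxv16i1 splice is widened to nxv16i8, spliced there, and the
// result truncated back to a predicate).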
1458 setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1461 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1462 // Only required for llvm.aarch64.mops.memset.tag
1463 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
1466 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1468 IsStrictFPEnabled = true;
1471 void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1472 assert(VT.isVector() && "VT should be a vector type");
1474 if (VT.isFloatingPoint()) {
1475 MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
1476 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1477 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1480 // Mark vector float intrinsics as expand.
1481 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1482 setOperationAction(ISD::FSIN, VT, Expand);
1483 setOperationAction(ISD::FCOS, VT, Expand);
1484 setOperationAction(ISD::FPOW, VT, Expand);
1485 setOperationAction(ISD::FLOG, VT, Expand);
1486 setOperationAction(ISD::FLOG2, VT, Expand);
1487 setOperationAction(ISD::FLOG10, VT, Expand);
1488 setOperationAction(ISD::FEXP, VT, Expand);
1489 setOperationAction(ISD::FEXP2, VT, Expand);
1492 // But we do support custom-lowering for FCOPYSIGN.
1493 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1494 ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1495 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1497 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1498 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1499 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1500 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1501 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1502 setOperationAction(ISD::SRA, VT, Custom);
1503 setOperationAction(ISD::SRL, VT, Custom);
1504 setOperationAction(ISD::SHL, VT, Custom);
1505 setOperationAction(ISD::OR, VT, Custom);
1506 setOperationAction(ISD::SETCC, VT, Custom);
1507 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
1509 setOperationAction(ISD::SELECT, VT, Expand);
1510 setOperationAction(ISD::SELECT_CC, VT, Expand);
1511 setOperationAction(ISD::VSELECT, VT, Expand);
1512 for (MVT InnerVT : MVT::all_valuetypes())
1513 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1515 // CNT only supports byte (B) element sizes; use UADDLP to widen to larger elements.
1516 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1517 setOperationAction(ISD::CTPOP, VT, Custom);
1519 setOperationAction(ISD::UDIV, VT, Expand);
1520 setOperationAction(ISD::SDIV, VT, Expand);
1521 setOperationAction(ISD::UREM, VT, Expand);
1522 setOperationAction(ISD::SREM, VT, Expand);
1523 setOperationAction(ISD::FREM, VT, Expand);
1525 for (unsigned Opcode :
1526 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
1527 ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1528 setOperationAction(Opcode, VT, Custom);
1530 if (!VT.isFloatingPoint())
1531 setOperationAction(ISD::ABS, VT, Legal);
1533 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1534 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1535 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1536 setOperationAction(Opcode, VT, Legal);
1538 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1539 // NEON types.
1540 if (VT.isFloatingPoint() &&
1541 VT.getVectorElementType() != MVT::bf16 &&
1542 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1543 for (unsigned Opcode :
1544 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
1545 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM,
1546 ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB,
1547 ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA,
1548 ISD::STRICT_FSQRT})
1549 setOperationAction(Opcode, VT, Legal);
1551 // Strict fp extend and trunc are legal
1552 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1553 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
1554 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1555 setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
1557 // FIXME: We could potentially make use of the vector comparison instructions
1558 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1559 // complications:
1560 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1561 // so we would need to expand when the condition code doesn't match the
1562 // kind of comparison.
1563 // * Some kinds of comparison require more than one FCMXY instruction so
1564 // would need to be expanded instead.
1565 // * The lowering of the non-strict versions involves target-specific ISD
1566 // nodes so we would likely need to add strict versions of all of them and
1567 // handle them appropriately.
1568 setOperationAction(ISD::STRICT_FSETCC, VT, Expand);
1569 setOperationAction(ISD::STRICT_FSETCCS, VT, Expand);
1571 if (Subtarget->isLittleEndian()) {
1572 for (unsigned im = (unsigned)ISD::PRE_INC;
1573 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1574 setIndexedLoadAction(im, VT, Legal);
1575 setIndexedStoreAction(im, VT, Legal);
1580 bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
1581 EVT OpVT) const {
1582 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1583 if (!Subtarget->hasSVE())
1584 return true;
1586 // We can only support legal predicate result types. We can use the SVE
1587 // whilelo instruction for generating fixed-width predicates too.
1588 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1589 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1590 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1591 return true;
1593 // The whilelo instruction only works with i32 or i64 scalar inputs.
1594 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1595 return true;
1597 return false;
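// Editorial sketch (not from the original source): a call such as
//   %m = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %i, i64 %n)
// passes all of the checks above and is expected to select to a single
//   whilelo p0.s, x0, x1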
1600 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1601 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1603 // By default everything must be expanded.
1604 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1605 setOperationAction(Op, VT, Expand);
1607 // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
1608 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1610 if (VT.isFloatingPoint()) {
1611 setCondCodeAction(ISD::SETO, VT, Expand);
1612 setCondCodeAction(ISD::SETOLT, VT, Expand);
1613 setCondCodeAction(ISD::SETLT, VT, Expand);
1614 setCondCodeAction(ISD::SETOLE, VT, Expand);
1615 setCondCodeAction(ISD::SETLE, VT, Expand);
1616 setCondCodeAction(ISD::SETULT, VT, Expand);
1617 setCondCodeAction(ISD::SETULE, VT, Expand);
1618 setCondCodeAction(ISD::SETUGE, VT, Expand);
1619 setCondCodeAction(ISD::SETUGT, VT, Expand);
1620 setCondCodeAction(ISD::SETUEQ, VT, Expand);
1621 setCondCodeAction(ISD::SETONE, VT, Expand);
1624 // Mark integer truncating stores/extending loads as having custom lowering
1625 if (VT.isInteger()) {
1626 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1627 while (InnerVT != VT) {
1628 setTruncStoreAction(VT, InnerVT, Custom);
1629 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1630 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1631 InnerVT = InnerVT.changeVectorElementType(
1632 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
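// Editorial worked example: for VT == v8i32 the loop above visits
// InnerVT == v8i8 and v8i16, marking the v8i32 -> v8i8/v8i16 truncating
// stores and the corresponding sign/zero-extending loads as Custom.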
1636 // Mark floating-point truncating stores/extending loads as having custom
1637 // lowering
1638 if (VT.isFloatingPoint()) {
1639 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1640 while (InnerVT != VT) {
1641 setTruncStoreAction(VT, InnerVT, Custom);
1642 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1643 InnerVT = InnerVT.changeVectorElementType(
1644 MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
1648 // Lower fixed length vector operations to scalable equivalents.
1649 setOperationAction(ISD::ABS, VT, Custom);
1650 setOperationAction(ISD::ADD, VT, Custom);
1651 setOperationAction(ISD::AND, VT, Custom);
1652 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1653 setOperationAction(ISD::BITCAST, VT, Custom);
1654 setOperationAction(ISD::BITREVERSE, VT, Custom);
1655 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1656 setOperationAction(ISD::BSWAP, VT, Custom);
1657 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1658 setOperationAction(ISD::CTLZ, VT, Custom);
1659 setOperationAction(ISD::CTPOP, VT, Custom);
1660 setOperationAction(ISD::CTTZ, VT, Custom);
1661 setOperationAction(ISD::FABS, VT, Custom);
1662 setOperationAction(ISD::FADD, VT, Custom);
1663 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1664 setOperationAction(ISD::FCEIL, VT, Custom);
1665 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1666 setOperationAction(ISD::FDIV, VT, Custom);
1667 setOperationAction(ISD::FFLOOR, VT, Custom);
1668 setOperationAction(ISD::FMA, VT, Custom);
1669 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1670 setOperationAction(ISD::FMAXNUM, VT, Custom);
1671 setOperationAction(ISD::FMINIMUM, VT, Custom);
1672 setOperationAction(ISD::FMINNUM, VT, Custom);
1673 setOperationAction(ISD::FMUL, VT, Custom);
1674 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1675 setOperationAction(ISD::FNEG, VT, Custom);
1676 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1677 setOperationAction(ISD::FP_ROUND, VT, Custom);
1678 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1679 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1680 setOperationAction(ISD::FRINT, VT, Custom);
1681 setOperationAction(ISD::FROUND, VT, Custom);
1682 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1683 setOperationAction(ISD::FSQRT, VT, Custom);
1684 setOperationAction(ISD::FSUB, VT, Custom);
1685 setOperationAction(ISD::FTRUNC, VT, Custom);
1686 setOperationAction(ISD::LOAD, VT, Custom);
1687 setOperationAction(ISD::MGATHER, VT, Custom);
1688 setOperationAction(ISD::MLOAD, VT, Custom);
1689 setOperationAction(ISD::MSCATTER, VT, Custom);
1690 setOperationAction(ISD::MSTORE, VT, Custom);
1691 setOperationAction(ISD::MUL, VT, Custom);
1692 setOperationAction(ISD::MULHS, VT, Custom);
1693 setOperationAction(ISD::MULHU, VT, Custom);
1694 setOperationAction(ISD::OR, VT, Custom);
1695 setOperationAction(ISD::SDIV, VT, Custom);
1696 setOperationAction(ISD::SELECT, VT, Custom);
1697 setOperationAction(ISD::SETCC, VT, Custom);
1698 setOperationAction(ISD::SHL, VT, Custom);
1699 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1700 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1701 setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1702 setOperationAction(ISD::SMAX, VT, Custom);
1703 setOperationAction(ISD::SMIN, VT, Custom);
1704 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1705 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1706 setOperationAction(ISD::SRA, VT, Custom);
1707 setOperationAction(ISD::SRL, VT, Custom);
1708 setOperationAction(ISD::STORE, VT, Custom);
1709 setOperationAction(ISD::SUB, VT, Custom);
1710 setOperationAction(ISD::TRUNCATE, VT, Custom);
1711 setOperationAction(ISD::UDIV, VT, Custom);
1712 setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1713 setOperationAction(ISD::UMAX, VT, Custom);
1714 setOperationAction(ISD::UMIN, VT, Custom);
1715 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1716 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1717 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1718 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1719 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1720 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1721 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1722 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1723 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1724 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1725 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1726 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1727 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1728 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1729 setOperationAction(ISD::VSELECT, VT, Custom);
1730 setOperationAction(ISD::XOR, VT, Custom);
1731 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1734 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1735 addRegisterClass(VT, &AArch64::FPR64RegClass);
1736 addTypeForNEON(VT);
1739 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1740 addRegisterClass(VT, &AArch64::FPR128RegClass);
1741 addTypeForNEON(VT);
1744 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
1745 LLVMContext &C, EVT VT) const {
1746 if (!VT.isVector())
1747 return MVT::i32;
1748 if (VT.isScalableVector())
1749 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1750 return VT.changeVectorElementTypeToInteger();
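// Editorial examples, derived from the three cases above: a scalar compare
// produces MVT::i32, a setcc on nxv4i32 produces nxv4i1, and a setcc on
// v4f32 produces v4i32.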
1753 // isIntImmediate - This method tests to see if the node is a constant
1754 // operand. If so, Imm will receive the value.
1755 static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
1756 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
1757 Imm = C->getZExtValue();
1758 return true;
1760 return false;
1763 // isOpcWithIntImmediate - This method tests to see if the node is a specific
1764 // opcode and that it has an immediate integer right operand.
1765 // If so, Imm will receive the value.
1766 static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
1767 uint64_t &Imm) {
1768 return N->getOpcode() == Opc &&
1769 isIntImmediate(N->getOperand(1).getNode(), Imm);
1772 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1773 const APInt &Demanded,
1774 TargetLowering::TargetLoweringOpt &TLO,
1775 unsigned NewOpc) {
1776 uint64_t OldImm = Imm, NewImm, Enc;
1777 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1779 // Return if the immediate is already all zeros, all ones, a bimm32 or a
1780 // bimm64.
1781 if (Imm == 0 || Imm == Mask ||
1782 AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
1783 return false;
1785 unsigned EltSize = Size;
1786 uint64_t DemandedBits = Demanded.getZExtValue();
1788 // Clear bits that are not demanded.
1789 Imm &= DemandedBits;
1791 while (true) {
1792 // The goal here is to set the non-demanded bits in a way that minimizes
1793 // the number of transitions between 0 and 1. To achieve this, we set each
1794 // non-demanded bit to the value of the preceding demanded bit.
1795 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1796 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1797 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1798 // The final result is 0b11000011.
1799 uint64_t NonDemandedBits = ~DemandedBits;
1800 uint64_t InvertedImm = ~Imm & DemandedBits;
1801 uint64_t RotatedImm =
1802 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
1803 NonDemandedBits;
1804 uint64_t Sum = RotatedImm + NonDemandedBits;
1805 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
1806 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
1807 NewImm = (Imm | Ones) & Mask;
1809 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
1810 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
1811 // we halve the element size and continue the search.
1812 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
1813 break;
1815 // We cannot shrink the element size any further if it is 2-bits.
1816 if (EltSize == 2)
1817 return false;
1819 EltSize /= 2;
1820 Mask >>= EltSize;
1821 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
1823 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
1824 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
1825 return false;
1827 // Merge the upper and lower halves of Imm and DemandedBits.
1828 Imm |= Hi;
1829 DemandedBits |= DemandedBitsHi;
1832 ++NumOptimizedImms;
1834 // Replicate the element across the register width.
1835 while (EltSize < Size) {
1836 NewImm |= NewImm << EltSize;
1837 EltSize *= 2;
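// Editorial trace (not from the original source): continuing the 8-bit
// example above, NewImm == 0b11000011 (0xC3) with Size == 32 replicates to
// 0xC3C3C3C3, which is encodable as a logical immediate.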
1840 (void)OldImm;
1841 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
1842 "demanded bits should never be altered");
1843 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
1845 // Create the new constant immediate node.
1846 EVT VT = Op.getValueType();
1847 SDLoc DL(Op);
1848 SDValue New;
1850 // If the new constant immediate is all-zeros or all-ones, let the target
1851 // independent DAG combine optimize this node.
1852 if (NewImm == 0 || NewImm == OrigMask) {
1853 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
1854 TLO.DAG.getConstant(NewImm, DL, VT));
1855 // Otherwise, create a machine node so that target independent DAG combine
1856 // doesn't undo this optimization.
1857 } else {
1858 Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
1859 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
1860 New = SDValue(
1861 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
1864 return TLO.CombineTo(Op, New);
1867 bool AArch64TargetLowering::targetShrinkDemandedConstant(
1868 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1869 TargetLoweringOpt &TLO) const {
1870 // Delay this optimization to as late as possible.
1871 if (!TLO.LegalOps)
1872 return false;
1874 if (!EnableOptimizeLogicalImm)
1875 return false;
1877 EVT VT = Op.getValueType();
1878 if (VT.isVector())
1879 return false;
1881 unsigned Size = VT.getSizeInBits();
1882 assert((Size == 32 || Size == 64) &&
1883 "i32 or i64 is expected after legalization.");
1885 // Exit early if we demand all bits.
1886 if (DemandedBits.countPopulation() == Size)
1887 return false;
1889 unsigned NewOpc;
1890 switch (Op.getOpcode()) {
1891 default:
1892 return false;
1893 case ISD::AND:
1894 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1895 break;
1896 case ISD::OR:
1897 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1898 break;
1899 case ISD::XOR:
1900 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1901 break;
1903 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1904 if (!C)
1905 return false;
1906 uint64_t Imm = C->getZExtValue();
1907 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
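// Editorial note: the net effect is that a bitwise-op constant which is not
// itself encodable may, after optimizeLogicalImm repopulates its
// non-demanded bits, become a valid logical immediate, letting ISel emit a
// single ANDWri/ORRWri/EORWri (or the 64-bit X-register form) instead of
// materializing the constant in a register first.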
1910 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1911 /// Mask are known to be either zero or one and return them in Known.
1912 void AArch64TargetLowering::computeKnownBitsForTargetNode(
1913 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
1914 const SelectionDAG &DAG, unsigned Depth) const {
1915 switch (Op.getOpcode()) {
1916 default:
1917 break;
1918 case AArch64ISD::DUP: {
1919 SDValue SrcOp = Op.getOperand(0);
1920 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
1921 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
1922 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
1923 "Expected DUP implicit truncation");
1924 Known = Known.trunc(Op.getScalarValueSizeInBits());
1926 break;
1928 case AArch64ISD::CSEL: {
1929 KnownBits Known2;
1930 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1931 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1932 Known = KnownBits::commonBits(Known, Known2);
1933 break;
1935 case AArch64ISD::BICi: {
1936 // Compute the bit cleared value.
1937 uint64_t Mask =
1938 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
1939 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1940 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
1941 break;
1943 case AArch64ISD::VLSHR: {
1944 KnownBits Known2;
1945 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1946 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1947 Known = KnownBits::lshr(Known, Known2);
1948 break;
1950 case AArch64ISD::VASHR: {
1951 KnownBits Known2;
1952 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1953 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1954 Known = KnownBits::ashr(Known, Known2);
1955 break;
1957 case AArch64ISD::LOADgot:
1958 case AArch64ISD::ADDlow: {
1959 if (!Subtarget->isTargetILP32())
1960 break;
1961 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1962 Known.Zero = APInt::getHighBitsSet(64, 32);
1963 break;
1965 case AArch64ISD::ASSERT_ZEXT_BOOL: {
1966 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1967 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
1968 break;
1970 case ISD::INTRINSIC_W_CHAIN: {
1971 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1972 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1973 switch (IntID) {
1974 default: return;
1975 case Intrinsic::aarch64_ldaxr:
1976 case Intrinsic::aarch64_ldxr: {
1977 unsigned BitWidth = Known.getBitWidth();
1978 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1979 unsigned MemBits = VT.getScalarSizeInBits();
1980 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1981 return;
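      // Editorial example: an 8-bit ldxrb/ldaxrb zero-extends into the
      // intrinsic's 64-bit result, so the upper 56 bits are known zero.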
1984 break;
1986 case ISD::INTRINSIC_WO_CHAIN:
1987 case ISD::INTRINSIC_VOID: {
1988 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1989 switch (IntNo) {
1990 default:
1991 break;
1992 case Intrinsic::aarch64_neon_umaxv:
1993 case Intrinsic::aarch64_neon_uminv: {
1994 // Figure out the datatype of the vector operand. The UMINV instruction
1995 // will zero-extend the result, so we can mark as known zero all the
1996 // bits larger than the element datatype. 32-bit or larger types don't need
1997 // this as those are legal types and will be handled by isel directly.
1998 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1999 unsigned BitWidth = Known.getBitWidth();
2000 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2001 assert(BitWidth >= 8 && "Unexpected width!");
2002 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
2003 Known.Zero |= Mask;
2004 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2005 assert(BitWidth >= 16 && "Unexpected width!");
2006 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
2007 Known.Zero |= Mask;
2009 break;
2010 } break;
2016 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
2017 EVT) const {
2018 return MVT::i64;
2021 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2022 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2023 bool *Fast) const {
2024 if (Subtarget->requiresStrictAlign())
2025 return false;
2027 if (Fast) {
2028 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2029 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2030 // See comments in performSTORECombine() for more details about
2031 // these conditions.
2033 // Code that uses clang vector extensions can mark that it
2034 // wants unaligned accesses to be treated as fast by
2035 // underspecifying alignment to be 1 or 2.
2036 Alignment <= 2 ||
2038 // Disregard v2i64. Memcpy lowering produces those and splitting
2039 // them regresses performance on micro-benchmarks and olden/bh.
2040 VT == MVT::v2i64;
2042 return true;
2045 // Same as above but handling LLTs instead.
2046 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2047 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2048 bool *Fast) const {
2049 if (Subtarget->requiresStrictAlign())
2050 return false;
2052 if (Fast) {
2053 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2054 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2055 Ty.getSizeInBytes() != 16 ||
2056 // See comments in performSTORECombine() for more details about
2057 // these conditions.
2059 // Code that uses clang vector extensions can mark that it
2060 // wants unaligned accesses to be treated as fast by
2061 // underspecifying alignment to be 1 or 2.
2062 Alignment <= 2 ||
2064 // Disregard v2i64. Memcpy lowering produces those and splitting
2065 // them regresses performance on micro-benchmarks and olden/bh.
2066 Ty == LLT::fixed_vector(2, 64);
2068 return true;
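// Editorial example: on a subtarget where misaligned 128-bit stores are slow,
// a 16-byte store with alignment 4 is still permitted (the function returns
// true) but *Fast is reported as false.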
2071 FastISel *
2072 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2073 const TargetLibraryInfo *libInfo) const {
2074 return AArch64::createFastISel(funcInfo, libInfo);
2077 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2078 #define MAKE_CASE(V) \
2079 case V: \
2080 return #V;
2081 switch ((AArch64ISD::NodeType)Opcode) {
2082 case AArch64ISD::FIRST_NUMBER:
2083 break;
2084 MAKE_CASE(AArch64ISD::OBSCURE_COPY)
2085 MAKE_CASE(AArch64ISD::SMSTART)
2086 MAKE_CASE(AArch64ISD::SMSTOP)
2087 MAKE_CASE(AArch64ISD::RESTORE_ZA)
2088 MAKE_CASE(AArch64ISD::CALL)
2089 MAKE_CASE(AArch64ISD::ADRP)
2090 MAKE_CASE(AArch64ISD::ADR)
2091 MAKE_CASE(AArch64ISD::ADDlow)
2092 MAKE_CASE(AArch64ISD::LOADgot)
2093 MAKE_CASE(AArch64ISD::RET_FLAG)
2094 MAKE_CASE(AArch64ISD::BRCOND)
2095 MAKE_CASE(AArch64ISD::CSEL)
2096 MAKE_CASE(AArch64ISD::CSINV)
2097 MAKE_CASE(AArch64ISD::CSNEG)
2098 MAKE_CASE(AArch64ISD::CSINC)
2099 MAKE_CASE(AArch64ISD::THREAD_POINTER)
2100 MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
2101 MAKE_CASE(AArch64ISD::ABDS_PRED)
2102 MAKE_CASE(AArch64ISD::ABDU_PRED)
2103 MAKE_CASE(AArch64ISD::MUL_PRED)
2104 MAKE_CASE(AArch64ISD::MULHS_PRED)
2105 MAKE_CASE(AArch64ISD::MULHU_PRED)
2106 MAKE_CASE(AArch64ISD::SDIV_PRED)
2107 MAKE_CASE(AArch64ISD::SHL_PRED)
2108 MAKE_CASE(AArch64ISD::SMAX_PRED)
2109 MAKE_CASE(AArch64ISD::SMIN_PRED)
2110 MAKE_CASE(AArch64ISD::SRA_PRED)
2111 MAKE_CASE(AArch64ISD::SRL_PRED)
2112 MAKE_CASE(AArch64ISD::UDIV_PRED)
2113 MAKE_CASE(AArch64ISD::UMAX_PRED)
2114 MAKE_CASE(AArch64ISD::UMIN_PRED)
2115 MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1)
2116 MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
2117 MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
2118 MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
2119 MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
2120 MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
2121 MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
2122 MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
2123 MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
2124 MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
2125 MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
2126 MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
2127 MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
2128 MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
2129 MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
2130 MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
2131 MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
2132 MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
2133 MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
2134 MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
2135 MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
2136 MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
2137 MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
2138 MAKE_CASE(AArch64ISD::ADC)
2139 MAKE_CASE(AArch64ISD::SBC)
2140 MAKE_CASE(AArch64ISD::ADDS)
2141 MAKE_CASE(AArch64ISD::SUBS)
2142 MAKE_CASE(AArch64ISD::ADCS)
2143 MAKE_CASE(AArch64ISD::SBCS)
2144 MAKE_CASE(AArch64ISD::ANDS)
2145 MAKE_CASE(AArch64ISD::CCMP)
2146 MAKE_CASE(AArch64ISD::CCMN)
2147 MAKE_CASE(AArch64ISD::FCCMP)
2148 MAKE_CASE(AArch64ISD::FCMP)
2149 MAKE_CASE(AArch64ISD::STRICT_FCMP)
2150 MAKE_CASE(AArch64ISD::STRICT_FCMPE)
2151 MAKE_CASE(AArch64ISD::DUP)
2152 MAKE_CASE(AArch64ISD::DUPLANE8)
2153 MAKE_CASE(AArch64ISD::DUPLANE16)
2154 MAKE_CASE(AArch64ISD::DUPLANE32)
2155 MAKE_CASE(AArch64ISD::DUPLANE64)
2156 MAKE_CASE(AArch64ISD::DUPLANE128)
2157 MAKE_CASE(AArch64ISD::MOVI)
2158 MAKE_CASE(AArch64ISD::MOVIshift)
2159 MAKE_CASE(AArch64ISD::MOVIedit)
2160 MAKE_CASE(AArch64ISD::MOVImsl)
2161 MAKE_CASE(AArch64ISD::FMOV)
2162 MAKE_CASE(AArch64ISD::MVNIshift)
2163 MAKE_CASE(AArch64ISD::MVNImsl)
2164 MAKE_CASE(AArch64ISD::BICi)
2165 MAKE_CASE(AArch64ISD::ORRi)
2166 MAKE_CASE(AArch64ISD::BSP)
2167 MAKE_CASE(AArch64ISD::EXTR)
2168 MAKE_CASE(AArch64ISD::ZIP1)
2169 MAKE_CASE(AArch64ISD::ZIP2)
2170 MAKE_CASE(AArch64ISD::UZP1)
2171 MAKE_CASE(AArch64ISD::UZP2)
2172 MAKE_CASE(AArch64ISD::TRN1)
2173 MAKE_CASE(AArch64ISD::TRN2)
2174 MAKE_CASE(AArch64ISD::REV16)
2175 MAKE_CASE(AArch64ISD::REV32)
2176 MAKE_CASE(AArch64ISD::REV64)
2177 MAKE_CASE(AArch64ISD::EXT)
2178 MAKE_CASE(AArch64ISD::SPLICE)
2179 MAKE_CASE(AArch64ISD::VSHL)
2180 MAKE_CASE(AArch64ISD::VLSHR)
2181 MAKE_CASE(AArch64ISD::VASHR)
2182 MAKE_CASE(AArch64ISD::VSLI)
2183 MAKE_CASE(AArch64ISD::VSRI)
2184 MAKE_CASE(AArch64ISD::CMEQ)
2185 MAKE_CASE(AArch64ISD::CMGE)
2186 MAKE_CASE(AArch64ISD::CMGT)
2187 MAKE_CASE(AArch64ISD::CMHI)
2188 MAKE_CASE(AArch64ISD::CMHS)
2189 MAKE_CASE(AArch64ISD::FCMEQ)
2190 MAKE_CASE(AArch64ISD::FCMGE)
2191 MAKE_CASE(AArch64ISD::FCMGT)
2192 MAKE_CASE(AArch64ISD::CMEQz)
2193 MAKE_CASE(AArch64ISD::CMGEz)
2194 MAKE_CASE(AArch64ISD::CMGTz)
2195 MAKE_CASE(AArch64ISD::CMLEz)
2196 MAKE_CASE(AArch64ISD::CMLTz)
2197 MAKE_CASE(AArch64ISD::FCMEQz)
2198 MAKE_CASE(AArch64ISD::FCMGEz)
2199 MAKE_CASE(AArch64ISD::FCMGTz)
2200 MAKE_CASE(AArch64ISD::FCMLEz)
2201 MAKE_CASE(AArch64ISD::FCMLTz)
2202 MAKE_CASE(AArch64ISD::SADDV)
2203 MAKE_CASE(AArch64ISD::UADDV)
2204 MAKE_CASE(AArch64ISD::SDOT)
2205 MAKE_CASE(AArch64ISD::UDOT)
2206 MAKE_CASE(AArch64ISD::SMINV)
2207 MAKE_CASE(AArch64ISD::UMINV)
2208 MAKE_CASE(AArch64ISD::SMAXV)
2209 MAKE_CASE(AArch64ISD::UMAXV)
2210 MAKE_CASE(AArch64ISD::SADDV_PRED)
2211 MAKE_CASE(AArch64ISD::UADDV_PRED)
2212 MAKE_CASE(AArch64ISD::SMAXV_PRED)
2213 MAKE_CASE(AArch64ISD::UMAXV_PRED)
2214 MAKE_CASE(AArch64ISD::SMINV_PRED)
2215 MAKE_CASE(AArch64ISD::UMINV_PRED)
2216 MAKE_CASE(AArch64ISD::ORV_PRED)
2217 MAKE_CASE(AArch64ISD::EORV_PRED)
2218 MAKE_CASE(AArch64ISD::ANDV_PRED)
2219 MAKE_CASE(AArch64ISD::CLASTA_N)
2220 MAKE_CASE(AArch64ISD::CLASTB_N)
2221 MAKE_CASE(AArch64ISD::LASTA)
2222 MAKE_CASE(AArch64ISD::LASTB)
2223 MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
2224 MAKE_CASE(AArch64ISD::LS64_BUILD)
2225 MAKE_CASE(AArch64ISD::LS64_EXTRACT)
2226 MAKE_CASE(AArch64ISD::TBL)
2227 MAKE_CASE(AArch64ISD::FADD_PRED)
2228 MAKE_CASE(AArch64ISD::FADDA_PRED)
2229 MAKE_CASE(AArch64ISD::FADDV_PRED)
2230 MAKE_CASE(AArch64ISD::FDIV_PRED)
2231 MAKE_CASE(AArch64ISD::FMA_PRED)
2232 MAKE_CASE(AArch64ISD::FMAX_PRED)
2233 MAKE_CASE(AArch64ISD::FMAXV_PRED)
2234 MAKE_CASE(AArch64ISD::FMAXNM_PRED)
2235 MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
2236 MAKE_CASE(AArch64ISD::FMIN_PRED)
2237 MAKE_CASE(AArch64ISD::FMINV_PRED)
2238 MAKE_CASE(AArch64ISD::FMINNM_PRED)
2239 MAKE_CASE(AArch64ISD::FMINNMV_PRED)
2240 MAKE_CASE(AArch64ISD::FMUL_PRED)
2241 MAKE_CASE(AArch64ISD::FSUB_PRED)
2242 MAKE_CASE(AArch64ISD::RDSVL)
2243 MAKE_CASE(AArch64ISD::BIC)
2244 MAKE_CASE(AArch64ISD::BIT)
2245 MAKE_CASE(AArch64ISD::CBZ)
2246 MAKE_CASE(AArch64ISD::CBNZ)
2247 MAKE_CASE(AArch64ISD::TBZ)
2248 MAKE_CASE(AArch64ISD::TBNZ)
2249 MAKE_CASE(AArch64ISD::TC_RETURN)
2250 MAKE_CASE(AArch64ISD::PREFETCH)
2251 MAKE_CASE(AArch64ISD::SITOF)
2252 MAKE_CASE(AArch64ISD::UITOF)
2253 MAKE_CASE(AArch64ISD::NVCAST)
2254 MAKE_CASE(AArch64ISD::MRS)
2255 MAKE_CASE(AArch64ISD::SQSHL_I)
2256 MAKE_CASE(AArch64ISD::UQSHL_I)
2257 MAKE_CASE(AArch64ISD::SRSHR_I)
2258 MAKE_CASE(AArch64ISD::URSHR_I)
2259 MAKE_CASE(AArch64ISD::SQSHLU_I)
2260 MAKE_CASE(AArch64ISD::WrapperLarge)
2261 MAKE_CASE(AArch64ISD::LD2post)
2262 MAKE_CASE(AArch64ISD::LD3post)
2263 MAKE_CASE(AArch64ISD::LD4post)
2264 MAKE_CASE(AArch64ISD::ST2post)
2265 MAKE_CASE(AArch64ISD::ST3post)
2266 MAKE_CASE(AArch64ISD::ST4post)
2267 MAKE_CASE(AArch64ISD::LD1x2post)
2268 MAKE_CASE(AArch64ISD::LD1x3post)
2269 MAKE_CASE(AArch64ISD::LD1x4post)
2270 MAKE_CASE(AArch64ISD::ST1x2post)
2271 MAKE_CASE(AArch64ISD::ST1x3post)
2272 MAKE_CASE(AArch64ISD::ST1x4post)
2273 MAKE_CASE(AArch64ISD::LD1DUPpost)
2274 MAKE_CASE(AArch64ISD::LD2DUPpost)
2275 MAKE_CASE(AArch64ISD::LD3DUPpost)
2276 MAKE_CASE(AArch64ISD::LD4DUPpost)
2277 MAKE_CASE(AArch64ISD::LD1LANEpost)
2278 MAKE_CASE(AArch64ISD::LD2LANEpost)
2279 MAKE_CASE(AArch64ISD::LD3LANEpost)
2280 MAKE_CASE(AArch64ISD::LD4LANEpost)
2281 MAKE_CASE(AArch64ISD::ST2LANEpost)
2282 MAKE_CASE(AArch64ISD::ST3LANEpost)
2283 MAKE_CASE(AArch64ISD::ST4LANEpost)
2284 MAKE_CASE(AArch64ISD::SMULL)
2285 MAKE_CASE(AArch64ISD::UMULL)
2286 MAKE_CASE(AArch64ISD::PMULL)
2287 MAKE_CASE(AArch64ISD::FRECPE)
2288 MAKE_CASE(AArch64ISD::FRECPS)
2289 MAKE_CASE(AArch64ISD::FRSQRTE)
2290 MAKE_CASE(AArch64ISD::FRSQRTS)
2291 MAKE_CASE(AArch64ISD::STG)
2292 MAKE_CASE(AArch64ISD::STZG)
2293 MAKE_CASE(AArch64ISD::ST2G)
2294 MAKE_CASE(AArch64ISD::STZ2G)
2295 MAKE_CASE(AArch64ISD::SUNPKHI)
2296 MAKE_CASE(AArch64ISD::SUNPKLO)
2297 MAKE_CASE(AArch64ISD::UUNPKHI)
2298 MAKE_CASE(AArch64ISD::UUNPKLO)
2299 MAKE_CASE(AArch64ISD::INSR)
2300 MAKE_CASE(AArch64ISD::PTEST)
2301 MAKE_CASE(AArch64ISD::PTRUE)
2302 MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
2303 MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
2304 MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
2305 MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
2306 MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
2307 MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
2308 MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
2309 MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
2310 MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
2311 MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
2312 MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
2313 MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
2314 MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
2315 MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
2316 MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
2317 MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
2318 MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
2319 MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
2320 MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
2321 MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
2322 MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
2323 MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
2324 MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
2325 MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
2326 MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
2327 MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
2328 MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
2329 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
2330 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
2331 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
2332 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
2333 MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
2334 MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
2335 MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
2336 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
2337 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
2338 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
2339 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
2340 MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
2341 MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
2342 MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
2343 MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
2344 MAKE_CASE(AArch64ISD::ST1_PRED)
2345 MAKE_CASE(AArch64ISD::SST1_PRED)
2346 MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
2347 MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
2348 MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
2349 MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
2350 MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
2351 MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
2352 MAKE_CASE(AArch64ISD::SSTNT1_PRED)
2353 MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
2354 MAKE_CASE(AArch64ISD::LDP)
2355 MAKE_CASE(AArch64ISD::LDNP)
2356 MAKE_CASE(AArch64ISD::STP)
2357 MAKE_CASE(AArch64ISD::STNP)
2358 MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
2359 MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
2360 MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU)
2361 MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU)
2362 MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU)
2363 MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
2364 MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
2365 MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
2366 MAKE_CASE(AArch64ISD::INDEX_VECTOR)
2367 MAKE_CASE(AArch64ISD::ADDP)
2368 MAKE_CASE(AArch64ISD::SADDLP)
2369 MAKE_CASE(AArch64ISD::UADDLP)
2370 MAKE_CASE(AArch64ISD::CALL_RVMARKER)
2371 MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
2372 MAKE_CASE(AArch64ISD::MOPS_MEMSET)
2373 MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING)
2374 MAKE_CASE(AArch64ISD::MOPS_MEMCOPY)
2375 MAKE_CASE(AArch64ISD::MOPS_MEMMOVE)
2376 MAKE_CASE(AArch64ISD::CALL_BTI)
2378 #undef MAKE_CASE
2379 return nullptr;
2382 MachineBasicBlock *
2383 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2384 MachineBasicBlock *MBB) const {
2385 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2386 // phi node:
2388 // OrigBB:
2389 // [... previous instrs leading to comparison ...]
2390 // b.ne TrueBB
2391 // b EndBB
2392 // TrueBB:
2393 // ; Fallthrough
2394 // EndBB:
2395 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2397 MachineFunction *MF = MBB->getParent();
2398 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2399 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2400 DebugLoc DL = MI.getDebugLoc();
2401 MachineFunction::iterator It = ++MBB->getIterator();
2403 Register DestReg = MI.getOperand(0).getReg();
2404 Register IfTrueReg = MI.getOperand(1).getReg();
2405 Register IfFalseReg = MI.getOperand(2).getReg();
2406 unsigned CondCode = MI.getOperand(3).getImm();
2407 bool NZCVKilled = MI.getOperand(4).isKill();
2409 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2410 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2411 MF->insert(It, TrueBB);
2412 MF->insert(It, EndBB);
2414 // Transfer the rest of the current basic block to EndBB.
2415 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2416 MBB->end());
2417 EndBB->transferSuccessorsAndUpdatePHIs(MBB);
2419 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2420 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2421 MBB->addSuccessor(TrueBB);
2422 MBB->addSuccessor(EndBB);
2424 // TrueBB falls through to the end.
2425 TrueBB->addSuccessor(EndBB);
2427 if (!NZCVKilled) {
2428 TrueBB->addLiveIn(AArch64::NZCV);
2429 EndBB->addLiveIn(AArch64::NZCV);
2432 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2433 .addReg(IfTrueReg)
2434 .addMBB(TrueBB)
2435 .addReg(IfFalseReg)
2436 .addMBB(MBB);
2438 MI.eraseFromParent();
2439 return EndBB;
2442 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2443 MachineInstr &MI, MachineBasicBlock *BB) const {
2444 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2445 BB->getParent()->getFunction().getPersonalityFn())) &&
2446 "SEH does not use catchret!");
2447 return BB;
2450 MachineBasicBlock *
2451 AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2452 MachineInstr &MI,
2453 MachineBasicBlock *BB) const {
2454 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2455 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2457 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2458 MIB.add(MI.getOperand(1)); // slice index register
2459 MIB.add(MI.getOperand(2)); // slice index offset
2460 MIB.add(MI.getOperand(3)); // pg
2461 MIB.add(MI.getOperand(4)); // base
2462 MIB.add(MI.getOperand(5)); // offset
2464 MI.eraseFromParent(); // The pseudo is gone now.
2465 return BB;
2468 MachineBasicBlock *
2469 AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
2470 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2471 MachineInstrBuilder MIB =
2472 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2474 MIB.addReg(AArch64::ZA, RegState::Define);
2475 MIB.add(MI.getOperand(0)); // Vector select register
2476 MIB.add(MI.getOperand(1)); // Vector select offset
2477 MIB.add(MI.getOperand(2)); // Base
2478 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2480 MI.eraseFromParent(); // The pseudo is gone now.
2481 return BB;
2484 MachineBasicBlock *
2485 AArch64TargetLowering::EmitMopa(unsigned Opc, unsigned BaseReg,
2486 MachineInstr &MI, MachineBasicBlock *BB) const {
2487 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2488 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2490 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2491 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2492 MIB.add(MI.getOperand(1)); // pn
2493 MIB.add(MI.getOperand(2)); // pm
2494 MIB.add(MI.getOperand(3)); // zn
2495 MIB.add(MI.getOperand(4)); // zm
2497 MI.eraseFromParent(); // The pseudo is gone now.
2498 return BB;
2501 MachineBasicBlock *
2502 AArch64TargetLowering::EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg,
2503 MachineInstr &MI,
2504 MachineBasicBlock *BB) const {
2505 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2506 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2508 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2509 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2510 MIB.add(MI.getOperand(1)); // Slice index register
2511 MIB.add(MI.getOperand(2)); // Slice index offset
2512 MIB.add(MI.getOperand(3)); // pg
2513 MIB.add(MI.getOperand(4)); // zn
2515 MI.eraseFromParent(); // The pseudo is gone now.
2516 return BB;
2519 MachineBasicBlock *
2520 AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
2521 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2522 MachineInstrBuilder MIB =
2523 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2524 MIB.add(MI.getOperand(0)); // Mask
2526 unsigned Mask = MI.getOperand(0).getImm();
2527 for (unsigned I = 0; I < 8; I++) {
2528 if (Mask & (1 << I))
2529 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2532 MI.eraseFromParent(); // The pseudo is gone now.
2533 return BB;
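// Editorial example: a ZERO_M_PSEUDO with mask 0b00000101 adds implicit defs
// of ZAD0 and ZAD2, so later passes see exactly which ZA tiles are written.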
2536 MachineBasicBlock *
2537 AArch64TargetLowering::EmitAddVectorToTile(unsigned Opc, unsigned BaseReg,
2538 MachineInstr &MI,
2539 MachineBasicBlock *BB) const {
2540 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2541 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2543 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2544 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2545 MIB.add(MI.getOperand(1)); // pn
2546 MIB.add(MI.getOperand(2)); // pm
2547 MIB.add(MI.getOperand(3)); // zn
2549 MI.eraseFromParent(); // The pseudo is gone now.
2550 return BB;
2553 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2554 MachineInstr &MI, MachineBasicBlock *BB) const {
2555 switch (MI.getOpcode()) {
2556 default:
2557 #ifndef NDEBUG
2558 MI.dump();
2559 #endif
2560 llvm_unreachable("Unexpected instruction for custom inserter!");
2562 case AArch64::F128CSEL:
2563 return EmitF128CSEL(MI, BB);
2564 case TargetOpcode::STATEPOINT:
2565 // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
2566 // while the BL call instruction (to which the statepoint is eventually
2567 // lowered) has an implicit def of LR. This def is early-clobber, as it is
2568 // set at the moment of the call, before any use is read.
2569 // Add this implicit dead def here as a workaround.
2570 MI.addOperand(*MI.getMF(),
2571 MachineOperand::CreateReg(
2572 AArch64::LR, /*isDef*/ true,
2573 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2574 /*isUndef*/ false, /*isEarlyClobber*/ true));
2575 [[fallthrough]];
2576 case TargetOpcode::STACKMAP:
2577 case TargetOpcode::PATCHPOINT:
2578 return emitPatchPoint(MI, BB);
2580 case AArch64::CATCHRET:
2581 return EmitLoweredCatchRet(MI, BB);
2582 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2583 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2584 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2585 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2586 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2587 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2588 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2589 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2590 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2591 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2592 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2593 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2594 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2595 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2596 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2597 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2598 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2599 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2600 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2601 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2602 case AArch64::LDR_ZA_PSEUDO:
2603 return EmitFill(MI, BB);
2604 case AArch64::BFMOPA_MPPZZ_PSEUDO:
2605 return EmitMopa(AArch64::BFMOPA_MPPZZ, AArch64::ZAS0, MI, BB);
2606 case AArch64::BFMOPS_MPPZZ_PSEUDO:
2607 return EmitMopa(AArch64::BFMOPS_MPPZZ, AArch64::ZAS0, MI, BB);
2608 case AArch64::FMOPAL_MPPZZ_PSEUDO:
2609 return EmitMopa(AArch64::FMOPAL_MPPZZ, AArch64::ZAS0, MI, BB);
2610 case AArch64::FMOPSL_MPPZZ_PSEUDO:
2611 return EmitMopa(AArch64::FMOPSL_MPPZZ, AArch64::ZAS0, MI, BB);
2612 case AArch64::FMOPA_MPPZZ_S_PSEUDO:
2613 return EmitMopa(AArch64::FMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2614 case AArch64::FMOPS_MPPZZ_S_PSEUDO:
2615 return EmitMopa(AArch64::FMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2616 case AArch64::FMOPA_MPPZZ_D_PSEUDO:
2617 return EmitMopa(AArch64::FMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2618 case AArch64::FMOPS_MPPZZ_D_PSEUDO:
2619 return EmitMopa(AArch64::FMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2620 case AArch64::SMOPA_MPPZZ_S_PSEUDO:
2621 return EmitMopa(AArch64::SMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2622 case AArch64::SMOPS_MPPZZ_S_PSEUDO:
2623 return EmitMopa(AArch64::SMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2624 case AArch64::UMOPA_MPPZZ_S_PSEUDO:
2625 return EmitMopa(AArch64::UMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2626 case AArch64::UMOPS_MPPZZ_S_PSEUDO:
2627 return EmitMopa(AArch64::UMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2628 case AArch64::SUMOPA_MPPZZ_S_PSEUDO:
2629 return EmitMopa(AArch64::SUMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2630 case AArch64::SUMOPS_MPPZZ_S_PSEUDO:
2631 return EmitMopa(AArch64::SUMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2632 case AArch64::USMOPA_MPPZZ_S_PSEUDO:
2633 return EmitMopa(AArch64::USMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2634 case AArch64::USMOPS_MPPZZ_S_PSEUDO:
2635 return EmitMopa(AArch64::USMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2636 case AArch64::SMOPA_MPPZZ_D_PSEUDO:
2637 return EmitMopa(AArch64::SMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2638 case AArch64::SMOPS_MPPZZ_D_PSEUDO:
2639 return EmitMopa(AArch64::SMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2640 case AArch64::UMOPA_MPPZZ_D_PSEUDO:
2641 return EmitMopa(AArch64::UMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2642 case AArch64::UMOPS_MPPZZ_D_PSEUDO:
2643 return EmitMopa(AArch64::UMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2644 case AArch64::SUMOPA_MPPZZ_D_PSEUDO:
2645 return EmitMopa(AArch64::SUMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2646 case AArch64::SUMOPS_MPPZZ_D_PSEUDO:
2647 return EmitMopa(AArch64::SUMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2648 case AArch64::USMOPA_MPPZZ_D_PSEUDO:
2649 return EmitMopa(AArch64::USMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2650 case AArch64::USMOPS_MPPZZ_D_PSEUDO:
2651 return EmitMopa(AArch64::USMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2652 case AArch64::INSERT_MXIPZ_H_PSEUDO_B:
2653 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_B, AArch64::ZAB0, MI,
2654 BB);
2655 case AArch64::INSERT_MXIPZ_H_PSEUDO_H:
2656 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_H, AArch64::ZAH0, MI,
2657 BB);
2658 case AArch64::INSERT_MXIPZ_H_PSEUDO_S:
2659 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_S, AArch64::ZAS0, MI,
2660 BB);
2661 case AArch64::INSERT_MXIPZ_H_PSEUDO_D:
2662 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_D, AArch64::ZAD0, MI,
2663 BB);
2664 case AArch64::INSERT_MXIPZ_H_PSEUDO_Q:
2665 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_Q, AArch64::ZAQ0, MI,
2666 BB);
2667 case AArch64::INSERT_MXIPZ_V_PSEUDO_B:
2668 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_B, AArch64::ZAB0, MI,
2669 BB);
2670 case AArch64::INSERT_MXIPZ_V_PSEUDO_H:
2671 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_H, AArch64::ZAH0, MI,
2672 BB);
2673 case AArch64::INSERT_MXIPZ_V_PSEUDO_S:
2674 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_S, AArch64::ZAS0, MI,
2675 BB);
2676 case AArch64::INSERT_MXIPZ_V_PSEUDO_D:
2677 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_D, AArch64::ZAD0, MI,
2678 BB);
2679 case AArch64::INSERT_MXIPZ_V_PSEUDO_Q:
2680 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_Q, AArch64::ZAQ0, MI,
2681 BB);
2682 case AArch64::ZERO_M_PSEUDO:
2683 return EmitZero(MI, BB);
2684 case AArch64::ADDHA_MPPZ_PSEUDO_S:
2685 return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_S, AArch64::ZAS0, MI, BB);
2686 case AArch64::ADDVA_MPPZ_PSEUDO_S:
2687 return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_S, AArch64::ZAS0, MI, BB);
2688 case AArch64::ADDHA_MPPZ_PSEUDO_D:
2689 return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_D, AArch64::ZAD0, MI, BB);
2690 case AArch64::ADDVA_MPPZ_PSEUDO_D:
2691 return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_D, AArch64::ZAD0, MI, BB);
2695 //===----------------------------------------------------------------------===//
2696 // AArch64 Lowering private implementation.
2697 //===----------------------------------------------------------------------===//
2699 //===----------------------------------------------------------------------===//
2700 // Lowering Code
2701 //===----------------------------------------------------------------------===//
2703 // Forward declarations of SVE fixed length lowering helpers
2704 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
2705 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2706 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2707 static SDValue convertFixedMaskToScalableVector(SDValue Mask,
2708 SelectionDAG &DAG);
2709 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
2710 EVT VT);
2712 /// isZerosVector - Check whether SDNode N is a zero-filled vector.
2713 static bool isZerosVector(const SDNode *N) {
2714 // Look through a bit convert.
2715 while (N->getOpcode() == ISD::BITCAST)
2716 N = N->getOperand(0).getNode();
2718 if (ISD::isConstantSplatVectorAllZeros(N))
2719 return true;
2721 if (N->getOpcode() != AArch64ISD::DUP)
2722 return false;
2724 auto Opnd0 = N->getOperand(0);
2725 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
2728 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2729 /// CC
2730 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
2731 switch (CC) {
2732 default:
2733 llvm_unreachable("Unknown condition code!");
2734 case ISD::SETNE:
2735 return AArch64CC::NE;
2736 case ISD::SETEQ:
2737 return AArch64CC::EQ;
2738 case ISD::SETGT:
2739 return AArch64CC::GT;
2740 case ISD::SETGE:
2741 return AArch64CC::GE;
2742 case ISD::SETLT:
2743 return AArch64CC::LT;
2744 case ISD::SETLE:
2745 return AArch64CC::LE;
2746 case ISD::SETUGT:
2747 return AArch64CC::HI;
2748 case ISD::SETUGE:
2749 return AArch64CC::HS;
2750 case ISD::SETULT:
2751 return AArch64CC::LO;
2752 case ISD::SETULE:
2753 return AArch64CC::LS;
2757 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2758 static void changeFPCCToAArch64CC(ISD::CondCode CC,
2759 AArch64CC::CondCode &CondCode,
2760 AArch64CC::CondCode &CondCode2) {
2761 CondCode2 = AArch64CC::AL;
2762 switch (CC) {
2763 default:
2764 llvm_unreachable("Unknown FP condition!");
2765 case ISD::SETEQ:
2766 case ISD::SETOEQ:
2767 CondCode = AArch64CC::EQ;
2768 break;
2769 case ISD::SETGT:
2770 case ISD::SETOGT:
2771 CondCode = AArch64CC::GT;
2772 break;
2773 case ISD::SETGE:
2774 case ISD::SETOGE:
2775 CondCode = AArch64CC::GE;
2776 break;
2777 case ISD::SETOLT:
2778 CondCode = AArch64CC::MI;
2779 break;
2780 case ISD::SETOLE:
2781 CondCode = AArch64CC::LS;
2782 break;
2783 case ISD::SETONE:
2784 CondCode = AArch64CC::MI;
2785 CondCode2 = AArch64CC::GT;
2786 break;
2787 case ISD::SETO:
2788 CondCode = AArch64CC::VC;
2789 break;
2790 case ISD::SETUO:
2791 CondCode = AArch64CC::VS;
2792 break;
2793 case ISD::SETUEQ:
2794 CondCode = AArch64CC::EQ;
2795 CondCode2 = AArch64CC::VS;
2796 break;
2797 case ISD::SETUGT:
2798 CondCode = AArch64CC::HI;
2799 break;
2800 case ISD::SETUGE:
2801 CondCode = AArch64CC::PL;
2802 break;
2803 case ISD::SETLT:
2804 case ISD::SETULT:
2805 CondCode = AArch64CC::LT;
2806 break;
2807 case ISD::SETLE:
2808 case ISD::SETULE:
2809 CondCode = AArch64CC::LE;
2810 break;
2811 case ISD::SETNE:
2812 case ISD::SETUNE:
2813 CondCode = AArch64CC::NE;
2814 break;
2818 /// Convert a DAG fp condition code to an AArch64 CC.
2819 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
2820 /// should be AND'ed instead of OR'ed.
2821 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
2822 AArch64CC::CondCode &CondCode,
2823 AArch64CC::CondCode &CondCode2) {
2824 CondCode2 = AArch64CC::AL;
2825 switch (CC) {
2826 default:
2827 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2828 assert(CondCode2 == AArch64CC::AL);
2829 break;
2830 case ISD::SETONE:
2831 // (a one b)
2832 // == ((a olt b) || (a ogt b))
2833 // == ((a ord b) && (a une b))
2834 CondCode = AArch64CC::VC;
2835 CondCode2 = AArch64CC::NE;
2836 break;
2837 case ISD::SETUEQ:
2838 // (a ueq b)
2839 // == ((a uno b) || (a oeq b))
2840 // == ((a ule b) && (a uge b))
2841 CondCode = AArch64CC::PL;
2842 CondCode2 = AArch64CC::LE;
2843 break;
2847 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
2848 /// CC usable with the vector instructions. Fewer operations are available
2849 /// without a real NZCV register, so we have to use less efficient combinations
2850 /// to get the same effect.
2851 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
2852 AArch64CC::CondCode &CondCode,
2853 AArch64CC::CondCode &CondCode2,
2854 bool &Invert) {
2855 Invert = false;
2856 switch (CC) {
2857 default:
2858 // Mostly the scalar mappings work fine.
2859 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2860 break;
2861 case ISD::SETUO:
2862 Invert = true;
2863 [[fallthrough]];
2864 case ISD::SETO:
2865 CondCode = AArch64CC::MI;
2866 CondCode2 = AArch64CC::GE;
2867 break;
2868 case ISD::SETUEQ:
2869 case ISD::SETULT:
2870 case ISD::SETULE:
2871 case ISD::SETUGT:
2872 case ISD::SETUGE:
2873 // All of the compare-mask comparisons are ordered, but we can switch
2874 // between the two by a double inversion. E.g. ULE == !OGT.
2875 Invert = true;
2876 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
2877 CondCode, CondCode2);
2878 break;
2882 static bool isLegalArithImmed(uint64_t C) {
2883 // Matches AArch64DAGToDAGISel::SelectArithImmed().
2884 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
2885 LLVM_DEBUG(dbgs() << "Is imm " << C
2886 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
2887 return IsLegal;
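// For illustration, a few values run through the check above:
//   C = 0xFFF  : C >> 12 == 0                       -> legal (plain 12-bit immediate)
//   C = 0x1000 : (C & 0xFFF) == 0 and C >> 24 == 0  -> legal (12-bit immediate, LSL #12)
//   C = 0x1001 : fails both tests                   -> not legal, must be materialized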
2890 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
2891 // the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
2892 // can be set differently by this operation. It comes down to whether
2893 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are equal
2894 // then everything is fine; if not, the optimization is wrong. Thus general
2895 // comparisons are only valid if op2 != 0.
2897 // So, finally, the only LLVM-native comparisons that don't mention C and V
2898 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
2899 // the absence of information about op2.
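// A concrete illustration of the caveat above (informal, not exhaustive): for
// 32-bit values with op1 = -1 and op2 = 0, "cmp w0, w1" computes -1 - 0 and
// sets C = 1 (no borrow), while "cmn w0, w1" computes -1 + 0 and sets C = 0.
// An unsigned test such as HS/LO would therefore observe different flags, but
// EQ/NE only read the Z flag, which is identical in both cases.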
2900 static bool isCMN(SDValue Op, ISD::CondCode CC) {
2901 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
2902 (CC == ISD::SETEQ || CC == ISD::SETNE);
2905 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
2906 SelectionDAG &DAG, SDValue Chain,
2907 bool IsSignaling) {
2908 EVT VT = LHS.getValueType();
2909 assert(VT != MVT::f128);
2911 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
2913 if (VT == MVT::f16 && !FullFP16) {
2914 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
2915 {Chain, LHS});
2916 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
2917 {LHS.getValue(1), RHS});
2918 Chain = RHS.getValue(1);
2919 VT = MVT::f32;
2921 unsigned Opcode =
2922 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
2923 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
2926 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2927 const SDLoc &dl, SelectionDAG &DAG) {
2928 EVT VT = LHS.getValueType();
2929 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
2931 if (VT.isFloatingPoint()) {
2932 assert(VT != MVT::f128);
2933 if (VT == MVT::f16 && !FullFP16) {
2934 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
2935 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
2936 VT = MVT::f32;
2938 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
2941 // The CMP instruction is just an alias for SUBS, and representing it as
2942 // SUBS means that it's possible to get CSE with subtract operations.
2943 // A later phase can perform the optimization of setting the destination
2944 // register to WZR/XZR if it ends up being unused.
2945 unsigned Opcode = AArch64ISD::SUBS;
2947 if (isCMN(RHS, CC)) {
2948 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
2949 Opcode = AArch64ISD::ADDS;
2950 RHS = RHS.getOperand(1);
2951 } else if (isCMN(LHS, CC)) {
2952 // As we are looking for EQ/NE compares, the operands can be commuted; can
2953 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
2954 Opcode = AArch64ISD::ADDS;
2955 LHS = LHS.getOperand(1);
2956 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
2957 if (LHS.getOpcode() == ISD::AND) {
2958 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
2959 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
2960 // of the signed comparisons.
2961 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
2962 DAG.getVTList(VT, MVT_CC),
2963 LHS.getOperand(0),
2964 LHS.getOperand(1));
2965 // Replace all users of (and X, Y) with newly generated (ands X, Y)
2966 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
2967 return ANDSNode.getValue(1);
2968 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
2969 // Use result of ANDS
2970 return LHS.getValue(1);
2974 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
2975 .getValue(1);
2978 /// \defgroup AArch64CCMP CMP;CCMP matching
2980 /// These functions deal with the formation of CMP;CCMP;... sequences.
2981 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
2982 /// a comparison. They set the NZCV flags to a predefined value if their
2983 /// predicate is false. This makes it possible to express arbitrary
2984 /// conjunctions, for example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
2985 /// can be expressed as:
2986 /// cmp A
2987 /// ccmp B, inv(CB), CA
2988 /// check for CB flags
2990 /// This naturally lets us implement chains of AND operations with SETCC
2991 /// operands. And we can even implement some other situations by transforming
2992 /// them:
2993 /// - We can implement (NEG SETCC), i.e. negate a single comparison, by
2994 /// negating the flags used in the CCMP/FCCMP operation.
2995 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
2996 /// by negating the flags we test for afterwards, i.e.
2997 /// NEG (CMP CCMP CCMP ...) can be implemented.
2998 /// - Note that we can only ever negate all previously processed results.
2999 /// What we cannot implement by flipping the flags to test is a negation
3000 /// of two sub-trees (because the negation affects all sub-trees emitted so
3001 /// far, so the 2nd sub-tree we emit would also affect the first).
3002 /// With those tools we can implement some OR operations:
3003 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
3004 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3005 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
3006 /// elimination rules from earlier to implement the whole thing as a
3007 /// CCMP/FCCMP chain.
3009 /// As complete example:
3010 /// or (or (setCA (cmp A)) (setCB (cmp B)))
3011 /// (and (setCC (cmp C)) (setCD (cmp D)))
3012 /// can be reassociated to:
3013 /// or (and (setCC (cmp C)) (setCD (cmp D)))
3014 /// (or (setCA (cmp A)) (setCB (cmp B)))
3015 /// can be transformed to:
3016 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3017 /// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3018 /// which can be implemented as:
3019 /// cmp C
3020 /// ccmp D, inv(CD), CC
3021 /// ccmp A, CA, inv(CD)
3022 /// ccmp B, CB, inv(CA)
3023 /// check for CB flags
3025 /// A counterexample is "or (and A B) (and C D)", which translates to
3026 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
3027 /// can only implement one of the inner (not) operations, but not both!
3028 /// @{
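// As a rough end-to-end illustration (the exact registers and immediates are
// only indicative), the C expression "a == 0 && b == 0" for two i32 values is
// typically selected as:
//   cmp  w0, #0            // set flags for a == 0
//   ccmp w1, #0, #0, eq    // if a == 0, compare b with 0; otherwise force NZCV = 0000
//   cset w0, eq            // the final result only reads the Z flag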
3030 /// Create a conditional comparison; use CCMP, CCMN or FCCMP as appropriate.
3031 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3032 ISD::CondCode CC, SDValue CCOp,
3033 AArch64CC::CondCode Predicate,
3034 AArch64CC::CondCode OutCC,
3035 const SDLoc &DL, SelectionDAG &DAG) {
3036 unsigned Opcode = 0;
3037 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3039 if (LHS.getValueType().isFloatingPoint()) {
3040 assert(LHS.getValueType() != MVT::f128);
3041 if (LHS.getValueType() == MVT::f16 && !FullFP16) {
3042 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3043 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3045 Opcode = AArch64ISD::FCCMP;
3046 } else if (RHS.getOpcode() == ISD::SUB) {
3047 SDValue SubOp0 = RHS.getOperand(0);
3048 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3049 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3050 Opcode = AArch64ISD::CCMN;
3051 RHS = RHS.getOperand(1);
3054 if (Opcode == 0)
3055 Opcode = AArch64ISD::CCMP;
3057 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3058 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3059 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3060 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3061 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3064 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3065 /// expressed as a conjunction. See \ref AArch64CCMP.
3066 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
3067 /// changing the conditions on the SETCC tests.
3068 /// (this means we can call emitConjunctionRec() with
3069 /// Negate==true on this sub-tree)
3070 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
3071 /// cannot do the negation naturally. We are required to
3072 /// emit the subtree first in this case.
3073 /// \param WillNegate Is true if we are called when the result of this
3074 /// subexpression must be negated. This happens when the
3075 /// outer expression is an OR. We can use this fact to know
3076 /// that we have a double negation (or (or ...) ...) that
3077 /// can be implemented for free.
3078 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3079 bool &MustBeFirst, bool WillNegate,
3080 unsigned Depth = 0) {
3081 if (!Val.hasOneUse())
3082 return false;
3083 unsigned Opcode = Val->getOpcode();
3084 if (Opcode == ISD::SETCC) {
3085 if (Val->getOperand(0).getValueType() == MVT::f128)
3086 return false;
3087 CanNegate = true;
3088 MustBeFirst = false;
3089 return true;
3091 // Protect against exponential runtime and stack overflow.
3092 if (Depth > 6)
3093 return false;
3094 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3095 bool IsOR = Opcode == ISD::OR;
3096 SDValue O0 = Val->getOperand(0);
3097 SDValue O1 = Val->getOperand(1);
3098 bool CanNegateL;
3099 bool MustBeFirstL;
3100 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3101 return false;
3102 bool CanNegateR;
3103 bool MustBeFirstR;
3104 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3105 return false;
3107 if (MustBeFirstL && MustBeFirstR)
3108 return false;
3110 if (IsOR) {
3111 // For an OR expression we need to be able to naturally negate at least
3112 // one side or we cannot do the transformation at all.
3113 if (!CanNegateL && !CanNegateR)
3114 return false;
3115 // If the result of the OR will be negated and we can naturally negate
3116 // the leaves, then this sub-tree as a whole negates naturally.
3117 CanNegate = WillNegate && CanNegateL && CanNegateR;
3118 // If we cannot naturally negate the whole sub-tree, then this must be
3119 // emitted first.
3120 MustBeFirst = !CanNegate;
3121 } else {
3122 assert(Opcode == ISD::AND && "Must be OR or AND");
3123 // We cannot naturally negate an AND operation.
3124 CanNegate = false;
3125 MustBeFirst = MustBeFirstL || MustBeFirstR;
3127 return true;
3129 return false;
3132 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3133 /// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3134 /// Tries to transform the given i1-producing node @p Val into a series of
3135 /// compare and conditional compare operations. @returns an NZCV-flags-producing
3136 /// node and sets @p OutCC to the flags that should be tested, or returns
3137 /// SDValue() if the transformation was not possible.
3138 /// \p Negate is true if we want this sub-tree to be negated just by changing
3139 /// SETCC conditions.
3140 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3141 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3142 AArch64CC::CondCode Predicate) {
3143 // We're at a tree leaf, produce a conditional comparison operation.
3144 unsigned Opcode = Val->getOpcode();
3145 if (Opcode == ISD::SETCC) {
3146 SDValue LHS = Val->getOperand(0);
3147 SDValue RHS = Val->getOperand(1);
3148 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3149 bool isInteger = LHS.getValueType().isInteger();
3150 if (Negate)
3151 CC = getSetCCInverse(CC, LHS.getValueType());
3152 SDLoc DL(Val);
3153 // Determine OutCC and handle FP special case.
3154 if (isInteger) {
3155 OutCC = changeIntCCToAArch64CC(CC);
3156 } else {
3157 assert(LHS.getValueType().isFloatingPoint());
3158 AArch64CC::CondCode ExtraCC;
3159 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3160 // Some floating point conditions can't be tested with a single condition
3161 // code. Construct an additional comparison in this case.
3162 if (ExtraCC != AArch64CC::AL) {
3163 SDValue ExtraCmp;
3164 if (!CCOp.getNode())
3165 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3166 else
3167 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3168 ExtraCC, DL, DAG);
3169 CCOp = ExtraCmp;
3170 Predicate = ExtraCC;
3174 // Produce a normal comparison if we are first in the chain
3175 if (!CCOp)
3176 return emitComparison(LHS, RHS, CC, DL, DAG);
3177 // Otherwise produce a ccmp.
3178 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3179 DAG);
3181 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3183 bool IsOR = Opcode == ISD::OR;
3185 SDValue LHS = Val->getOperand(0);
3186 bool CanNegateL;
3187 bool MustBeFirstL;
3188 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3189 assert(ValidL && "Valid conjunction/disjunction tree");
3190 (void)ValidL;
3192 SDValue RHS = Val->getOperand(1);
3193 bool CanNegateR;
3194 bool MustBeFirstR;
3195 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3196 assert(ValidR && "Valid conjunction/disjunction tree");
3197 (void)ValidR;
3199 // Swap sub-tree that must come first to the right side.
3200 if (MustBeFirstL) {
3201 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3202 std::swap(LHS, RHS);
3203 std::swap(CanNegateL, CanNegateR);
3204 std::swap(MustBeFirstL, MustBeFirstR);
3207 bool NegateR;
3208 bool NegateAfterR;
3209 bool NegateL;
3210 bool NegateAfterAll;
3211 if (Opcode == ISD::OR) {
3212 // Swap the sub-tree that we can negate naturally to the left.
3213 if (!CanNegateL) {
3214 assert(CanNegateR && "at least one side must be negatable");
3215 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3216 assert(!Negate);
3217 std::swap(LHS, RHS);
3218 NegateR = false;
3219 NegateAfterR = true;
3220 } else {
3221 // Negate the left sub-tree if possible, otherwise negate the result.
3222 NegateR = CanNegateR;
3223 NegateAfterR = !CanNegateR;
3225 NegateL = true;
3226 NegateAfterAll = !Negate;
3227 } else {
3228 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3229 assert(!Negate && "Valid conjunction/disjunction tree");
3231 NegateL = false;
3232 NegateR = false;
3233 NegateAfterR = false;
3234 NegateAfterAll = false;
3237 // Emit sub-trees.
3238 AArch64CC::CondCode RHSCC;
3239 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3240 if (NegateAfterR)
3241 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3242 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3243 if (NegateAfterAll)
3244 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3245 return CmpL;
3248 /// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3249 /// In some cases this is even possible with OR operations in the expression.
3250 /// See \ref AArch64CCMP.
3251 /// \see emitConjunctionRec().
3252 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3253 AArch64CC::CondCode &OutCC) {
3254 bool DummyCanNegate;
3255 bool DummyMustBeFirst;
3256 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3257 return SDValue();
3259 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3262 /// @}
3264 /// Returns how profitable it is to fold a comparison's operand's shift and/or
3265 /// extension operations.
3266 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3267 auto isSupportedExtend = [&](SDValue V) {
3268 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3269 return true;
3271 if (V.getOpcode() == ISD::AND)
3272 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3273 uint64_t Mask = MaskCst->getZExtValue();
3274 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3277 return false;
3280 if (!Op.hasOneUse())
3281 return 0;
3283 if (isSupportedExtend(Op))
3284 return 1;
3286 unsigned Opc = Op.getOpcode();
3287 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3288 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3289 uint64_t Shift = ShiftCst->getZExtValue();
3290 if (isSupportedExtend(Op.getOperand(0)))
3291 return (Shift <= 4) ? 2 : 1;
3292 EVT VT = Op.getValueType();
3293 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3294 return 1;
3297 return 0;
3300 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3301 SDValue &AArch64cc, SelectionDAG &DAG,
3302 const SDLoc &dl) {
3303 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3304 EVT VT = RHS.getValueType();
3305 uint64_t C = RHSC->getZExtValue();
3306 if (!isLegalArithImmed(C)) {
3307 // Constant does not fit, try adjusting it by one?
3308 switch (CC) {
3309 default:
3310 break;
3311 case ISD::SETLT:
3312 case ISD::SETGE:
3313 if ((VT == MVT::i32 && C != 0x80000000 &&
3314 isLegalArithImmed((uint32_t)(C - 1))) ||
3315 (VT == MVT::i64 && C != 0x80000000ULL &&
3316 isLegalArithImmed(C - 1ULL))) {
3317 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3318 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3319 RHS = DAG.getConstant(C, dl, VT);
3321 break;
3322 case ISD::SETULT:
3323 case ISD::SETUGE:
3324 if ((VT == MVT::i32 && C != 0 &&
3325 isLegalArithImmed((uint32_t)(C - 1))) ||
3326 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3327 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3328 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3329 RHS = DAG.getConstant(C, dl, VT);
3331 break;
3332 case ISD::SETLE:
3333 case ISD::SETGT:
3334 if ((VT == MVT::i32 && C != INT32_MAX &&
3335 isLegalArithImmed((uint32_t)(C + 1))) ||
3336 (VT == MVT::i64 && C != INT64_MAX &&
3337 isLegalArithImmed(C + 1ULL))) {
3338 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3339 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3340 RHS = DAG.getConstant(C, dl, VT);
3342 break;
3343 case ISD::SETULE:
3344 case ISD::SETUGT:
3345 if ((VT == MVT::i32 && C != UINT32_MAX &&
3346 isLegalArithImmed((uint32_t)(C + 1))) ||
3347 (VT == MVT::i64 && C != UINT64_MAX &&
3348 isLegalArithImmed(C + 1ULL))) {
3349 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3350 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3351 RHS = DAG.getConstant(C, dl, VT);
3353 break;
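// As an example of the adjustment above: "x s< 0x1001" cannot use 0x1001 as an
// ADDS/SUBS immediate, but it is equivalent to "x s<= 0x1000", and 0x1000 is
// encodable as a shifted 12-bit immediate, so the condition and the constant
// are rewritten together.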
3358 // Comparisons are canonicalized so that the RHS operand is simpler than the
3359 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3360 // can fold some shift+extend operations on the RHS operand, so swap the
3361 // operands if that can be done.
3363 // For example:
3364 // lsl w13, w11, #1
3365 // cmp w13, w12
3366 // can be turned into:
3367 // cmp w12, w11, lsl #1
3368 if (!isa<ConstantSDNode>(RHS) ||
3369 !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
3370 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3372 if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3373 std::swap(LHS, RHS);
3374 CC = ISD::getSetCCSwappedOperands(CC);
3378 SDValue Cmp;
3379 AArch64CC::CondCode AArch64CC;
3380 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3381 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3383 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3384 // For the i8 operand, the largest immediate is 255, so this can be easily
3385 // encoded in the compare instruction. For the i16 operand, however, the
3386 // largest immediate cannot be encoded in the compare.
3387 // Therefore, use a sign extending load and cmn to avoid materializing the
3388 // -1 constant. For example,
3389 // movz w1, #65535
3390 // ldrh w0, [x0, #0]
3391 // cmp w0, w1
3392 // >
3393 // ldrsh w0, [x0, #0]
3394 // cmn w0, #1
3395 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3396 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3397 // ensure both the LHS and RHS are truly zero extended and to make sure the
3398 // transformation is profitable.
3399 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3400 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3401 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3402 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3403 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
3404 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3405 SDValue SExt =
3406 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3407 DAG.getValueType(MVT::i16));
3408 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3409 RHS.getValueType()),
3410 CC, dl, DAG);
3411 AArch64CC = changeIntCCToAArch64CC(CC);
3415 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3416 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3417 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3418 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3423 if (!Cmp) {
3424 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3425 AArch64CC = changeIntCCToAArch64CC(CC);
3427 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3428 return Cmp;
3431 static std::pair<SDValue, SDValue>
3432 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3433 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3434 "Unsupported value type");
3435 SDValue Value, Overflow;
3436 SDLoc DL(Op);
3437 SDValue LHS = Op.getOperand(0);
3438 SDValue RHS = Op.getOperand(1);
3439 unsigned Opc = 0;
3440 switch (Op.getOpcode()) {
3441 default:
3442 llvm_unreachable("Unknown overflow instruction!");
3443 case ISD::SADDO:
3444 Opc = AArch64ISD::ADDS;
3445 CC = AArch64CC::VS;
3446 break;
3447 case ISD::UADDO:
3448 Opc = AArch64ISD::ADDS;
3449 CC = AArch64CC::HS;
3450 break;
3451 case ISD::SSUBO:
3452 Opc = AArch64ISD::SUBS;
3453 CC = AArch64CC::VS;
3454 break;
3455 case ISD::USUBO:
3456 Opc = AArch64ISD::SUBS;
3457 CC = AArch64CC::LO;
3458 break;
3459 // Multiply needs a little bit of extra work.
3460 case ISD::SMULO:
3461 case ISD::UMULO: {
3462 CC = AArch64CC::NE;
3463 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3464 if (Op.getValueType() == MVT::i32) {
3465 // Extend to 64-bits, then perform a 64-bit multiply.
3466 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3467 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3468 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3469 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3470 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3472 // Check that the result fits into a 32-bit integer.
3473 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3474 if (IsSigned) {
3475 // cmp xreg, wreg, sxtw
3476 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3477 Overflow =
3478 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3479 } else {
3480 // tst xreg, #0xffffffff00000000
3481 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3482 Overflow =
3483 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3485 break;
3487 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3488 // For the 64-bit multiply, compute the low 64 bits of the product directly.
3489 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3490 if (IsSigned) {
3491 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3492 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3493 DAG.getConstant(63, DL, MVT::i64));
3494 // It is important that LowerBits is last, otherwise the arithmetic
3495 // shift will not be folded into the compare (SUBS).
3496 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3497 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3498 .getValue(1);
3499 } else {
3500 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3501 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3502 Overflow =
3503 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3504 DAG.getConstant(0, DL, MVT::i64),
3505 UpperBits).getValue(1);
3507 break;
3509 } // switch (...)
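// For reference, the i64 smul.with.overflow check above typically becomes
// something like (indicative only):
//   mul   x8, x0, x1        // low 64 bits of the product
//   smulh x9, x0, x1        // high 64 bits of the product
//   cmp   x9, x8, asr #63   // overflow iff the high half differs from the
//                           // sign-extension of the low half
// with the NE condition reporting overflow.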
3511 if (Opc) {
3512 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3514 // Emit the AArch64 operation with overflow check.
3515 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3516 Overflow = Value.getValue(1);
3518 return std::make_pair(Value, Overflow);
3521 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3522 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
3523 return LowerToScalableOp(Op, DAG);
3525 SDValue Sel = Op.getOperand(0);
3526 SDValue Other = Op.getOperand(1);
3527 SDLoc dl(Sel);
3529 // If the operand is an overflow checking operation, invert the condition
3530 // code and kill the Not operation. I.e., transform:
3531 // (xor overflow_op_bool, 1)
3532 // -->
3533 // (csel 1, 0, invert(cc), overflow_op_bool)
3534 // ... which later gets transformed to just a cset instruction with an
3535 // inverted condition code, rather than a cset + eor sequence.
3536 if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3537 // Only lower legal XALUO ops.
3538 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3539 return SDValue();
3541 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3542 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3543 AArch64CC::CondCode CC;
3544 SDValue Value, Overflow;
3545 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3546 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3547 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3548 CCVal, Overflow);
3550 // If neither operand is a SELECT_CC, give up.
3551 if (Sel.getOpcode() != ISD::SELECT_CC)
3552 std::swap(Sel, Other);
3553 if (Sel.getOpcode() != ISD::SELECT_CC)
3554 return Op;
3556 // The folding we want to perform is:
3557 // (xor x, (select_cc a, b, cc, 0, -1) )
3558 // -->
3559 // (csel x, (xor x, -1), cc ...)
3561 // The latter will get matched to a CSINV instruction.
3563 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3564 SDValue LHS = Sel.getOperand(0);
3565 SDValue RHS = Sel.getOperand(1);
3566 SDValue TVal = Sel.getOperand(2);
3567 SDValue FVal = Sel.getOperand(3);
3569 // FIXME: This could be generalized to non-integer comparisons.
3570 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3571 return Op;
3573 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3574 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3576 // The values aren't constants, this isn't the pattern we're looking for.
3577 if (!CFVal || !CTVal)
3578 return Op;
3580 // We can commute the SELECT_CC by inverting the condition. This
3581 // might be needed to make this fit into a CSINV pattern.
3582 if (CTVal->isAllOnes() && CFVal->isZero()) {
3583 std::swap(TVal, FVal);
3584 std::swap(CTVal, CFVal);
3585 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3588 // If the constants line up, perform the transform!
3589 if (CTVal->isZero() && CFVal->isAllOnes()) {
3590 SDValue CCVal;
3591 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3593 FVal = Other;
3594 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3595 DAG.getConstant(-1ULL, dl, Other.getValueType()));
3597 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3598 CCVal, Cmp);
3601 return Op;
3604 // If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3605 // bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3606 // sets 'C' bit to 0.
3607 static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
3608 SDLoc DL(Value);
3609 EVT VT = Value.getValueType();
3610 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
3611 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
3612 SDValue Cmp =
3613 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3614 return Cmp.getValue(1);
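// A quick sanity check of the non-inverted case: SUBS computes Value - 1 and
// sets C to "no borrow". Value == 0 gives 0 - 1, which borrows, so C == 0; any
// Value >= 1 gives no borrow, so C == 1. The inverted case computes 0 - Value
// and behaves the other way around, matching the comment above.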
3617 // If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3618 // If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
3619 static SDValue carryFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG,
3620 bool Invert) {
3621 assert(Flag.getResNo() == 1);
3622 SDLoc DL(Flag);
3623 SDValue Zero = DAG.getConstant(0, DL, VT);
3624 SDValue One = DAG.getConstant(1, DL, VT);
3625 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3626 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
3627 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag);
3630 // Value is 1 if 'V' bit of NZCV is 1, else 0
3631 static SDValue overflowFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG) {
3632 assert(Flag.getResNo() == 1);
3633 SDLoc DL(Flag);
3634 SDValue Zero = DAG.getConstant(0, DL, VT);
3635 SDValue One = DAG.getConstant(1, DL, VT);
3636 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3637 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag);
3640 // This lowering is inefficient, but it will get cleaned up by
3641 // `foldOverflowCheck`
3642 static SDValue lowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode,
3643 bool IsSigned) {
3644 EVT VT0 = Op.getValue(0).getValueType();
3645 EVT VT1 = Op.getValue(1).getValueType();
3647 if (VT0 != MVT::i32 && VT0 != MVT::i64)
3648 return SDValue();
3650 bool InvertCarry = Opcode == AArch64ISD::SBCS;
3651 SDValue OpLHS = Op.getOperand(0);
3652 SDValue OpRHS = Op.getOperand(1);
3653 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
3655 SDLoc DL(Op);
3656 SDVTList VTs = DAG.getVTList(VT0, VT1);
3658 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3659 OpRHS, OpCarryIn);
3661 SDValue OutFlag =
3662 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
3663 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
3665 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
3668 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3669 // Let legalize expand this if it isn't a legal type yet.
3670 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3671 return SDValue();
3673 SDLoc dl(Op);
3674 AArch64CC::CondCode CC;
3675 // The actual operation that sets the overflow or carry flag.
3676 SDValue Value, Overflow;
3677 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3679 // We use 0 and 1 as false and true values.
3680 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3681 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3683 // We use an inverted condition, because the conditional select is inverted
3684 // too. This will allow it to be selected to a single instruction:
3685 // CSINC Wd, WZR, WZR, invert(cond).
3686 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3687 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3688 CCVal, Overflow);
3690 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3691 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
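// For example (register choice indicative only), an i32 uadd.with.overflow
// usually ends up as:
//   adds w0, w0, w1        // the value plus the flags
//   cset w1, hs            // the carry flag becomes the i1 overflow result
// where the inverted-condition CSEL built above is what later folds into the
// single CSET/CSINC.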
3694 // Prefetch operands are:
3695 // 1: Address to prefetch
3696 // 2: bool isWrite
3697 // 3: int locality (0 = no locality ... 3 = extreme locality)
3698 // 4: bool isDataCache
3699 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3700 SDLoc DL(Op);
3701 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
3702 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
3703 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3705 bool IsStream = !Locality;
3706 // When the locality number is set
3707 if (Locality) {
3708 // The front-end should have filtered out the out-of-range values
3709 assert(Locality <= 3 && "Prefetch locality out-of-range");
3710 // The locality degree runs opposite to the target cache level, so put
3711 // the number the other way around: the encoding starts at 0 for the
3712 // L1 cache and increases for the slower levels.
3713 Locality = 3 - Locality;
3716 // Build the mask value encoding the expected behavior.
3717 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
3718 (!IsData << 3) | // IsDataCache bit
3719 (Locality << 1) | // Cache level bits
3720 (unsigned)IsStream; // Stream bit
3721 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
3722 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
3723 Op.getOperand(1));
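// Worked example of the encoding above: a read prefetch of data with
// locality 3 gives IsWrite = 0, IsStream = 0, !IsData = 0 and a flipped
// Locality of 3 - 3 = 0, so PrfOp = (0 << 4) | (0 << 3) | (0 << 1) | 0 = 0,
// i.e. PLDL1KEEP.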
3726 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
3727 SelectionDAG &DAG) const {
3728 EVT VT = Op.getValueType();
3729 if (VT.isScalableVector())
3730 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
3732 if (useSVEForFixedLengthVectorVT(VT))
3733 return LowerFixedLengthFPExtendToSVE(Op, DAG);
3735 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
3736 return SDValue();
3739 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
3740 SelectionDAG &DAG) const {
3741 if (Op.getValueType().isScalableVector())
3742 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
3744 bool IsStrict = Op->isStrictFPOpcode();
3745 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3746 EVT SrcVT = SrcVal.getValueType();
3748 if (useSVEForFixedLengthVectorVT(SrcVT))
3749 return LowerFixedLengthFPRoundToSVE(Op, DAG);
3751 if (SrcVT != MVT::f128) {
3752 // Expand cases where the input is a vector bigger than NEON.
3753 if (useSVEForFixedLengthVectorVT(SrcVT))
3754 return SDValue();
3756 // It's legal except when f128 is involved
3757 return Op;
3760 return SDValue();
3763 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
3764 SelectionDAG &DAG) const {
3765 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3766 // Any additional optimization in this function should be recorded
3767 // in the cost tables.
3768 bool IsStrict = Op->isStrictFPOpcode();
3769 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
3770 EVT VT = Op.getValueType();
3772 if (VT.isScalableVector()) {
3773 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
3774 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
3775 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
3776 return LowerToPredicatedOp(Op, DAG, Opcode);
3779 if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
3780 return LowerFixedLengthFPToIntToSVE(Op, DAG);
3782 unsigned NumElts = InVT.getVectorNumElements();
3784 // f16 conversions are promoted to f32 when full fp16 is not supported.
3785 if (InVT.getVectorElementType() == MVT::f16 &&
3786 !Subtarget->hasFullFP16()) {
3787 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
3788 SDLoc dl(Op);
3789 if (IsStrict) {
3790 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
3791 {Op.getOperand(0), Op.getOperand(1)});
3792 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
3793 {Ext.getValue(1), Ext.getValue(0)});
3795 return DAG.getNode(
3796 Op.getOpcode(), dl, Op.getValueType(),
3797 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
3800 uint64_t VTSize = VT.getFixedSizeInBits();
3801 uint64_t InVTSize = InVT.getFixedSizeInBits();
3802 if (VTSize < InVTSize) {
3803 SDLoc dl(Op);
3804 if (IsStrict) {
3805 InVT = InVT.changeVectorElementTypeToInteger();
3806 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
3807 {Op.getOperand(0), Op.getOperand(1)});
3808 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
3809 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
3811 SDValue Cv =
3812 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
3813 Op.getOperand(0));
3814 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
3817 if (VTSize > InVTSize) {
3818 SDLoc dl(Op);
3819 MVT ExtVT =
3820 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
3821 VT.getVectorNumElements());
3822 if (IsStrict) {
3823 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
3824 {Op.getOperand(0), Op.getOperand(1)});
3825 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
3826 {Ext.getValue(1), Ext.getValue(0)});
3828 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
3829 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
3832 // Use a scalar operation for conversions between single-element vectors of
3833 // the same size.
3834 if (NumElts == 1) {
3835 SDLoc dl(Op);
3836 SDValue Extract = DAG.getNode(
3837 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
3838 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
3839 EVT ScalarVT = VT.getScalarType();
3840 if (IsStrict)
3841 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
3842 {Op.getOperand(0), Extract});
3843 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
3846 // Type changing conversions are illegal.
3847 return Op;
3850 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
3851 SelectionDAG &DAG) const {
3852 bool IsStrict = Op->isStrictFPOpcode();
3853 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3855 if (SrcVal.getValueType().isVector())
3856 return LowerVectorFP_TO_INT(Op, DAG);
3858 // f16 conversions are promoted to f32 when full fp16 is not supported.
3859 if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
3860 SDLoc dl(Op);
3861 if (IsStrict) {
3862 SDValue Ext =
3863 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3864 {Op.getOperand(0), SrcVal});
3865 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
3866 {Ext.getValue(1), Ext.getValue(0)});
3868 return DAG.getNode(
3869 Op.getOpcode(), dl, Op.getValueType(),
3870 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
3873 if (SrcVal.getValueType() != MVT::f128) {
3874 // It's legal except when f128 is involved
3875 return Op;
3878 return SDValue();
3881 SDValue
3882 AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
3883 SelectionDAG &DAG) const {
3884 // AArch64 FP-to-int conversions saturate to the destination element size, so
3885 // we can lower common saturating conversions to simple instructions.
3886 SDValue SrcVal = Op.getOperand(0);
3887 EVT SrcVT = SrcVal.getValueType();
3888 EVT DstVT = Op.getValueType();
3889 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3891 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
3892 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
3893 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3894 assert(SatWidth <= DstElementWidth &&
3895 "Saturation width cannot exceed result width");
3897 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
3898 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
3899 // types, so this is hard to reach.
3900 if (DstVT.isScalableVector())
3901 return SDValue();
3903 EVT SrcElementVT = SrcVT.getVectorElementType();
3905 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
3906 if (SrcElementVT == MVT::f16 &&
3907 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) {
3908 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
3909 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
3910 SrcVT = F32VT;
3911 SrcElementVT = MVT::f32;
3912 SrcElementWidth = 32;
3913 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
3914 SrcElementVT != MVT::f16)
3915 return SDValue();
3917 SDLoc DL(Op);
3918 // Cases that we can emit directly.
3919 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
3920 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
3921 DAG.getValueType(DstVT.getScalarType()));
3923 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
3924 // result. This is only valid if the legal cvt is larger than the saturate
3925 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
3926 // (at least until sqxtn is selected).
3927 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
3928 return SDValue();
3930 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
3931 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
3932 DAG.getValueType(IntVT.getScalarType()));
3933 SDValue Sat;
3934 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
3935 SDValue MinC = DAG.getConstant(
3936 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
3937 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
3938 SDValue MaxC = DAG.getConstant(
3939 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
3940 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
3941 } else {
3942 SDValue MinC = DAG.getConstant(
3943 APInt::getAllOnesValue(SatWidth).zext(SrcElementWidth), DL, IntVT);
3944 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
3947 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
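// For instance (informal sketch): fptosi.sat from v4f32 to v4i16 with an i16
// saturation width first converts with the native saturating fcvtzs to v4i32,
// then clamps with SMIN(32767)/SMAX(-32768) and truncates the result to v4i16.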
3950 SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
3951 SelectionDAG &DAG) const {
3952 // AArch64 FP-to-int conversions saturate to the destination register size, so
3953 // we can lower common saturating conversions to simple instructions.
3954 SDValue SrcVal = Op.getOperand(0);
3955 EVT SrcVT = SrcVal.getValueType();
3957 if (SrcVT.isVector())
3958 return LowerVectorFP_TO_INT_SAT(Op, DAG);
3960 EVT DstVT = Op.getValueType();
3961 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3962 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3963 uint64_t DstWidth = DstVT.getScalarSizeInBits();
3964 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3966 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
3967 if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) {
3968 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
3969 SrcVT = MVT::f32;
3970 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16)
3971 return SDValue();
3973 SDLoc DL(Op);
3974 // Cases that we can emit directly.
3975 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
3976 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
3977 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
3978 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
3979 DAG.getValueType(DstVT));
3981 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
3982 // result. This is only valid if the legal cvt is larger than the saturate
3983 // width.
3984 if (DstWidth < SatWidth)
3985 return SDValue();
3987 SDValue NativeCvt =
3988 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
3989 SDValue Sat;
3990 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
3991 SDValue MinC = DAG.getConstant(
3992 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
3993 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
3994 SDValue MaxC = DAG.getConstant(
3995 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
3996 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
3997 } else {
3998 SDValue MinC = DAG.getConstant(
3999 APInt::getAllOnesValue(SatWidth).zext(DstWidth), DL, DstVT);
4000 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4003 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4006 SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4007 SelectionDAG &DAG) const {
4008 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4009 // Any additional optimization in this function should be recorded
4010 // in the cost tables.
4011 bool IsStrict = Op->isStrictFPOpcode();
4012 EVT VT = Op.getValueType();
4013 SDLoc dl(Op);
4014 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4015 EVT InVT = In.getValueType();
4016 unsigned Opc = Op.getOpcode();
4017 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4019 if (VT.isScalableVector()) {
4020 if (InVT.getVectorElementType() == MVT::i1) {
4021 // We can't convert directly from an SVE predicate; extend it to an integer vector first.
4022 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4023 EVT CastVT = getPromotedVTForPredicate(InVT);
4024 In = DAG.getNode(CastOpc, dl, CastVT, In);
4025 return DAG.getNode(Opc, dl, VT, In);
4028 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4029 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4030 return LowerToPredicatedOp(Op, DAG, Opcode);
4033 if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
4034 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4036 uint64_t VTSize = VT.getFixedSizeInBits();
4037 uint64_t InVTSize = InVT.getFixedSizeInBits();
4038 if (VTSize < InVTSize) {
4039 MVT CastVT =
4040 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
4041 InVT.getVectorNumElements());
4042 if (IsStrict) {
4043 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4044 {Op.getOperand(0), In});
4045 return DAG.getNode(
4046 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4047 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4049 In = DAG.getNode(Opc, dl, CastVT, In);
4050 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4051 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4054 if (VTSize > InVTSize) {
4055 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4056 EVT CastVT = VT.changeVectorElementTypeToInteger();
4057 In = DAG.getNode(CastOpc, dl, CastVT, In);
4058 if (IsStrict)
4059 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4060 return DAG.getNode(Opc, dl, VT, In);
4063 // Use a scalar operation for conversions between single-element vectors of
4064 // the same size.
4065 if (VT.getVectorNumElements() == 1) {
4066 SDValue Extract = DAG.getNode(
4067 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4068 In, DAG.getConstant(0, dl, MVT::i64));
4069 EVT ScalarVT = VT.getScalarType();
4070 if (IsStrict)
4071 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4072 {Op.getOperand(0), Extract});
4073 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4076 return Op;
4079 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4080 SelectionDAG &DAG) const {
4081 if (Op.getValueType().isVector())
4082 return LowerVectorINT_TO_FP(Op, DAG);
4084 bool IsStrict = Op->isStrictFPOpcode();
4085 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4087 // f16 conversions are promoted to f32 when full fp16 is not supported.
4088 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4089 SDLoc dl(Op);
4090 if (IsStrict) {
4091 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other},
4092 {Op.getOperand(0), SrcVal});
4093 return DAG.getNode(
4094 ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other},
4095 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4097 return DAG.getNode(
4098 ISD::FP_ROUND, dl, MVT::f16,
4099 DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
4100 DAG.getIntPtrConstant(0, dl));
4103 // i128 conversions are libcalls.
4104 if (SrcVal.getValueType() == MVT::i128)
4105 return SDValue();
4107 // Other conversions are legal, unless it's to the completely software-based
4108 // fp128.
4109 if (Op.getValueType() != MVT::f128)
4110 return Op;
4111 return SDValue();
4114 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4115 SelectionDAG &DAG) const {
4116 // For iOS, we want to call an alternative entry point: __sincos_stret,
4117 // which returns the values in two S / D registers.
4118 SDLoc dl(Op);
4119 SDValue Arg = Op.getOperand(0);
4120 EVT ArgVT = Arg.getValueType();
4121 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4123 ArgListTy Args;
4124 ArgListEntry Entry;
4126 Entry.Node = Arg;
4127 Entry.Ty = ArgTy;
4128 Entry.IsSExt = false;
4129 Entry.IsZExt = false;
4130 Args.push_back(Entry);
4132 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4133 : RTLIB::SINCOS_STRET_F32;
4134 const char *LibcallName = getLibcallName(LC);
4135 SDValue Callee =
4136 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4138 StructType *RetTy = StructType::get(ArgTy, ArgTy);
4139 TargetLowering::CallLoweringInfo CLI(DAG);
4140 CLI.setDebugLoc(dl)
4141 .setChain(DAG.getEntryNode())
4142 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4144 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4145 return CallResult.first;
4148 static MVT getSVEContainerType(EVT ContentTy);
4150 SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4151 SelectionDAG &DAG) const {
4152 EVT OpVT = Op.getValueType();
4153 EVT ArgVT = Op.getOperand(0).getValueType();
4155 if (useSVEForFixedLengthVectorVT(OpVT))
4156 return LowerFixedLengthBitcastToSVE(Op, DAG);
4158 if (OpVT.isScalableVector()) {
4159 // Bitcasting between unpacked vector types of different element counts is
4160 // not a NOP because the live elements are laid out differently.
4161 // 01234567
4162 // e.g. nxv2i32 = XX??XX??
4163 // nxv4f16 = X?X?X?X?
4164 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4165 return SDValue();
4167 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4168 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4169 "Expected int->fp bitcast!");
4170 SDValue ExtResult =
4171 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
4172 Op.getOperand(0));
4173 return getSVESafeBitCast(OpVT, ExtResult, DAG);
4175 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4178 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4179 return SDValue();
4181 // Bitcasts between f16 and bf16 are legal.
4182 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4183 return Op;
4185 assert(ArgVT == MVT::i16);
4186 SDLoc DL(Op);
4188 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4189 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4190 return SDValue(
4191 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
4192 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
4196 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4197 if (OrigVT.getSizeInBits() >= 64)
4198 return OrigVT;
4200 assert(OrigVT.isSimple() && "Expecting a simple value type");
4202 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4203 switch (OrigSimpleTy) {
4204 default: llvm_unreachable("Unexpected Vector Type");
4205 case MVT::v2i8:
4206 case MVT::v2i16:
4207 return MVT::v2i32;
4208 case MVT::v4i8:
4209 return MVT::v4i16;
4213 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4214 const EVT &OrigTy,
4215 const EVT &ExtTy,
4216 unsigned ExtOpcode) {
4217 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4218 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4219 // 64-bits we need to insert a new extension so that it will be 64-bits.
4220 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4221 if (OrigTy.getSizeInBits() >= 64)
4222 return N;
4224 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4225 EVT NewVT = getExtensionTo64Bits(OrigTy);
4227 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4230 // Returns lane if Op extracts from a two-element vector and lane is constant
4231 // (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and None otherwise.
4232 static Optional<uint64_t> getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4233 SDNode *OpNode = Op.getNode();
4234 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4235 return None;
4237 EVT VT = OpNode->getOperand(0).getValueType();
4238 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4239 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4240 return None;
4242 return C->getZExtValue();
4245 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
4246 bool isSigned) {
4247 EVT VT = N->getValueType(0);
4249 if (N->getOpcode() != ISD::BUILD_VECTOR)
4250 return false;
4252 for (const SDValue &Elt : N->op_values()) {
4253 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4254 unsigned EltSize = VT.getScalarSizeInBits();
4255 unsigned HalfSize = EltSize / 2;
4256 if (isSigned) {
4257 if (!isIntN(HalfSize, C->getSExtValue()))
4258 return false;
4259 } else {
4260 if (!isUIntN(HalfSize, C->getZExtValue()))
4261 return false;
4263 continue;
4265 return false;
4268 return true;
4271 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
4272 if (N->getOpcode() == ISD::SIGN_EXTEND ||
4273 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
4274 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
4275 N->getOperand(0)->getValueType(0),
4276 N->getValueType(0),
4277 N->getOpcode());
4279 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4280 EVT VT = N->getValueType(0);
4281 SDLoc dl(N);
4282 unsigned EltSize = VT.getScalarSizeInBits() / 2;
4283 unsigned NumElts = VT.getVectorNumElements();
4284 MVT TruncVT = MVT::getIntegerVT(EltSize);
4285 SmallVector<SDValue, 8> Ops;
4286 for (unsigned i = 0; i != NumElts; ++i) {
4287 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
4288 const APInt &CInt = C->getAPIntValue();
4289 // Element types smaller than 32 bits are not legal, so use i32 elements.
4290 // The values are implicitly truncated so sext vs. zext doesn't matter.
4291 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4293 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
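// Returns true if N can be treated as a sign-extended value when forming an
// SMULL: an explicit SIGN_EXTEND/ANY_EXTEND, or a BUILD_VECTOR of suitably
// small constants.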
4296 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
4297 return N->getOpcode() == ISD::SIGN_EXTEND ||
4298 N->getOpcode() == ISD::ANY_EXTEND ||
4299 isExtendedBUILD_VECTOR(N, DAG, true);
4302 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
4303 return N->getOpcode() == ISD::ZERO_EXTEND ||
4304 N->getOpcode() == ISD::ANY_EXTEND ||
4305 isExtendedBUILD_VECTOR(N, DAG, false);
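// Returns true if N is an ADD/SUB whose two single-use operands are both
// sign-extended, so a multiply by N can be distributed into two SMULLs.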
4308 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
4309 unsigned Opcode = N->getOpcode();
4310 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4311 SDNode *N0 = N->getOperand(0).getNode();
4312 SDNode *N1 = N->getOperand(1).getNode();
4313 return N0->hasOneUse() && N1->hasOneUse() &&
4314 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4316 return false;
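// As above, but checks for zero-extended operands (feeding UMULL).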
4319 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
4320 unsigned Opcode = N->getOpcode();
4321 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4322 SDNode *N0 = N->getOperand(0).getNode();
4323 SDNode *N1 = N->getOperand(1).getNode();
4324 return N0->hasOneUse() && N1->hasOneUse() &&
4325 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4327 return false;
4330 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
4331 SelectionDAG &DAG) const {
4332 // On AArch64 the rounding mode is in bits 23:22 of the FPCR.
4333 // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
4334 // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
4335 // so that the shift and the mask get folded into a bitfield extract.
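  // For example, if FPCR[23:22] == 0b10 (round toward -infinity), the result is
  // (((2 << 22) + (1 << 22)) >> 22) & 3 == 3, which is the FLT_ROUNDS value for
  // rounding toward -infinity.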
4336 SDLoc dl(Op);
4338 SDValue Chain = Op.getOperand(0);
4339 SDValue FPCR_64 = DAG.getNode(
4340 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4341 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4342 Chain = FPCR_64.getValue(1);
4343 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4344 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4345 DAG.getConstant(1U << 22, dl, MVT::i32));
4346 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4347 DAG.getConstant(22, dl, MVT::i32));
4348 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4349 DAG.getConstant(3, dl, MVT::i32));
4350 return DAG.getMergeValues({AND, Chain}, dl);
4353 SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4354 SelectionDAG &DAG) const {
4355 SDLoc DL(Op);
4356 SDValue Chain = Op->getOperand(0);
4357 SDValue RMValue = Op->getOperand(1);
4359 // The rounding mode is in bits 23:22 of the FPCR.
4360 // The mapping from the llvm.set.rounding argument value to the rounding mode
4361 // in FPCR is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
4362 // ((arg - 1) & 3) << 22.
4364 // The argument of llvm.set.rounding must be within the range [0, 3], so
4365 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
4366 // code that generates llvm.set.rounding to ensure this condition.
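  // For example, an argument of 0 (round toward zero) becomes ((0 - 1) & 3) == 3,
  // i.e. FPCR.RMode == 0b11, the round-toward-zero encoding.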
4368 // Calculate new value of FPCR[23:22].
4369 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4370 DAG.getConstant(1, DL, MVT::i32));
4371 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4372 DAG.getConstant(0x3, DL, MVT::i32));
4373 RMValue =
4374 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4375 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4376 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4378 // Get current value of FPCR.
4379 SDValue Ops[] = {
4380 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4381 SDValue FPCR =
4382 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4383 Chain = FPCR.getValue(1);
4384 FPCR = FPCR.getValue(0);
4386 // Put the new rounding mode into FPCR[23:22].
4387 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4388 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4389 DAG.getConstant(RMMask, DL, MVT::i64));
4390 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4391 SDValue Ops2[] = {
4392 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4393 FPCR};
4394 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
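// Selects SMULL or UMULL (or returns 0 if neither applies) for a MUL whose
// operands N0 and N1 are extended values. IsMLA is set, and the operands are
// possibly swapped, when the multiply should instead be distributed over an
// extended ADD/SUB.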
4397 static unsigned selectUmullSmull(SDNode *&N0, SDNode *&N1, SelectionDAG &DAG,
4398 bool &IsMLA) {
4399 bool IsN0SExt = isSignExtended(N0, DAG);
4400 bool IsN1SExt = isSignExtended(N1, DAG);
4401 if (IsN0SExt && IsN1SExt)
4402 return AArch64ISD::SMULL;
4404 bool IsN0ZExt = isZeroExtended(N0, DAG);
4405 bool IsN1ZExt = isZeroExtended(N1, DAG);
4407 if (IsN0ZExt && IsN1ZExt)
4408 return AArch64ISD::UMULL;
4410 if (!IsN1SExt && !IsN1ZExt)
4411 return 0;
4412 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4413 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4414 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
4415 IsMLA = true;
4416 return AArch64ISD::SMULL;
4418 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
4419 IsMLA = true;
4420 return AArch64ISD::UMULL;
4422 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
4423 std::swap(N0, N1);
4424 IsMLA = true;
4425 return AArch64ISD::UMULL;
4427 return 0;
4430 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
4431 EVT VT = Op.getValueType();
4433 // If SVE is available then i64 vector multiplications can also be made legal.
4434 bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
4436 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
4437 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4439 // Multiplications are only custom-lowered for 128-bit vectors so that
4440 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
4441 assert(VT.is128BitVector() && VT.isInteger() &&
4442 "unexpected type for custom-lowering ISD::MUL");
4443 SDNode *N0 = Op.getOperand(0).getNode();
4444 SDNode *N1 = Op.getOperand(1).getNode();
4445 bool isMLA = false;
4446 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, isMLA);
4448 if (!NewOpc) {
4449 if (VT == MVT::v2i64)
4450 // Fall through to expand this. It is not legal.
4451 return SDValue();
4452 else
4453 // Other vector multiplications are legal.
4454 return Op;
4457 // Legalize to a S/UMULL instruction
4458 SDLoc DL(Op);
4459 SDValue Op0;
4460 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
4461 if (!isMLA) {
4462 Op0 = skipExtensionForVectorMULL(N0, DAG);
4463 assert(Op0.getValueType().is64BitVector() &&
4464 Op1.getValueType().is64BitVector() &&
4465 "unexpected types for extended operands to VMULL");
4466 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
4468 // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
4469 // isel lowering to take advantage of no-stall back-to-back s/umull + s/umlal.
4470 // This helps on CPUs with accumulate forwarding such as Cortex-A53/A57.
4471 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
4472 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
4473 EVT Op1VT = Op1.getValueType();
4474 return DAG.getNode(N0->getOpcode(), DL, VT,
4475 DAG.getNode(NewOpc, DL, VT,
4476 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
4477 DAG.getNode(NewOpc, DL, VT,
4478 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
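// Materializes an SVE predicate that is all-true for the given PTRUE pattern;
// the all-active nxv1i1 case is emitted directly as a constant.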
4481 static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
4482 int Pattern) {
4483 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
4484 return DAG.getConstant(1, DL, MVT::nxv1i1);
4485 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
4486 DAG.getTargetConstant(Pattern, DL, MVT::i32));
4489 // Returns a safe bitcast between two scalable vector predicates, where
4490 // any newly created lanes from a widening bitcast are defined as zero.
4491 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
4492 SDLoc DL(Op);
4493 EVT InVT = Op.getValueType();
4495 assert(InVT.getVectorElementType() == MVT::i1 &&
4496 VT.getVectorElementType() == MVT::i1 &&
4497 "Expected a predicate-to-predicate bitcast");
4498 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
4499 InVT.isScalableVector() &&
4500 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
4501 "Only expect to cast between legal scalable predicate types!");
4503 // Return the operand if the cast isn't changing type,
4504 // e.g. <n x 16 x i1> -> <n x 16 x i1>
4505 if (InVT == VT)
4506 return Op;
4508 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
4510 // We only have to zero the lanes if new lanes are being defined, e.g. when
4511 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
4512 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
4513 // we can return here.
4514 if (InVT.bitsGT(VT))
4515 return Reinterpret;
4517 // Check if the other lanes are already known to be zeroed by
4518 // construction.
4519 if (isZeroingInactiveLanes(Op))
4520 return Reinterpret;
4522 // Zero the newly introduced lanes.
4523 SDValue Mask = DAG.getConstant(1, DL, InVT);
4524 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
4525 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
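// Returns a value of type VT holding the current value of PSTATE.SM: a
// constant 1 or 0 when the function's SME attributes pin the streaming mode,
// otherwise the result of calling __arm_sme_state masked down to bit 0.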
4528 SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain,
4529 SMEAttrs Attrs, SDLoc DL,
4530 EVT VT) const {
4531 if (Attrs.hasStreamingInterfaceOrBody())
4532 return DAG.getConstant(1, DL, VT);
4534 if (Attrs.hasNonStreamingInterfaceAndBody())
4535 return DAG.getConstant(0, DL, VT);
4537 assert(Attrs.hasStreamingCompatibleInterface() && "Unexpected interface");
4539 SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
4540 getPointerTy(DAG.getDataLayout()));
4541 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
4542 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
4543 TargetLowering::CallLoweringInfo CLI(DAG);
4544 ArgListTy Args;
4545 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
4546 CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
4547 RetTy, Callee, std::move(Args));
4548 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4549 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
4550 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
4551 Mask);
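// SME ABI support routines are external symbols rather than IR Functions, so
// their SME attributes cannot be queried directly; return the known attributes
// for the recognised symbols, or None otherwise.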
4554 static Optional<SMEAttrs> getCalleeAttrsFromExternalFunction(SDValue V) {
4555 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(V)) {
4556 StringRef S(ES->getSymbol());
4557 if (S == "__arm_sme_state" || S == "__arm_tpidr2_save")
4558 return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Preserved);
4559 if (S == "__arm_tpidr2_restore")
4560 return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared);
4562 return None;
4565 SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
4566 SelectionDAG &DAG) const {
4567 unsigned IntNo = Op.getConstantOperandVal(1);
4568 SDLoc DL(Op);
4569 switch (IntNo) {
4570 default:
4571 return SDValue(); // Don't custom lower most intrinsics.
4572 case Intrinsic::aarch64_mops_memset_tag: {
4573 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
4574 SDValue Chain = Node->getChain();
4575 SDValue Dst = Op.getOperand(2);
4576 SDValue Val = Op.getOperand(3);
4577 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
4578 SDValue Size = Op.getOperand(4);
4579 auto Alignment = Node->getMemOperand()->getAlign();
4580 bool IsVol = Node->isVolatile();
4581 auto DstPtrInfo = Node->getPointerInfo();
4583 const auto &SDI =
4584 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
4585 SDValue MS =
4586 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
4587 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
4589 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
4590 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
4591 // LowerOperationWrapper will complain that the number of results has
4592 // changed.
4593 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
4595 case Intrinsic::aarch64_sme_za_enable:
4596 return DAG.getNode(
4597 AArch64ISD::SMSTART, DL, MVT::Other,
4598 Op->getOperand(0), // Chain
4599 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
4600 DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
4601 case Intrinsic::aarch64_sme_za_disable:
4602 return DAG.getNode(
4603 AArch64ISD::SMSTOP, DL, MVT::Other,
4604 Op->getOperand(0), // Chain
4605 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
4606 DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
4610 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
4611 SelectionDAG &DAG) const {
4612 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4613 SDLoc dl(Op);
4614 switch (IntNo) {
4615 default: return SDValue(); // Don't custom lower most intrinsics.
4616 case Intrinsic::thread_pointer: {
4617 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4618 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
4620 case Intrinsic::aarch64_neon_abs: {
4621 EVT Ty = Op.getValueType();
4622 if (Ty == MVT::i64) {
4623 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
4624 Op.getOperand(1));
4625 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
4626 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
4627 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
4628 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
4629 } else {
4630 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
4633 case Intrinsic::aarch64_neon_pmull64: {
4634 SDValue LHS = Op.getOperand(1);
4635 SDValue RHS = Op.getOperand(2);
4637 Optional<uint64_t> LHSLane = getConstantLaneNumOfExtractHalfOperand(LHS);
4638 Optional<uint64_t> RHSLane = getConstantLaneNumOfExtractHalfOperand(RHS);
4640 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
4641 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
4643 // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
4644 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
4645 // which ISel recognizes better. For example, this generates a ldr into a d*
4646 // register as opposed to a GPR load followed by a fmov.
4647 auto TryVectorizeOperand =
4648 [](SDValue N, Optional<uint64_t> NLane, Optional<uint64_t> OtherLane,
4649 const SDLoc &dl, SelectionDAG &DAG) -> SDValue {
4650 // If the operand is a higher half itself, rewrite it to
4651 // extract_high_v2i64; this way aarch64_neon_pmull64 can
4652 // reuse the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
4653 if (NLane && *NLane == 1)
4654 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
4655 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
4657 // Operand N is not a higher half but the other operand is.
4658 if (OtherLane && *OtherLane == 1) {
4659 // If this operand is a lower half, rewrite it to
4660 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a round trip through
4661 // the GPRs to align the lanes of the two operands. Such a round trip (moving
4662 // from lane 1 to lane 0) looks like this:
4663 // mov x8, v0.d[1]
4664 // fmov d0, x8
4665 if (NLane && *NLane == 0)
4666 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
4667 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
4668 N.getOperand(0),
4669 DAG.getConstant(0, dl, MVT::i64)),
4670 DAG.getConstant(1, dl, MVT::i64));
4672 // Otherwise just dup the scalar operand to all lanes.
4673 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
4676 // Neither operand is an extract of the higher half, so codegen may just use
4677 // the non-high version of the PMULL instruction. Use v1i64 to represent i64.
4678 assert(N.getValueType() == MVT::i64 &&
4679 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
4680 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
4683 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
4684 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
4686 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
4688 case Intrinsic::aarch64_neon_smax:
4689 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
4690 Op.getOperand(1), Op.getOperand(2));
4691 case Intrinsic::aarch64_neon_umax:
4692 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
4693 Op.getOperand(1), Op.getOperand(2));
4694 case Intrinsic::aarch64_neon_smin:
4695 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
4696 Op.getOperand(1), Op.getOperand(2));
4697 case Intrinsic::aarch64_neon_umin:
4698 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
4699 Op.getOperand(1), Op.getOperand(2));
4700 case Intrinsic::aarch64_neon_scalar_sqxtn:
4701 case Intrinsic::aarch64_neon_scalar_sqxtun:
4702 case Intrinsic::aarch64_neon_scalar_uqxtn: {
4703 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
4704 if (Op.getValueType() == MVT::i32)
4705 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
4706 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
4707 Op.getOperand(0),
4708 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
4709 Op.getOperand(1))));
4710 return SDValue();
4712 case Intrinsic::aarch64_sve_sunpkhi:
4713 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
4714 Op.getOperand(1));
4715 case Intrinsic::aarch64_sve_sunpklo:
4716 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
4717 Op.getOperand(1));
4718 case Intrinsic::aarch64_sve_uunpkhi:
4719 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
4720 Op.getOperand(1));
4721 case Intrinsic::aarch64_sve_uunpklo:
4722 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
4723 Op.getOperand(1));
4724 case Intrinsic::aarch64_sve_clasta_n:
4725 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
4726 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4727 case Intrinsic::aarch64_sve_clastb_n:
4728 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
4729 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4730 case Intrinsic::aarch64_sve_lasta:
4731 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
4732 Op.getOperand(1), Op.getOperand(2));
4733 case Intrinsic::aarch64_sve_lastb:
4734 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
4735 Op.getOperand(1), Op.getOperand(2));
4736 case Intrinsic::aarch64_sve_rev:
4737 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
4738 Op.getOperand(1));
4739 case Intrinsic::aarch64_sve_tbl:
4740 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
4741 Op.getOperand(1), Op.getOperand(2));
4742 case Intrinsic::aarch64_sve_trn1:
4743 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
4744 Op.getOperand(1), Op.getOperand(2));
4745 case Intrinsic::aarch64_sve_trn2:
4746 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
4747 Op.getOperand(1), Op.getOperand(2));
4748 case Intrinsic::aarch64_sve_uzp1:
4749 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
4750 Op.getOperand(1), Op.getOperand(2));
4751 case Intrinsic::aarch64_sve_uzp2:
4752 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
4753 Op.getOperand(1), Op.getOperand(2));
4754 case Intrinsic::aarch64_sve_zip1:
4755 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
4756 Op.getOperand(1), Op.getOperand(2));
4757 case Intrinsic::aarch64_sve_zip2:
4758 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
4759 Op.getOperand(1), Op.getOperand(2));
4760 case Intrinsic::aarch64_sve_splice:
4761 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
4762 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4763 case Intrinsic::aarch64_sve_ptrue:
4764 return getPTrue(DAG, dl, Op.getValueType(),
4765 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
4766 case Intrinsic::aarch64_sve_clz:
4767 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
4768 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4769 case Intrinsic::aarch64_sme_cntsb:
4770 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
4771 DAG.getConstant(1, dl, MVT::i32));
4772 case Intrinsic::aarch64_sme_cntsh: {
4773 SDValue One = DAG.getConstant(1, dl, MVT::i32);
4774 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
4775 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
4777 case Intrinsic::aarch64_sme_cntsw: {
4778 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
4779 DAG.getConstant(1, dl, MVT::i32));
4780 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
4781 DAG.getConstant(2, dl, MVT::i32));
4783 case Intrinsic::aarch64_sme_cntsd: {
4784 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
4785 DAG.getConstant(1, dl, MVT::i32));
4786 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
4787 DAG.getConstant(3, dl, MVT::i32));
4789 case Intrinsic::aarch64_sve_cnt: {
4790 SDValue Data = Op.getOperand(3);
4791 // CTPOP only supports integer operands.
4792 if (Data.getValueType().isFloatingPoint())
4793 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
4794 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
4795 Op.getOperand(2), Data, Op.getOperand(1));
4797 case Intrinsic::aarch64_sve_dupq_lane:
4798 return LowerDUPQLane(Op, DAG);
4799 case Intrinsic::aarch64_sve_convert_from_svbool:
4800 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
4801 case Intrinsic::aarch64_sve_convert_to_svbool:
4802 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
4803 case Intrinsic::aarch64_sve_fneg:
4804 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
4805 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4806 case Intrinsic::aarch64_sve_frintp:
4807 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
4808 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4809 case Intrinsic::aarch64_sve_frintm:
4810 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
4811 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4812 case Intrinsic::aarch64_sve_frinti:
4813 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
4814 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4815 case Intrinsic::aarch64_sve_frintx:
4816 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
4817 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4818 case Intrinsic::aarch64_sve_frinta:
4819 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
4820 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4821 case Intrinsic::aarch64_sve_frintn:
4822 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
4823 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4824 case Intrinsic::aarch64_sve_frintz:
4825 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
4826 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4827 case Intrinsic::aarch64_sve_ucvtf:
4828 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
4829 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4830 Op.getOperand(1));
4831 case Intrinsic::aarch64_sve_scvtf:
4832 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
4833 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4834 Op.getOperand(1));
4835 case Intrinsic::aarch64_sve_fcvtzu:
4836 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
4837 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4838 Op.getOperand(1));
4839 case Intrinsic::aarch64_sve_fcvtzs:
4840 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
4841 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4842 Op.getOperand(1));
4843 case Intrinsic::aarch64_sve_fsqrt:
4844 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
4845 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4846 case Intrinsic::aarch64_sve_frecpx:
4847 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
4848 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4849 case Intrinsic::aarch64_sve_frecpe_x:
4850 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
4851 Op.getOperand(1));
4852 case Intrinsic::aarch64_sve_frecps_x:
4853 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
4854 Op.getOperand(1), Op.getOperand(2));
4855 case Intrinsic::aarch64_sve_frsqrte_x:
4856 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
4857 Op.getOperand(1));
4858 case Intrinsic::aarch64_sve_frsqrts_x:
4859 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
4860 Op.getOperand(1), Op.getOperand(2));
4861 case Intrinsic::aarch64_sve_fabs:
4862 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
4863 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4864 case Intrinsic::aarch64_sve_abs:
4865 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
4866 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4867 case Intrinsic::aarch64_sve_neg:
4868 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
4869 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4870 case Intrinsic::aarch64_sve_insr: {
4871 SDValue Scalar = Op.getOperand(2);
4872 EVT ScalarTy = Scalar.getValueType();
4873 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
4874 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
4876 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
4877 Op.getOperand(1), Scalar);
4879 case Intrinsic::aarch64_sve_rbit:
4880 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
4881 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4882 Op.getOperand(1));
4883 case Intrinsic::aarch64_sve_revb:
4884 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
4885 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4886 case Intrinsic::aarch64_sve_revh:
4887 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
4888 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4889 case Intrinsic::aarch64_sve_revw:
4890 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
4891 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4892 case Intrinsic::aarch64_sve_revd:
4893 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
4894 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4895 case Intrinsic::aarch64_sve_sxtb:
4896 return DAG.getNode(
4897 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4898 Op.getOperand(2), Op.getOperand(3),
4899 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
4900 Op.getOperand(1));
4901 case Intrinsic::aarch64_sve_sxth:
4902 return DAG.getNode(
4903 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4904 Op.getOperand(2), Op.getOperand(3),
4905 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
4906 Op.getOperand(1));
4907 case Intrinsic::aarch64_sve_sxtw:
4908 return DAG.getNode(
4909 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4910 Op.getOperand(2), Op.getOperand(3),
4911 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
4912 Op.getOperand(1));
4913 case Intrinsic::aarch64_sve_uxtb:
4914 return DAG.getNode(
4915 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4916 Op.getOperand(2), Op.getOperand(3),
4917 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
4918 Op.getOperand(1));
4919 case Intrinsic::aarch64_sve_uxth:
4920 return DAG.getNode(
4921 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4922 Op.getOperand(2), Op.getOperand(3),
4923 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
4924 Op.getOperand(1));
4925 case Intrinsic::aarch64_sve_uxtw:
4926 return DAG.getNode(
4927 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4928 Op.getOperand(2), Op.getOperand(3),
4929 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
4930 Op.getOperand(1));
4931 case Intrinsic::localaddress: {
4932 const auto &MF = DAG.getMachineFunction();
4933 const auto *RegInfo = Subtarget->getRegisterInfo();
4934 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
4935 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
4936 Op.getSimpleValueType());
4939 case Intrinsic::eh_recoverfp: {
4940 // FIXME: This needs to be implemented to correctly handle highly aligned
4941 // stack objects. For now we simply return the incoming FP. Refer to D53541
4942 // for more details.
4943 SDValue FnOp = Op.getOperand(1);
4944 SDValue IncomingFPOp = Op.getOperand(2);
4945 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
4946 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
4947 if (!Fn)
4948 report_fatal_error(
4949 "llvm.eh.recoverfp must take a function as the first argument");
4950 return IncomingFPOp;
4953 case Intrinsic::aarch64_neon_vsri:
4954 case Intrinsic::aarch64_neon_vsli: {
4955 EVT Ty = Op.getValueType();
4957 if (!Ty.isVector())
4958 report_fatal_error("Unexpected type for aarch64_neon_vsli");
4960 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
4962 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
4963 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
4964 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
4965 Op.getOperand(3));
4968 case Intrinsic::aarch64_neon_srhadd:
4969 case Intrinsic::aarch64_neon_urhadd:
4970 case Intrinsic::aarch64_neon_shadd:
4971 case Intrinsic::aarch64_neon_uhadd: {
4972 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
4973 IntNo == Intrinsic::aarch64_neon_shadd);
4974 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
4975 IntNo == Intrinsic::aarch64_neon_urhadd);
4976 unsigned Opcode = IsSignedAdd
4977 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
4978 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
4979 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4980 Op.getOperand(2));
4982 case Intrinsic::aarch64_neon_sabd:
4983 case Intrinsic::aarch64_neon_uabd: {
4984 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? ISD::ABDU
4985 : ISD::ABDS;
4986 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4987 Op.getOperand(2));
4989 case Intrinsic::aarch64_neon_saddlp:
4990 case Intrinsic::aarch64_neon_uaddlp: {
4991 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
4992 ? AArch64ISD::UADDLP
4993 : AArch64ISD::SADDLP;
4994 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
4996 case Intrinsic::aarch64_neon_sdot:
4997 case Intrinsic::aarch64_neon_udot:
4998 case Intrinsic::aarch64_sve_sdot:
4999 case Intrinsic::aarch64_sve_udot: {
5000 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5001 IntNo == Intrinsic::aarch64_sve_udot)
5002 ? AArch64ISD::UDOT
5003 : AArch64ISD::SDOT;
5004 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5005 Op.getOperand(2), Op.getOperand(3));
5007 case Intrinsic::get_active_lane_mask: {
5008 SDValue ID =
5009 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5010 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
5011 Op.getOperand(1), Op.getOperand(2));
5016 bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
5017 if (VT.getVectorElementType() == MVT::i8 ||
5018 VT.getVectorElementType() == MVT::i16) {
5019 EltTy = MVT::i32;
5020 return true;
5022 return false;
5025 bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT,
5026 EVT DataVT) const {
5027 // SVE only supports implicit extension of 32-bit indices.
5028 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
5029 return false;
5031 // Indices cannot be smaller than the main data type.
5032 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
5033 return false;
5035 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
5036 // element container type, which would violate the previous clause.
5037 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
5040 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
5041 return ExtVal.getValueType().isScalableVector() ||
5042 useSVEForFixedLengthVectorVT(
5043 ExtVal.getValueType(),
5044 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors());
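// Picks the (zero-extending) SVE gather-load opcode for the given addressing
// mode: scaled vs. unscaled offsets and, for offsets that need extension,
// signed (SXTW) vs. unsigned (UXTW) extension.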
5047 unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
5048 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
5049 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
5050 AArch64ISD::GLD1_MERGE_ZERO},
5051 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
5052 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
5053 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
5054 AArch64ISD::GLD1_MERGE_ZERO},
5055 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
5056 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
5057 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
5058 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5059 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
5060 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
5061 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
5062 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5063 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
5064 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
5066 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
5067 return AddrModes.find(Key)->second;
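// Maps a zero-extending gather opcode to its sign-extending (GLD1S*)
// counterpart.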
5070 unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
5071 switch (Opcode) {
5072 default:
5073 llvm_unreachable("unimplemented opcode");
5074 return Opcode;
5075 case AArch64ISD::GLD1_MERGE_ZERO:
5076 return AArch64ISD::GLD1S_MERGE_ZERO;
5077 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
5078 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
5079 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
5080 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
5081 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
5082 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
5083 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
5084 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
5085 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
5086 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
5087 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
5088 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
5092 SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
5093 SelectionDAG &DAG) const {
5094 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
5096 SDLoc DL(Op);
5097 SDValue Chain = MGT->getChain();
5098 SDValue PassThru = MGT->getPassThru();
5099 SDValue Mask = MGT->getMask();
5100 SDValue BasePtr = MGT->getBasePtr();
5101 SDValue Index = MGT->getIndex();
5102 SDValue Scale = MGT->getScale();
5103 EVT VT = Op.getValueType();
5104 EVT MemVT = MGT->getMemoryVT();
5105 ISD::LoadExtType ExtType = MGT->getExtensionType();
5106 ISD::MemIndexType IndexType = MGT->getIndexType();
5108 // SVE supports zero (and so undef) passthrough values only; everything else
5109 // must be handled manually by an explicit select on the load's output.
5110 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
5111 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
5112 SDValue Load =
5113 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5114 MGT->getMemOperand(), IndexType, ExtType);
5115 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5116 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
5119 bool IsScaled = MGT->isIndexScaled();
5120 bool IsSigned = MGT->isIndexSigned();
5122 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
5123 // must be calculated beforehand.
5124 uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
5125 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5126 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5127 EVT IndexVT = Index.getValueType();
5128 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5129 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5130 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5132 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5133 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5134 MGT->getMemOperand(), IndexType, ExtType);
5137 // Lower fixed length gather to a scalable equivalent.
5138 if (VT.isFixedLengthVector()) {
5139 assert(Subtarget->useSVEForFixedLengthVectors() &&
5140 "Cannot lower when not using SVE for fixed vectors!");
5142 // NOTE: Handle floating-point as if integer then bitcast the result.
5143 EVT DataVT = VT.changeVectorElementTypeToInteger();
5144 MemVT = MemVT.changeVectorElementTypeToInteger();
5146 // Find the smallest integer fixed length vector we can use for the gather.
5147 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5148 if (DataVT.getVectorElementType() == MVT::i64 ||
5149 Index.getValueType().getVectorElementType() == MVT::i64 ||
5150 Mask.getValueType().getVectorElementType() == MVT::i64)
5151 PromotedVT = VT.changeVectorElementType(MVT::i64);
5153 // Promote vector operands except for passthrough, which we know is either
5154 // undef or zero, and thus best constructed directly.
5155 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5156 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5157 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5159 // A promoted result type forces the need for an extending load.
5160 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
5161 ExtType = ISD::EXTLOAD;
5163 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5165 // Convert fixed length vector operands to scalable.
5166 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5167 Index = convertToScalableVector(DAG, ContainerVT, Index);
5168 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5169 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
5170 : DAG.getConstant(0, DL, ContainerVT);
5172 // Emit equivalent scalable vector gather.
5173 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5174 SDValue Load =
5175 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
5176 Ops, MGT->getMemOperand(), IndexType, ExtType);
5178 // Extract fixed length data then convert to the required result type.
5179 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
5180 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
5181 if (VT.isFloatingPoint())
5182 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
5184 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5187 // Everything else is legal.
5188 return Op;
5191 SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
5192 SelectionDAG &DAG) const {
5193 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
5195 SDLoc DL(Op);
5196 SDValue Chain = MSC->getChain();
5197 SDValue StoreVal = MSC->getValue();
5198 SDValue Mask = MSC->getMask();
5199 SDValue BasePtr = MSC->getBasePtr();
5200 SDValue Index = MSC->getIndex();
5201 SDValue Scale = MSC->getScale();
5202 EVT VT = StoreVal.getValueType();
5203 EVT MemVT = MSC->getMemoryVT();
5204 ISD::MemIndexType IndexType = MSC->getIndexType();
5205 bool Truncating = MSC->isTruncatingStore();
5207 bool IsScaled = MSC->isIndexScaled();
5208 bool IsSigned = MSC->isIndexSigned();
5210 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
5211 // must be calculated beforehand.
5212 uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
5213 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5214 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5215 EVT IndexVT = Index.getValueType();
5216 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5217 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5218 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5220 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5221 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
5222 MSC->getMemOperand(), IndexType, Truncating);
5225 // Lower fixed length scatter to a scalable equivalent.
5226 if (VT.isFixedLengthVector()) {
5227 assert(Subtarget->useSVEForFixedLengthVectors() &&
5228 "Cannot lower when not using SVE for fixed vectors!");
5230 // Once bitcast we treat floating-point scatters as if integer.
5231 if (VT.isFloatingPoint()) {
5232 VT = VT.changeVectorElementTypeToInteger();
5233 MemVT = MemVT.changeVectorElementTypeToInteger();
5234 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
5237 // Find the smallest integer fixed length vector we can use for the scatter.
5238 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5239 if (VT.getVectorElementType() == MVT::i64 ||
5240 Index.getValueType().getVectorElementType() == MVT::i64 ||
5241 Mask.getValueType().getVectorElementType() == MVT::i64)
5242 PromotedVT = VT.changeVectorElementType(MVT::i64);
5244 // Promote vector operands.
5245 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5246 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5247 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5248 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
5250 // A promoted value type forces the need for a truncating store.
5251 if (PromotedVT != VT)
5252 Truncating = true;
5254 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5256 // Convert fixed length vector operands to scalable.
5257 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5258 Index = convertToScalableVector(DAG, ContainerVT, Index);
5259 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5260 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
5262 // Emit equivalent scalable vector scatter.
5263 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5264 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
5265 MSC->getMemOperand(), IndexType, Truncating);
5268 // Everything else is legal.
5269 return Op;
5272 SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
5273 SDLoc DL(Op);
5274 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
5275 assert(LoadNode && "Expected custom lowering of a masked load node");
5276 EVT VT = Op->getValueType(0);
5278 if (useSVEForFixedLengthVectorVT(
5280 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
5281 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
5283 SDValue PassThru = LoadNode->getPassThru();
5284 SDValue Mask = LoadNode->getMask();
5286 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
5287 return Op;
5289 SDValue Load = DAG.getMaskedLoad(
5290 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
5291 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
5292 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
5293 LoadNode->getExtensionType());
5295 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5297 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5300 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
5301 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
5302 EVT VT, EVT MemVT,
5303 SelectionDAG &DAG) {
5304 assert(VT.isVector() && "VT should be a vector type");
5305 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
5307 SDValue Value = ST->getValue();
5309 // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and extracts
5310 // the word lane which represents the v4i8 subvector. It optimizes the store
5311 // to:
5313 // xtn v0.8b, v0.8h
5314 // str s0, [x0]
5316 SDValue Undef = DAG.getUNDEF(MVT::i16);
5317 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
5318 {Undef, Undef, Undef, Undef});
5320 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
5321 Value, UndefVec);
5322 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
5324 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
5325 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
5326 Trunc, DAG.getConstant(0, DL, MVT::i64));
5328 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
5329 ST->getBasePtr(), ST->getMemOperand());
5332 // Custom lowering for any store, vector or scalar, truncating or not. The
5333 // main cases handled here are the truncating store from v4i16 to v4i8 and
5334 // volatile stores of i128.
5335 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
5336 SelectionDAG &DAG) const {
5337 SDLoc Dl(Op);
5338 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
5339 assert (StoreNode && "Can only custom lower store nodes");
5341 SDValue Value = StoreNode->getValue();
5343 EVT VT = Value.getValueType();
5344 EVT MemVT = StoreNode->getMemoryVT();
5346 if (VT.isVector()) {
5347 if (useSVEForFixedLengthVectorVT(
5349 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
5350 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
5352 unsigned AS = StoreNode->getAddressSpace();
5353 Align Alignment = StoreNode->getAlign();
5354 if (Alignment < MemVT.getStoreSize() &&
5355 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
5356 StoreNode->getMemOperand()->getFlags(),
5357 nullptr)) {
5358 return scalarizeVectorStore(StoreNode, DAG);
5361 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
5362 MemVT == MVT::v4i8) {
5363 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
5365 // 256-bit non-temporal stores can be lowered to STNP. Do this as part of
5366 // the custom lowering, as there are no un-paired non-temporal stores and
5367 // legalization will break up 256-bit inputs.
5368 ElementCount EC = MemVT.getVectorElementCount();
5369 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
5370 EC.isKnownEven() &&
5371 ((MemVT.getScalarSizeInBits() == 8u ||
5372 MemVT.getScalarSizeInBits() == 16u ||
5373 MemVT.getScalarSizeInBits() == 32u ||
5374 MemVT.getScalarSizeInBits() == 64u))) {
5375 SDValue Lo =
5376 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
5377 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
5378 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
5379 SDValue Hi =
5380 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
5381 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
5382 StoreNode->getValue(),
5383 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
5384 SDValue Result = DAG.getMemIntrinsicNode(
5385 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
5386 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
5387 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
5388 return Result;
5390 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
5391 return LowerStore128(Op, DAG);
5392 } else if (MemVT == MVT::i64x8) {
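  // Split a 512-bit LS64 value into eight consecutive i64 stores.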
5393 SDValue Value = StoreNode->getValue();
5394 assert(Value->getValueType(0) == MVT::i64x8);
5395 SDValue Chain = StoreNode->getChain();
5396 SDValue Base = StoreNode->getBasePtr();
5397 EVT PtrVT = Base.getValueType();
5398 for (unsigned i = 0; i < 8; i++) {
5399 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
5400 Value, DAG.getConstant(i, Dl, MVT::i32));
5401 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
5402 DAG.getConstant(i * 8, Dl, PtrVT));
5403 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
5404 StoreNode->getOriginalAlign());
5406 return Chain;
5409 return SDValue();
5412 /// Lower atomic or volatile 128-bit stores to a single STP instruction.
5413 SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
5414 SelectionDAG &DAG) const {
5415 MemSDNode *StoreNode = cast<MemSDNode>(Op);
5416 assert(StoreNode->getMemoryVT() == MVT::i128);
5417 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
5418 assert(!StoreNode->isAtomic() ||
5419 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
5420 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
5422 SDValue Value = StoreNode->getOpcode() == ISD::STORE
5423 ? StoreNode->getOperand(1)
5424 : StoreNode->getOperand(2);
5425 SDLoc DL(Op);
5426 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
5427 DAG.getConstant(0, DL, MVT::i64));
5428 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
5429 DAG.getConstant(1, DL, MVT::i64));
5430 SDValue Result = DAG.getMemIntrinsicNode(
5431 AArch64ISD::STP, DL, DAG.getVTList(MVT::Other),
5432 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
5433 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
5434 return Result;
5437 SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
5438 SelectionDAG &DAG) const {
5439 SDLoc DL(Op);
5440 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
5441 assert(LoadNode && "Expected custom lowering of a load node");
5443 if (LoadNode->getMemoryVT() == MVT::i64x8) {
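  // Reassemble a 512-bit LS64 value from eight consecutive i64 loads.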
5444 SmallVector<SDValue, 8> Ops;
5445 SDValue Base = LoadNode->getBasePtr();
5446 SDValue Chain = LoadNode->getChain();
5447 EVT PtrVT = Base.getValueType();
5448 for (unsigned i = 0; i < 8; i++) {
5449 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
5450 DAG.getConstant(i * 8, DL, PtrVT));
5451 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
5452 LoadNode->getPointerInfo(),
5453 LoadNode->getOriginalAlign());
5454 Ops.push_back(Part);
5455 Chain = SDValue(Part.getNode(), 1);
5457 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
5458 return DAG.getMergeValues({Loaded, Chain}, DL);
5461 // Custom lowering for extending v4i8 vector loads.
5462 EVT VT = Op->getValueType(0);
5463 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
5465 if (LoadNode->getMemoryVT() != MVT::v4i8)
5466 return SDValue();
5468 unsigned ExtType;
5469 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
5470 ExtType = ISD::SIGN_EXTEND;
5471 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
5472 LoadNode->getExtensionType() == ISD::EXTLOAD)
5473 ExtType = ISD::ZERO_EXTEND;
5474 else
5475 return SDValue();
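  // Load the four i8 elements as a single 32-bit (f32) value so the data lands
  // in a SIMD register, then widen it in-vector.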
5477 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
5478 LoadNode->getBasePtr(), MachinePointerInfo());
5479 SDValue Chain = Load.getValue(1);
5480 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
5481 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
5482 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
5483 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
5484 DAG.getConstant(0, DL, MVT::i64));
5485 if (VT == MVT::v4i32)
5486 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
5487 return DAG.getMergeValues({Ext, Chain}, DL);
5490 // Generate SUBS and CSEL for integer abs.
5491 SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5492 MVT VT = Op.getSimpleValueType();
5494 if (VT.isVector())
5495 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
5497 SDLoc DL(Op);
5498 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
5499 Op.getOperand(0));
5500 // Generate SUBS & CSEL.
5501 SDValue Cmp =
5502 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
5503 Op.getOperand(0), DAG.getConstant(0, DL, VT));
5504 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
5505 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
5506 Cmp.getValue(1));
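// Custom-lower BRCOND only when the condition can be emitted as a conjunction
// (a compare + conditional-compare chain); otherwise let generic lowering
// expand it.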
5509 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
5510 SDValue Chain = Op.getOperand(0);
5511 SDValue Cond = Op.getOperand(1);
5512 SDValue Dest = Op.getOperand(2);
5514 AArch64CC::CondCode CC;
5515 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
5516 SDLoc dl(Op);
5517 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
5518 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
5519 Cmp);
5522 return SDValue();
5525 SDValue AArch64TargetLowering::LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const {
5526 assert(Op->getOpcode() == ISD::ZERO_EXTEND && "Expected ZERO_EXTEND");
5528 if (Op.getValueType().isFixedLengthVector())
5529 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
5531 // Try to lower to VSELECT to allow zext to transform into
5532 // a predicated instruction like add, sub or mul.
5533 SDValue Value = Op->getOperand(0);
5534 if (!Value->getValueType(0).isScalableVector() ||
5535 Value->getValueType(0).getScalarType() != MVT::i1)
5536 return SDValue();
5538 SDLoc DL = SDLoc(Op);
5539 EVT VT = Op->getValueType(0);
5540 SDValue Ones = DAG.getConstant(1, DL, VT);
5541 SDValue Zeros = DAG.getConstant(0, DL, VT);
5542 return DAG.getNode(ISD::VSELECT, DL, VT, Value, Ones, Zeros);
5545 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
5546 SelectionDAG &DAG) const {
5547 LLVM_DEBUG(dbgs() << "Custom lowering: ");
5548 LLVM_DEBUG(Op.dump());
5550 switch (Op.getOpcode()) {
5551 default:
5552 llvm_unreachable("unimplemented operand");
5553 return SDValue();
5554 case ISD::BITCAST:
5555 return LowerBITCAST(Op, DAG);
5556 case ISD::GlobalAddress:
5557 return LowerGlobalAddress(Op, DAG);
5558 case ISD::GlobalTLSAddress:
5559 return LowerGlobalTLSAddress(Op, DAG);
5560 case ISD::SETCC:
5561 case ISD::STRICT_FSETCC:
5562 case ISD::STRICT_FSETCCS:
5563 return LowerSETCC(Op, DAG);
5564 case ISD::SETCCCARRY:
5565 return LowerSETCCCARRY(Op, DAG);
5566 case ISD::BRCOND:
5567 return LowerBRCOND(Op, DAG);
5568 case ISD::BR_CC:
5569 return LowerBR_CC(Op, DAG);
5570 case ISD::SELECT:
5571 return LowerSELECT(Op, DAG);
5572 case ISD::SELECT_CC:
5573 return LowerSELECT_CC(Op, DAG);
5574 case ISD::JumpTable:
5575 return LowerJumpTable(Op, DAG);
5576 case ISD::BR_JT:
5577 return LowerBR_JT(Op, DAG);
5578 case ISD::ConstantPool:
5579 return LowerConstantPool(Op, DAG);
5580 case ISD::BlockAddress:
5581 return LowerBlockAddress(Op, DAG);
5582 case ISD::VASTART:
5583 return LowerVASTART(Op, DAG);
5584 case ISD::VACOPY:
5585 return LowerVACOPY(Op, DAG);
5586 case ISD::VAARG:
5587 return LowerVAARG(Op, DAG);
5588 case ISD::ADDCARRY:
5589 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
5590 case ISD::SUBCARRY:
5591 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
5592 case ISD::SADDO_CARRY:
5593 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
5594 case ISD::SSUBO_CARRY:
5595 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
5596 case ISD::SADDO:
5597 case ISD::UADDO:
5598 case ISD::SSUBO:
5599 case ISD::USUBO:
5600 case ISD::SMULO:
5601 case ISD::UMULO:
5602 return LowerXALUO(Op, DAG);
5603 case ISD::FADD:
5604 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
5605 case ISD::FSUB:
5606 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
5607 case ISD::FMUL:
5608 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
5609 case ISD::FMA:
5610 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
5611 case ISD::FDIV:
5612 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
5613 case ISD::FNEG:
5614 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
5615 case ISD::FCEIL:
5616 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
5617 case ISD::FFLOOR:
5618 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
5619 case ISD::FNEARBYINT:
5620 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
5621 case ISD::FRINT:
5622 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
5623 case ISD::FROUND:
5624 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
5625 case ISD::FROUNDEVEN:
5626 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
5627 case ISD::FTRUNC:
5628 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
5629 case ISD::FSQRT:
5630 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
5631 case ISD::FABS:
5632 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
5633 case ISD::FP_ROUND:
5634 case ISD::STRICT_FP_ROUND:
5635 return LowerFP_ROUND(Op, DAG);
5636 case ISD::FP_EXTEND:
5637 return LowerFP_EXTEND(Op, DAG);
5638 case ISD::FRAMEADDR:
5639 return LowerFRAMEADDR(Op, DAG);
5640 case ISD::SPONENTRY:
5641 return LowerSPONENTRY(Op, DAG);
5642 case ISD::RETURNADDR:
5643 return LowerRETURNADDR(Op, DAG);
5644 case ISD::ADDROFRETURNADDR:
5645 return LowerADDROFRETURNADDR(Op, DAG);
5646 case ISD::CONCAT_VECTORS:
5647 return LowerCONCAT_VECTORS(Op, DAG);
5648 case ISD::INSERT_VECTOR_ELT:
5649 return LowerINSERT_VECTOR_ELT(Op, DAG);
5650 case ISD::EXTRACT_VECTOR_ELT:
5651 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
5652 case ISD::BUILD_VECTOR:
5653 return LowerBUILD_VECTOR(Op, DAG);
5654 case ISD::VECTOR_SHUFFLE:
5655 return LowerVECTOR_SHUFFLE(Op, DAG);
5656 case ISD::SPLAT_VECTOR:
5657 return LowerSPLAT_VECTOR(Op, DAG);
5658 case ISD::EXTRACT_SUBVECTOR:
5659 return LowerEXTRACT_SUBVECTOR(Op, DAG);
5660 case ISD::INSERT_SUBVECTOR:
5661 return LowerINSERT_SUBVECTOR(Op, DAG);
5662 case ISD::SDIV:
5663 case ISD::UDIV:
5664 return LowerDIV(Op, DAG);
5665 case ISD::SMIN:
5666 case ISD::UMIN:
5667 case ISD::SMAX:
5668 case ISD::UMAX:
5669 return LowerMinMax(Op, DAG);
5670 case ISD::SRA:
5671 case ISD::SRL:
5672 case ISD::SHL:
5673 return LowerVectorSRA_SRL_SHL(Op, DAG);
5674 case ISD::SHL_PARTS:
5675 case ISD::SRL_PARTS:
5676 case ISD::SRA_PARTS:
5677 return LowerShiftParts(Op, DAG);
5678 case ISD::CTPOP:
5679 case ISD::PARITY:
5680 return LowerCTPOP_PARITY(Op, DAG);
5681 case ISD::FCOPYSIGN:
5682 return LowerFCOPYSIGN(Op, DAG);
5683 case ISD::OR:
5684 return LowerVectorOR(Op, DAG);
5685 case ISD::XOR:
5686 return LowerXOR(Op, DAG);
5687 case ISD::PREFETCH:
5688 return LowerPREFETCH(Op, DAG);
5689 case ISD::SINT_TO_FP:
5690 case ISD::UINT_TO_FP:
5691 case ISD::STRICT_SINT_TO_FP:
5692 case ISD::STRICT_UINT_TO_FP:
5693 return LowerINT_TO_FP(Op, DAG);
5694 case ISD::FP_TO_SINT:
5695 case ISD::FP_TO_UINT:
5696 case ISD::STRICT_FP_TO_SINT:
5697 case ISD::STRICT_FP_TO_UINT:
5698 return LowerFP_TO_INT(Op, DAG);
5699 case ISD::FP_TO_SINT_SAT:
5700 case ISD::FP_TO_UINT_SAT:
5701 return LowerFP_TO_INT_SAT(Op, DAG);
5702 case ISD::FSINCOS:
5703 return LowerFSINCOS(Op, DAG);
5704 case ISD::FLT_ROUNDS_:
5705 return LowerFLT_ROUNDS_(Op, DAG);
5706 case ISD::SET_ROUNDING:
5707 return LowerSET_ROUNDING(Op, DAG);
5708 case ISD::MUL:
5709 return LowerMUL(Op, DAG);
5710 case ISD::MULHS:
5711 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
5712 case ISD::MULHU:
5713 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
5714 case ISD::INTRINSIC_VOID:
5715 case ISD::INTRINSIC_W_CHAIN:
5716 return LowerINTRINSIC_W_CHAIN(Op, DAG);
5717 case ISD::INTRINSIC_WO_CHAIN:
5718 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5719 case ISD::ATOMIC_STORE:
5720 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
5721 assert(Subtarget->hasLSE2());
5722 return LowerStore128(Op, DAG);
5724 return SDValue();
5725 case ISD::STORE:
5726 return LowerSTORE(Op, DAG);
5727 case ISD::MSTORE:
5728 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
5729 case ISD::MGATHER:
5730 return LowerMGATHER(Op, DAG);
5731 case ISD::MSCATTER:
5732 return LowerMSCATTER(Op, DAG);
5733 case ISD::VECREDUCE_SEQ_FADD:
5734 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
5735 case ISD::VECREDUCE_ADD:
5736 case ISD::VECREDUCE_AND:
5737 case ISD::VECREDUCE_OR:
5738 case ISD::VECREDUCE_XOR:
5739 case ISD::VECREDUCE_SMAX:
5740 case ISD::VECREDUCE_SMIN:
5741 case ISD::VECREDUCE_UMAX:
5742 case ISD::VECREDUCE_UMIN:
5743 case ISD::VECREDUCE_FADD:
5744 case ISD::VECREDUCE_FMAX:
5745 case ISD::VECREDUCE_FMIN:
5746 return LowerVECREDUCE(Op, DAG);
5747 case ISD::ATOMIC_LOAD_SUB:
5748 return LowerATOMIC_LOAD_SUB(Op, DAG);
5749 case ISD::ATOMIC_LOAD_AND:
5750 return LowerATOMIC_LOAD_AND(Op, DAG);
5751 case ISD::DYNAMIC_STACKALLOC:
5752 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5753 case ISD::VSCALE:
5754 return LowerVSCALE(Op, DAG);
5755 case ISD::ANY_EXTEND:
5756 case ISD::SIGN_EXTEND:
5757 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
5758 case ISD::ZERO_EXTEND:
5759 return LowerZERO_EXTEND(Op, DAG);
5760 case ISD::SIGN_EXTEND_INREG: {
5761 // Only custom lower when ExtraVT has a legal byte-based element type.
5762 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5763 EVT ExtraEltVT = ExtraVT.getVectorElementType();
5764 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
5765 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
5766 return SDValue();
5768 return LowerToPredicatedOp(Op, DAG,
5769 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
5771 case ISD::TRUNCATE:
5772 return LowerTRUNCATE(Op, DAG);
5773 case ISD::MLOAD:
5774 return LowerMLOAD(Op, DAG);
5775 case ISD::LOAD:
5776 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
5777 Subtarget->forceStreamingCompatibleSVE()))
5778 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
5779 return LowerLOAD(Op, DAG);
5780 case ISD::ADD:
5781 case ISD::AND:
5782 case ISD::SUB:
5783 return LowerToScalableOp(Op, DAG);
5784 case ISD::FMAXIMUM:
5785 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
5786 case ISD::FMAXNUM:
5787 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
5788 case ISD::FMINIMUM:
5789 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
5790 case ISD::FMINNUM:
5791 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
5792 case ISD::VSELECT:
5793 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
5794 case ISD::ABS:
5795 return LowerABS(Op, DAG);
5796 case ISD::ABDS:
5797 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
5798 case ISD::ABDU:
5799 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
5800 case ISD::BITREVERSE:
5801 return LowerBitreverse(Op, DAG);
5802 case ISD::BSWAP:
5803 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
5804 case ISD::CTLZ:
5805 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
5806 case ISD::CTTZ:
5807 return LowerCTTZ(Op, DAG);
5808 case ISD::VECTOR_SPLICE:
5809 return LowerVECTOR_SPLICE(Op, DAG);
5810 case ISD::STRICT_LROUND:
5811 case ISD::STRICT_LLROUND:
5812 case ISD::STRICT_LRINT:
5813 case ISD::STRICT_LLRINT: {
5814 assert(Op.getOperand(1).getValueType() == MVT::f16 &&
5815 "Expected custom lowering of rounding operations only for f16");
5816 SDLoc DL(Op);
5817 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
5818 {Op.getOperand(0), Op.getOperand(1)});
5819 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
5820 {Ext.getValue(1), Ext.getValue(0)});
5825 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
5826 return !Subtarget->useSVEForFixedLengthVectors();
5829 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
5830 EVT VT, bool OverrideNEON) const {
5831 if (!VT.isFixedLengthVector() || !VT.isSimple())
5832 return false;
5834 // Don't use SVE for vectors we cannot scalarize if required.
5835 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
5836 // Fixed length predicates should be promoted to i8.
5837 // NOTE: This is consistent with how NEON (and thus 64/128-bit vectors) works.
5838 case MVT::i1:
5839 default:
5840 return false;
5841 case MVT::i8:
5842 case MVT::i16:
5843 case MVT::i32:
5844 case MVT::i64:
5845 case MVT::f16:
5846 case MVT::f32:
5847 case MVT::f64:
5848 break;
5851 // All SVE implementations support NEON sized vectors.
5852 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
5853 return Subtarget->hasSVE();
5855 // Ensure NEON MVTs only belong to a single register class.
5856 if (VT.getFixedSizeInBits() <= 128)
5857 return false;
5859 // Ensure wider than NEON code generation is enabled.
5860 if (!Subtarget->useSVEForFixedLengthVectors())
5861 return false;
5863 // Don't use SVE for types that don't fit.
5864 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
5865 return false;
5867 // TODO: Perhaps an artificial restriction, but worth having whilst getting
5868 // the base fixed length SVE support in place.
5869 if (!VT.isPow2VectorType())
5870 return false;
5872 return true;
5875 //===----------------------------------------------------------------------===//
5876 // Calling Convention Implementation
5877 //===----------------------------------------------------------------------===//
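/// Return the intrinsic ID carried by an ISD::INTRINSIC_WO_CHAIN node, or
/// Intrinsic::not_intrinsic for any other node.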
5879 static unsigned getIntrinsicID(const SDNode *N) {
5880 unsigned Opcode = N->getOpcode();
5881 switch (Opcode) {
5882 default:
5883 return Intrinsic::not_intrinsic;
5884 case ISD::INTRINSIC_WO_CHAIN: {
5885 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
5886 if (IID < Intrinsic::num_intrinsics)
5887 return IID;
5888 return Intrinsic::not_intrinsic;
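/// Avoid reassociating an expression whose operand is a [su]mull feeding an
/// add, since that pattern can otherwise be matched to smlal/umlal.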
5893 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
5894 SDValue N1) const {
5895 if (!N0.hasOneUse())
5896 return false;
5898 unsigned IID = getIntrinsicID(N1.getNode());
5899 // Avoid reassociating expressions that can be lowered to smlal/umlal.
5900 if (IID == Intrinsic::aarch64_neon_umull ||
5901 N1.getOpcode() == AArch64ISD::UMULL ||
5902 IID == Intrinsic::aarch64_neon_smull ||
5903 N1.getOpcode() == AArch64ISD::SMULL)
5904 return N0.getOpcode() != ISD::ADD;
5906 return true;
5909 /// Selects the correct CCAssignFn for a given CallingConvention value.
5910 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
5911 bool IsVarArg) const {
5912 switch (CC) {
5913 default:
5914 report_fatal_error("Unsupported calling convention.");
5915 case CallingConv::WebKit_JS:
5916 return CC_AArch64_WebKit_JS;
5917 case CallingConv::GHC:
5918 return CC_AArch64_GHC;
5919 case CallingConv::C:
5920 case CallingConv::Fast:
5921 case CallingConv::PreserveMost:
5922 case CallingConv::CXX_FAST_TLS:
5923 case CallingConv::Swift:
5924 case CallingConv::SwiftTail:
5925 case CallingConv::Tail:
5926 if (Subtarget->isTargetWindows() && IsVarArg) {
5927 if (Subtarget->isWindowsArm64EC())
5928 return CC_AArch64_Arm64EC_VarArg;
5929 return CC_AArch64_Win64_VarArg;
5931 if (!Subtarget->isTargetDarwin())
5932 return CC_AArch64_AAPCS;
5933 if (!IsVarArg)
5934 return CC_AArch64_DarwinPCS;
5935 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
5936 : CC_AArch64_DarwinPCS_VarArg;
5937 case CallingConv::Win64:
5938 if (IsVarArg) {
5939 if (Subtarget->isWindowsArm64EC())
5940 return CC_AArch64_Arm64EC_VarArg;
5941 return CC_AArch64_Win64_VarArg;
5943 return CC_AArch64_AAPCS;
5944 case CallingConv::CFGuard_Check:
5945 return CC_AArch64_Win64_CFGuard_Check;
5946 case CallingConv::AArch64_VectorCall:
5947 case CallingConv::AArch64_SVE_VectorCall:
5948 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
5949 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
5950 return CC_AArch64_AAPCS;
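/// Selects the correct return-value CCAssignFn for a given CallingConvention
/// value.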
5954 CCAssignFn *
5955 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
5956 return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
5957 : RetCC_AArch64_AAPCS;
5961 /// Returns true if the Function has ZA state and contains at least one call to
5962 /// a function that requires setting up a lazy-save buffer.
5963 static bool requiresBufferForLazySave(const Function &F) {
5964 SMEAttrs CallerAttrs(F);
5965 if (!CallerAttrs.hasZAState())
5966 return false;
5968 for (const BasicBlock &BB : F)
5969 for (const Instruction &I : BB)
5970 if (const CallInst *Call = dyn_cast<CallInst>(&I))
5971 if (CallerAttrs.requiresLazySave(SMEAttrs(*Call)))
5972 return true;
5973 return false;
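/// Allocate a worst-case SVL.B * SVL.B lazy-save buffer on the stack, store
/// its address, and create an additional 16-byte TPIDR2 stack object. Returns
/// the frame index of the TPIDR2 object; Reg receives a virtual register
/// holding the buffer address.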
5976 unsigned AArch64TargetLowering::allocateLazySaveBuffer(
5977 SDValue &Chain, const SDLoc &DL, SelectionDAG &DAG, Register &Reg) const {
5978 MachineFunction &MF = DAG.getMachineFunction();
5979 MachineFrameInfo &MFI = MF.getFrameInfo();
5981 // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
5982 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5983 DAG.getConstant(1, DL, MVT::i32));
5984 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
5985 SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
5986 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
5987 SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
5988 unsigned FI = MFI.CreateVariableSizedObject(Align(1), nullptr);
5989 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
5990 Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, Reg, Buffer.getValue(0));
5992 // Allocate an additional TPIDR2 object on the stack (16 bytes)
5993 unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
5995 // Store the buffer pointer to the TPIDR2 stack object.
5996 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, FI);
5997 SDValue Ptr = DAG.getFrameIndex(
5998 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
5999 Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
6001 return TPIDR2Obj;
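/// Lower the incoming (formal) arguments, described by the Ins array, into the
/// specified DAG, appending one SDValue per argument to InVals.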
6004 SDValue AArch64TargetLowering::LowerFormalArguments(
6005 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
6006 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
6007 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
6008 MachineFunction &MF = DAG.getMachineFunction();
6009 const Function &F = MF.getFunction();
6010 MachineFrameInfo &MFI = MF.getFrameInfo();
6011 bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
6012 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6014 SmallVector<ISD::OutputArg, 4> Outs;
6015 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
6016 DAG.getTargetLoweringInfo(), MF.getDataLayout());
6017 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
6018 FuncInfo->setIsSVECC(true);
6020 // Assign locations to all of the incoming arguments.
6021 SmallVector<CCValAssign, 16> ArgLocs;
6022 DenseMap<unsigned, SDValue> CopiedRegs;
6023 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
6025 // At this point, Ins[].VT may already be promoted to i32. To correctly
6026 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
6027 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
6028 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
6029 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
6030 // LocVT.
6031 unsigned NumArgs = Ins.size();
6032 Function::const_arg_iterator CurOrigArg = F.arg_begin();
6033 unsigned CurArgIdx = 0;
6034 for (unsigned i = 0; i != NumArgs; ++i) {
6035 MVT ValVT = Ins[i].VT;
6036 if (Ins[i].isOrigArg()) {
6037 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
6038 CurArgIdx = Ins[i].getOrigArgIndex();
6040 // Get type of the original argument.
6041 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
6042 /*AllowUnknown*/ true);
6043 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
6044 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6045 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6046 ValVT = MVT::i8;
6047 else if (ActualMVT == MVT::i16)
6048 ValVT = MVT::i16;
6050 bool UseVarArgCC = false;
6051 if (IsWin64)
6052 UseVarArgCC = isVarArg;
6053 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
6054 bool Res =
6055 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
6056 assert(!Res && "Call operand has unhandled type");
6057 (void)Res;
6060 SMEAttrs Attrs(MF.getFunction());
6061 bool IsLocallyStreaming =
6062 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
6063 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
6064 SDValue Glue = Chain.getValue(1);
6066 SmallVector<SDValue, 16> ArgValues;
6067 unsigned ExtraArgLocs = 0;
6068 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
6069 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
6071 if (Ins[i].Flags.isByVal()) {
6072 // Byval is used for HFAs in the PCS, but the system should work in a
6073 // non-compliant manner for larger structs.
6074 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6075 int Size = Ins[i].Flags.getByValSize();
6076 unsigned NumRegs = (Size + 7) / 8;
6078 // FIXME: This works on big-endian for composite byvals, which are the common
6079 // case. It should also work for fundamental types.
6080 unsigned FrameIdx =
6081 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
6082 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
6083 InVals.push_back(FrameIdxN);
6085 continue;
6088 if (Ins[i].Flags.isSwiftAsync())
6089 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6091 SDValue ArgValue;
6092 if (VA.isRegLoc()) {
6093 // Arguments stored in registers.
6094 EVT RegVT = VA.getLocVT();
6095 const TargetRegisterClass *RC;
6097 if (RegVT == MVT::i32)
6098 RC = &AArch64::GPR32RegClass;
6099 else if (RegVT == MVT::i64)
6100 RC = &AArch64::GPR64RegClass;
6101 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
6102 RC = &AArch64::FPR16RegClass;
6103 else if (RegVT == MVT::f32)
6104 RC = &AArch64::FPR32RegClass;
6105 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
6106 RC = &AArch64::FPR64RegClass;
6107 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
6108 RC = &AArch64::FPR128RegClass;
6109 else if (RegVT.isScalableVector() &&
6110 RegVT.getVectorElementType() == MVT::i1) {
6111 FuncInfo->setIsSVECC(true);
6112 RC = &AArch64::PPRRegClass;
6113 } else if (RegVT.isScalableVector()) {
6114 FuncInfo->setIsSVECC(true);
6115 RC = &AArch64::ZPRRegClass;
6116 } else
6117 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
6119 // Transform the arguments in physical registers into virtual ones.
6120 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
6122 if (IsLocallyStreaming) {
6123 // LocallyStreamingFunctions must insert the SMSTART in the correct
6124 // position, so we use Glue to ensure no instructions can be scheduled
6125 // between the chain of:
6126 // t0: ch,glue = EntryNode
6127 // t1: res,ch,glue = CopyFromReg
6128 // ...
6129 // tn: res,ch,glue = CopyFromReg t(n-1), ..
6130 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
6131 // ^^^^^^
6132 // This will be the new Chain/Root node.
6133 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
6134 Glue = ArgValue.getValue(2);
6135 } else
6136 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
6138 // If this is an 8, 16 or 32-bit value, it is really passed promoted
6139 // to 64 bits. Insert an assert[sz]ext to capture this, then
6140 // truncate to the right size.
6141 switch (VA.getLocInfo()) {
6142 default:
6143 llvm_unreachable("Unknown loc info!");
6144 case CCValAssign::Full:
6145 break;
6146 case CCValAssign::Indirect:
6147 assert((VA.getValVT().isScalableVector() ||
6148 Subtarget->isWindowsArm64EC()) &&
6149 "Indirect arguments should be scalable on most subtargets");
6150 break;
6151 case CCValAssign::BCvt:
6152 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
6153 break;
6154 case CCValAssign::AExt:
6155 case CCValAssign::SExt:
6156 case CCValAssign::ZExt:
6157 break;
6158 case CCValAssign::AExtUpper:
6159 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
6160 DAG.getConstant(32, DL, RegVT));
6161 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
6162 break;
6164 } else { // VA.isRegLoc()
6165 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
6166 unsigned ArgOffset = VA.getLocMemOffset();
6167 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
6168 ? VA.getLocVT().getSizeInBits()
6169 : VA.getValVT().getSizeInBits()) / 8;
6171 uint32_t BEAlign = 0;
6172 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
6173 !Ins[i].Flags.isInConsecutiveRegs())
6174 BEAlign = 8 - ArgSize;
6176 SDValue FIN;
6177 MachinePointerInfo PtrInfo;
6178 if (isVarArg && Subtarget->isWindowsArm64EC()) {
6179 // In the ARM64EC varargs convention, fixed arguments on the stack are
6180 // accessed relative to x4, not sp.
6181 unsigned ObjOffset = ArgOffset + BEAlign;
6182 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
6183 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
6184 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
6185 DAG.getConstant(ObjOffset, DL, MVT::i64));
6186 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
6187 } else {
6188 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
6190 // Create load nodes to retrieve arguments from the stack.
6191 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
6192 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
6195 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
6196 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
6197 MVT MemVT = VA.getValVT();
6199 switch (VA.getLocInfo()) {
6200 default:
6201 break;
6202 case CCValAssign::Trunc:
6203 case CCValAssign::BCvt:
6204 MemVT = VA.getLocVT();
6205 break;
6206 case CCValAssign::Indirect:
6207 assert((VA.getValVT().isScalableVector() ||
6208 Subtarget->isWindowsArm64EC()) &&
6209 "Indirect arguments should be scalable on most subtargets");
6210 MemVT = VA.getLocVT();
6211 break;
6212 case CCValAssign::SExt:
6213 ExtType = ISD::SEXTLOAD;
6214 break;
6215 case CCValAssign::ZExt:
6216 ExtType = ISD::ZEXTLOAD;
6217 break;
6218 case CCValAssign::AExt:
6219 ExtType = ISD::EXTLOAD;
6220 break;
6223 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
6224 MemVT);
6227 if (VA.getLocInfo() == CCValAssign::Indirect) {
6228 assert(
6229 (VA.getValVT().isScalableVector() || Subtarget->isWindowsArm64EC()) &&
6230 "Indirect arguments should be scalable on most subtargets");
6232 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize();
6233 unsigned NumParts = 1;
6234 if (Ins[i].Flags.isInConsecutiveRegs()) {
6235 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
6236 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
6237 ++NumParts;
6240 MVT PartLoad = VA.getValVT();
6241 SDValue Ptr = ArgValue;
6243 // Ensure we generate all loads for each tuple part, whilst updating the
6244 // pointer after each load correctly using vscale.
6245 while (NumParts > 0) {
6246 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
6247 InVals.push_back(ArgValue);
6248 NumParts--;
6249 if (NumParts > 0) {
6250 SDValue BytesIncrement;
6251 if (PartLoad.isScalableVector()) {
6252 BytesIncrement = DAG.getVScale(
6253 DL, Ptr.getValueType(),
6254 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
6255 } else {
6256 BytesIncrement = DAG.getConstant(
6257 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize), DL,
6258 Ptr.getValueType());
6260 SDNodeFlags Flags;
6261 Flags.setNoUnsignedWrap(true);
6262 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6263 BytesIncrement, Flags);
6264 ExtraArgLocs++;
6265 i++;
6268 } else {
6269 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
6270 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
6271 ArgValue, DAG.getValueType(MVT::i32));
6273 // i1 arguments are zero-extended to i8 by the caller. Emit a
6274 // hint to reflect this.
6275 if (Ins[i].isOrigArg()) {
6276 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
6277 if (OrigArg->getType()->isIntegerTy(1)) {
6278 if (!Ins[i].Flags.isZExt()) {
6279 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
6280 ArgValue.getValueType(), ArgValue);
6285 InVals.push_back(ArgValue);
6288 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
6290 // Insert the SMSTART if this is a locally streaming function and
6291 // make sure it is Glued to the last CopyFromReg value.
6292 if (IsLocallyStreaming) {
6293 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6294 Chain = DAG.getNode(
6295 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6296 {DAG.getRoot(),
6297 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32),
6298 DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64),
6299 DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()), Glue});
6300 // Ensure that the SMSTART happens after the CopyWithChain such that its
6301 // chain result is used.
6302 for (unsigned I=0; I<InVals.size(); ++I) {
6303 Register Reg = MF.getRegInfo().createVirtualRegister(
6304 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
6305 SDValue X = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
6306 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
6307 InVals[I].getValueType());
6311 // varargs
6312 if (isVarArg) {
6313 if (!Subtarget->isTargetDarwin() || IsWin64) {
6314 // The AAPCS variadic function ABI is identical to the non-variadic
6315 // one. As a result there may be more arguments in registers and we should
6316 // save them for future reference.
6317 // Win64 variadic functions also pass arguments in registers, but all float
6318 // arguments are passed in integer registers.
6319 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
6322 // This will point to the next argument passed via stack.
6323 unsigned StackOffset = CCInfo.getNextStackOffset();
6324 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
6325 StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
6326 FuncInfo->setVarArgsStackOffset(StackOffset);
6327 FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
6329 if (MFI.hasMustTailInVarArgFunc()) {
6330 SmallVector<MVT, 2> RegParmTypes;
6331 RegParmTypes.push_back(MVT::i64);
6332 RegParmTypes.push_back(MVT::f128);
6333 // Compute the set of forwarded registers. The rest are scratch.
6334 SmallVectorImpl<ForwardedRegister> &Forwards =
6335 FuncInfo->getForwardedMustTailRegParms();
6336 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
6337 CC_AArch64_AAPCS);
6339 // Conservatively forward X8, since it might be used for aggregate return.
6340 if (!CCInfo.isAllocated(AArch64::X8)) {
6341 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
6342 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
6347 // On Windows, InReg pointers must be returned, so record the pointer in a
6348 // virtual register at the start of the function so it can be returned in the
6349 // epilogue.
6350 if (IsWin64) {
6351 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
6352 if (Ins[I].Flags.isInReg() && Ins[I].Flags.isSRet()) {
6353 assert(!FuncInfo->getSRetReturnReg());
6355 MVT PtrTy = getPointerTy(DAG.getDataLayout());
6356 Register Reg =
6357 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
6358 FuncInfo->setSRetReturnReg(Reg);
6360 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
6361 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
6362 break;
6367 unsigned StackArgSize = CCInfo.getNextStackOffset();
6368 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
6369 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
6370 // This is a non-standard ABI so by fiat I say we're allowed to make full
6371 // use of the stack area to be popped, which must be aligned to 16 bytes in
6372 // any case:
6373 StackArgSize = alignTo(StackArgSize, 16);
6375 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
6376 // a multiple of 16.
6377 FuncInfo->setArgumentStackToRestore(StackArgSize);
6379 // This realignment carries over to the available bytes below. Our own
6380 // callers will guarantee the space is free by giving an aligned value to
6381 // CALLSEQ_START.
6383 // Even if we're not expected to free up the space, it's useful to know how
6384 // much is there while considering tail calls (because we can reuse it).
6385 FuncInfo->setBytesInStackArgArea(StackArgSize);
6387 if (Subtarget->hasCustomCallingConv())
6388 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
6390 if (requiresBufferForLazySave(MF.getFunction())) {
6391 // Set up a buffer once and store the buffer in the MachineFunctionInfo.
6392 Register Reg;
6393 unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG, Reg);
6394 FuncInfo->setLazySaveBufferReg(Reg);
6395 FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
6398 return Chain;
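/// Spill the remaining unallocated GPR (x0-x7, or x0-x3 for Arm64EC) and FPR
/// (q0-q7) argument registers to the stack so that va_start can find them.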
6401 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
6402 SelectionDAG &DAG,
6403 const SDLoc &DL,
6404 SDValue &Chain) const {
6405 MachineFunction &MF = DAG.getMachineFunction();
6406 MachineFrameInfo &MFI = MF.getFrameInfo();
6407 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6408 auto PtrVT = getPointerTy(DAG.getDataLayout());
6409 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
6411 SmallVector<SDValue, 8> MemOps;
6413 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
6414 AArch64::X3, AArch64::X4, AArch64::X5,
6415 AArch64::X6, AArch64::X7 };
6416 unsigned NumGPRArgRegs = std::size(GPRArgRegs);
6417 if (Subtarget->isWindowsArm64EC()) {
6418 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
6419 // functions.
6420 NumGPRArgRegs = 4;
6422 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
6424 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
6425 int GPRIdx = 0;
6426 if (GPRSaveSize != 0) {
6427 if (IsWin64) {
6428 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
6429 if (GPRSaveSize & 15)
6430 // The extra size here, if triggered, will always be 8.
6431 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
6432 } else
6433 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
6435 SDValue FIN;
6436 if (Subtarget->isWindowsArm64EC()) {
6437 // With the Arm64EC ABI, we reserve the save area as usual, but we
6438 // compute its address relative to x4. For a normal AArch64->AArch64
6439 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
6440 // different address.
6441 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
6442 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
6443 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
6444 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
6445 } else {
6446 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
6449 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
6450 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
6451 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
6452 SDValue Store =
6453 DAG.getStore(Val.getValue(1), DL, Val, FIN,
6454 IsWin64 ? MachinePointerInfo::getFixedStack(
6455 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
6456 : MachinePointerInfo::getStack(MF, i * 8));
6457 MemOps.push_back(Store);
6458 FIN =
6459 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
6462 FuncInfo->setVarArgsGPRIndex(GPRIdx);
6463 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
6465 if (Subtarget->hasFPARMv8() && !IsWin64) {
6466 static const MCPhysReg FPRArgRegs[] = {
6467 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
6468 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
6469 static const unsigned NumFPRArgRegs = std::size(FPRArgRegs);
6470 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
6472 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
6473 int FPRIdx = 0;
6474 if (FPRSaveSize != 0) {
6475 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
6477 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
6479 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
6480 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
6481 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
6483 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
6484 MachinePointerInfo::getStack(MF, i * 16));
6485 MemOps.push_back(Store);
6486 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
6487 DAG.getConstant(16, DL, PtrVT));
6490 FuncInfo->setVarArgsFPRIndex(FPRIdx);
6491 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
6494 if (!MemOps.empty()) {
6495 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
6499 /// LowerCallResult - Lower the result values of a call into the
6500 /// appropriate copies out of appropriate physical registers.
6501 SDValue AArch64TargetLowering::LowerCallResult(
6502 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
6503 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
6504 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
6505 SDValue ThisVal) const {
6506 DenseMap<unsigned, SDValue> CopiedRegs;
6507 // Copy all of the result registers out of their specified physreg.
6508 for (unsigned i = 0; i != RVLocs.size(); ++i) {
6509 CCValAssign VA = RVLocs[i];
6511 // Pass 'this' value directly from the argument to return value, to avoid
6512 // reg unit interference
6513 if (i == 0 && isThisReturn) {
6514 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
6515 "unexpected return calling convention register assignment");
6516 InVals.push_back(ThisVal);
6517 continue;
6520 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
6521 // allows one use of a physreg per block.
6522 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
6523 if (!Val) {
6524 Val =
6525 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
6526 Chain = Val.getValue(1);
6527 InFlag = Val.getValue(2);
6528 CopiedRegs[VA.getLocReg()] = Val;
6531 switch (VA.getLocInfo()) {
6532 default:
6533 llvm_unreachable("Unknown loc info!");
6534 case CCValAssign::Full:
6535 break;
6536 case CCValAssign::BCvt:
6537 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
6538 break;
6539 case CCValAssign::AExtUpper:
6540 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
6541 DAG.getConstant(32, DL, VA.getLocVT()));
6542 [[fallthrough]];
6543 case CCValAssign::AExt:
6544 [[fallthrough]];
6545 case CCValAssign::ZExt:
6546 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
6547 break;
6550 InVals.push_back(Val);
6553 return Chain;
6556 /// Return true if the calling convention is one that we can guarantee TCO for.
6557 static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
6558 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
6559 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
6562 /// Return true if we might ever do TCO for calls with this calling convention.
6563 static bool mayTailCallThisCC(CallingConv::ID CC) {
6564 switch (CC) {
6565 case CallingConv::C:
6566 case CallingConv::AArch64_SVE_VectorCall:
6567 case CallingConv::PreserveMost:
6568 case CallingConv::Swift:
6569 case CallingConv::SwiftTail:
6570 case CallingConv::Tail:
6571 case CallingConv::Fast:
6572 return true;
6573 default:
6574 return false;
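/// Run the calling-convention assignment over a call's outgoing arguments,
/// switching to the vararg CC where required (all arguments of a Win64 vararg
/// call, or only the variadic arguments elsewhere).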
6578 static void analyzeCallOperands(const AArch64TargetLowering &TLI,
6579 const AArch64Subtarget *Subtarget,
6580 const TargetLowering::CallLoweringInfo &CLI,
6581 CCState &CCInfo) {
6582 const SelectionDAG &DAG = CLI.DAG;
6583 CallingConv::ID CalleeCC = CLI.CallConv;
6584 bool IsVarArg = CLI.IsVarArg;
6585 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
6586 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
6588 unsigned NumArgs = Outs.size();
6589 for (unsigned i = 0; i != NumArgs; ++i) {
6590 MVT ArgVT = Outs[i].VT;
6591 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
6593 bool UseVarArgCC = false;
6594 if (IsVarArg) {
6595 // On Windows, the fixed arguments in a vararg call are passed in GPRs
6596 // too, so use the vararg CC to force them to integer registers.
6597 if (IsCalleeWin64) {
6598 UseVarArgCC = true;
6599 } else {
6600 UseVarArgCC = !Outs[i].IsFixed;
6602 } else {
6603 // Get type of the original argument.
6604 EVT ActualVT =
6605 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
6606 /*AllowUnknown*/ true);
6607 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
6608 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6609 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6610 ArgVT = MVT::i8;
6611 else if (ActualMVT == MVT::i16)
6612 ArgVT = MVT::i16;
6615 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
6616 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
6617 assert(!Res && "Call operand has unhandled type");
6618 (void)Res;
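/// Check whether the call described by CLI can be lowered as a tail call,
/// given the caller's and callee's calling conventions, preserved registers
/// and argument assignments.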
6622 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
6623 const CallLoweringInfo &CLI) const {
6624 CallingConv::ID CalleeCC = CLI.CallConv;
6625 if (!mayTailCallThisCC(CalleeCC))
6626 return false;
6628 SDValue Callee = CLI.Callee;
6629 bool IsVarArg = CLI.IsVarArg;
6630 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
6631 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
6632 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
6633 const SelectionDAG &DAG = CLI.DAG;
6634 MachineFunction &MF = DAG.getMachineFunction();
6635 const Function &CallerF = MF.getFunction();
6636 CallingConv::ID CallerCC = CallerF.getCallingConv();
6638 // SME Streaming functions are not eligible for TCO as they may require
6639 // the streaming mode or ZA to be restored after returning from the call.
6640 SMEAttrs CallerAttrs(MF.getFunction());
6641 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
6642 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
6643 CallerAttrs.requiresLazySave(CalleeAttrs))
6644 return false;
6646 // Functions using the C or Fast calling convention that have an SVE signature
6647 // preserve more registers and should assume the SVE_VectorCall CC.
6648 // The check for matching callee-saved regs will determine whether it is
6649 // eligible for TCO.
6650 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
6651 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
6652 CallerCC = CallingConv::AArch64_SVE_VectorCall;
6654 bool CCMatch = CallerCC == CalleeCC;
6656 // When using the Windows calling convention on a non-windows OS, we want
6657 // to back up and restore X18 in such functions; we can't do a tail call
6658 // from those functions.
6659 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
6660 CalleeCC != CallingConv::Win64)
6661 return false;
6663 // Byval parameters hand the function a pointer directly into the stack area
6664 // we want to reuse during a tail call. Working around this *is* possible (see
6665 // X86) but less efficient and uglier in LowerCall.
6666 for (Function::const_arg_iterator i = CallerF.arg_begin(),
6667 e = CallerF.arg_end();
6668 i != e; ++i) {
6669 if (i->hasByValAttr())
6670 return false;
6672 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
6673 // In this case, it is necessary to save/restore X0 in the callee. Tail
6674 // call opt interferes with this. So we disable tail call opt when the
6675 // caller has an argument with "inreg" attribute.
6677 // FIXME: Check whether the callee also has an "inreg" argument.
6678 if (i->hasInRegAttr())
6679 return false;
6682 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
6683 return CCMatch;
6685 // Externally-defined functions with weak linkage should not be
6686 // tail-called on AArch64 when the OS does not support dynamic
6687 // pre-emption of symbols, as the AAELF spec requires normal calls
6688 // to undefined weak functions to be replaced with a NOP or jump to the
6689 // next instruction. The behaviour of branch instructions in this
6690 // situation (as used for tail calls) is implementation-defined, so we
6691 // cannot rely on the linker replacing the tail call with a return.
6692 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
6693 const GlobalValue *GV = G->getGlobal();
6694 const Triple &TT = getTargetMachine().getTargetTriple();
6695 if (GV->hasExternalWeakLinkage() &&
6696 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
6697 return false;
6700 // Now we search for cases where we can use a tail call without changing the
6701 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
6702 // concept.
6704 // I want anyone implementing a new calling convention to think long and hard
6705 // about this assert.
6706 assert((!IsVarArg || CalleeCC == CallingConv::C) &&
6707 "Unexpected variadic calling convention");
6709 LLVMContext &C = *DAG.getContext();
6710 // Check that the call results are passed in the same way.
6711 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
6712 CCAssignFnForCall(CalleeCC, IsVarArg),
6713 CCAssignFnForCall(CallerCC, IsVarArg)))
6714 return false;
6715 // The callee has to preserve all registers the caller needs to preserve.
6716 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6717 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
6718 if (!CCMatch) {
6719 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
6720 if (Subtarget->hasCustomCallingConv()) {
6721 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
6722 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
6724 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
6725 return false;
6728 // Nothing more to check if the callee is taking no arguments
6729 if (Outs.empty())
6730 return true;
6732 SmallVector<CCValAssign, 16> ArgLocs;
6733 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
6735 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
6737 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
6738 // When we are musttail, additional checks have already been done, so we can
6739 // safely skip this check.
6739 // At least two cases here: if caller is fastcc then we can't have any
6740 // memory arguments (we'd be expected to clean up the stack afterwards). If
6741 // caller is C then we could potentially use its argument area.
6743 // FIXME: for now we take the most conservative of these in both cases:
6744 // disallow all variadic memory operands.
6745 for (const CCValAssign &ArgLoc : ArgLocs)
6746 if (!ArgLoc.isRegLoc())
6747 return false;
6750 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6752 // If any of the arguments is passed indirectly, it must be SVE, so the
6753 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
6754 // allocate space on the stack. That is why, when any argument is passed
6755 // indirectly, we explicitly decide here that the call cannot be a tail call.
6756 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
6757 assert((A.getLocInfo() != CCValAssign::Indirect ||
6758 A.getValVT().isScalableVector() ||
6759 Subtarget->isWindowsArm64EC()) &&
6760 "Expected value to be scalable");
6761 return A.getLocInfo() == CCValAssign::Indirect;
6763 return false;
6765 // If the stack arguments for this call do not fit into our own save area then
6766 // the call cannot be made tail.
6767 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
6768 return false;
6770 const MachineRegisterInfo &MRI = MF.getRegInfo();
6771 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
6772 return false;
6774 return true;
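/// Build a TokenFactor that makes stores to the clobbered fixed stack slot
/// (ClobberedFI) depend on any loads from overlapping incoming-argument slots,
/// so that a tail call's argument stores cannot overwrite values that are
/// still to be read.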
6777 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
6778 SelectionDAG &DAG,
6779 MachineFrameInfo &MFI,
6780 int ClobberedFI) const {
6781 SmallVector<SDValue, 8> ArgChains;
6782 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
6783 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
6785 // Include the original chain at the beginning of the list. When this is
6786 // used by target LowerCall hooks, this helps legalize find the
6787 // CALLSEQ_BEGIN node.
6788 ArgChains.push_back(Chain);
6790 // Add a chain value for each stack argument load that overlaps ClobberedFI.
6791 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
6792 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
6793 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
6794 if (FI->getIndex() < 0) {
6795 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
6796 int64_t InLastByte = InFirstByte;
6797 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
6799 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
6800 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
6801 ArgChains.push_back(SDValue(L, 1));
6804 // Build a tokenfactor for all the chains.
6805 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
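/// Return true if the callee pops its own stack arguments: fastcc with
/// GuaranteedTailCallOpt, and the tail/swifttail calling conventions.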
6808 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
6809 bool TailCallOpt) const {
6810 return (CallCC == CallingConv::Fast && TailCallOpt) ||
6811 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
6814 // Check if the value is zero-extended from i1 to i8
6815 static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
6816 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
6817 if (SizeInBits < 8)
6818 return false;
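// For a bool zero-extended from i1 to i8, bits 1-7 (mask 0xFE) must be known
// to be zero.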
6820 APInt RequiredZero(SizeInBits, 0xFE);
6821 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
6822 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
6823 return ZExtBool;
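/// Emit a node to change the streaming mode (PSTATE.SM): AArch64ISD::SMSTART
/// when Enable is true, AArch64ISD::SMSTOP otherwise, using the SMSTART/SMSTOP
/// call-preserved register mask.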
6826 SDValue AArch64TargetLowering::changeStreamingMode(
6827 SelectionDAG &DAG, SDLoc DL, bool Enable,
6828 SDValue Chain, SDValue InFlag, SDValue PStateSM, bool Entry) const {
6829 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6830 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
6831 SDValue MSROp =
6832 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
6834 SDValue ExpectedSMVal =
6835 DAG.getTargetConstant(Entry ? Enable : !Enable, DL, MVT::i64);
6836 SmallVector<SDValue> Ops = {Chain, MSROp, PStateSM, ExpectedSMVal, RegMask};
6838 if (InFlag)
6839 Ops.push_back(InFlag);
6841 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
6842 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
6845 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
6846 /// and add input and output parameter nodes.
6847 SDValue
6848 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
6849 SmallVectorImpl<SDValue> &InVals) const {
6850 SelectionDAG &DAG = CLI.DAG;
6851 SDLoc &DL = CLI.DL;
6852 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
6853 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
6854 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
6855 SDValue Chain = CLI.Chain;
6856 SDValue Callee = CLI.Callee;
6857 bool &IsTailCall = CLI.IsTailCall;
6858 CallingConv::ID &CallConv = CLI.CallConv;
6859 bool IsVarArg = CLI.IsVarArg;
6861 MachineFunction &MF = DAG.getMachineFunction();
6862 MachineFunction::CallSiteInfo CSInfo;
6863 bool IsThisReturn = false;
6865 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6866 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
6867 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
6868 bool IsSibCall = false;
6869 bool GuardWithBTI = false;
6871 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
6872 !Subtarget->noBTIAtReturnTwice()) {
6873 GuardWithBTI = FuncInfo->branchTargetEnforcement();
6876 // Analyze operands of the call, assigning locations to each operand.
6877 SmallVector<CCValAssign, 16> ArgLocs;
6878 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
6880 if (IsVarArg) {
6881 unsigned NumArgs = Outs.size();
6883 for (unsigned i = 0; i != NumArgs; ++i) {
6884 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
6885 report_fatal_error("Passing SVE types to variadic functions is "
6886 "currently not supported");
6890 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
6892 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
6893 // Assign locations to each value returned by this call.
6894 SmallVector<CCValAssign, 16> RVLocs;
6895 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
6896 *DAG.getContext());
6897 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
6899 // Check callee args/returns for SVE registers and set calling convention
6900 // accordingly.
6901 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
6902 auto HasSVERegLoc = [](CCValAssign &Loc) {
6903 if (!Loc.isRegLoc())
6904 return false;
6905 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
6906 AArch64::PPRRegClass.contains(Loc.getLocReg());
6908 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
6909 CallConv = CallingConv::AArch64_SVE_VectorCall;
6912 if (IsTailCall) {
6913 // Check if it's really possible to do a tail call.
6914 IsTailCall = isEligibleForTailCallOptimization(CLI);
6916 // A sibling call is one where we're under the usual C ABI and not planning
6917 // to change that but can still do a tail call:
6918 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
6919 CallConv != CallingConv::SwiftTail)
6920 IsSibCall = true;
6922 if (IsTailCall)
6923 ++NumTailCalls;
6926 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
6927 report_fatal_error("failed to perform tail call elimination on a call "
6928 "site marked musttail");
6930 // Get a count of how many bytes are to be pushed on the stack.
6931 unsigned NumBytes = CCInfo.getNextStackOffset();
6933 if (IsSibCall) {
6934 // Since we're not changing the ABI to make this a tail call, the memory
6935 // operands are already available in the caller's incoming argument space.
6936 NumBytes = 0;
6939 // FPDiff is the byte offset of the call's argument area from the callee's.
6940 // Stores to callee stack arguments will be placed in FixedStackSlots offset
6941 // by this amount for a tail call. In a sibling call it must be 0 because the
6942 // caller will deallocate the entire stack and the callee still expects its
6943 // arguments to begin at SP+0. Completely unused for non-tail calls.
6944 int FPDiff = 0;
6946 if (IsTailCall && !IsSibCall) {
6947 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
6949 // Since callee will pop argument stack as a tail call, we must keep the
6950 // popped size 16-byte aligned.
6951 NumBytes = alignTo(NumBytes, 16);
6953 // FPDiff will be negative if this tail call requires more space than we
6954 // would automatically have in our incoming argument space. Positive if we
6955 // can actually shrink the stack.
6956 FPDiff = NumReusableBytes - NumBytes;
6958 // Update the required reserved area if this is the tail call requiring the
6959 // most argument stack space.
6960 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
6961 FuncInfo->setTailCallReservedStack(-FPDiff);
6963 // The stack pointer must be 16-byte aligned at all times it's used for a
6964 // memory operation, which in practice means at *all* times and in
6965 // particular across call boundaries. Therefore our own arguments started at
6966 // a 16-byte aligned SP and the delta applied for the tail call should
6967 // satisfy the same constraint.
6968 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
6971 // Determine whether we need any streaming mode changes.
6972 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
6973 if (CLI.CB)
6974 CalleeAttrs = SMEAttrs(*CLI.CB);
6975 else if (Optional<SMEAttrs> Attrs =
6976 getCalleeAttrsFromExternalFunction(CLI.Callee))
6977 CalleeAttrs = *Attrs;
6979 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
6981 MachineFrameInfo &MFI = MF.getFrameInfo();
6982 if (RequiresLazySave) {
6983 // Set up a lazy save mechanism by storing the runtime live slices
6984 // (worst-case N*N) to the TPIDR2 stack object.
6985 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6986 DAG.getConstant(1, DL, MVT::i32));
6987 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
6988 unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
6990 if (!TPIDR2Obj) {
6991 Register Reg;
6992 TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG, Reg);
6995 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
6996 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
6997 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
6998 SDValue BufferPtrAddr =
6999 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
7000 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
7001 Chain = DAG.getTruncStore(Chain, DL, NN, BufferPtrAddr, MPI, MVT::i16);
7002 Chain = DAG.getNode(
7003 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
7004 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
7005 TPIDR2ObjAddr);
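// If this call requires a streaming-mode change, first materialise the current
// value of PSTATE.SM so the change can be made relative to it.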
7008 SDValue PStateSM;
7009 Optional<bool> RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
7010 if (RequiresSMChange)
7011 PStateSM = getPStateSM(DAG, Chain, CallerAttrs, DL, MVT::i64);
7013 // Adjust the stack pointer for the new arguments...
7014 // These operations are automatically eliminated by the prolog/epilog pass
7015 if (!IsSibCall)
7016 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
7018 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
7019 getPointerTy(DAG.getDataLayout()));
7021 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
7022 SmallSet<unsigned, 8> RegsUsed;
7023 SmallVector<SDValue, 8> MemOpChains;
7024 auto PtrVT = getPointerTy(DAG.getDataLayout());
7026 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
7027 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
7028 for (const auto &F : Forwards) {
7029 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
7030 RegsToPass.emplace_back(F.PReg, Val);
7034 // Walk the register/memloc assignments, inserting copies/loads.
7035 unsigned ExtraArgLocs = 0;
7036 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
7037 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7038 SDValue Arg = OutVals[i];
7039 ISD::ArgFlagsTy Flags = Outs[i].Flags;
7041 // Promote the value if needed.
7042 switch (VA.getLocInfo()) {
7043 default:
7044 llvm_unreachable("Unknown loc info!");
7045 case CCValAssign::Full:
7046 break;
7047 case CCValAssign::SExt:
7048 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
7049 break;
7050 case CCValAssign::ZExt:
7051 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
7052 break;
7053 case CCValAssign::AExt:
7054 if (Outs[i].ArgVT == MVT::i1) {
7055 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
7057 // Check if we actually have to do this, because the value may
7058 // already be zero-extended.
7060 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
7061 // and rely on DAGCombiner to fold this, because the following
7062 // (anyext i32) is combined with (zext i8) in DAG.getNode:
7064 // (ext (zext x)) -> (zext x)
7066 // This will give us (zext i32), which we cannot remove, so
7067 // try to check this beforehand.
7068 if (!checkZExtBool(Arg, DAG)) {
7069 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
7070 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
7073 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
7074 break;
7075 case CCValAssign::AExtUpper:
7076 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
7077 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
7078 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
7079 DAG.getConstant(32, DL, VA.getLocVT()));
7080 break;
7081 case CCValAssign::BCvt:
7082 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
7083 break;
7084 case CCValAssign::Trunc:
7085 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
7086 break;
7087 case CCValAssign::FPExt:
7088 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
7089 break;
7090 case CCValAssign::Indirect:
7091 bool isScalable = VA.getValVT().isScalableVector();
7092 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
7093 "Indirect arguments should be scalable on most subtargets");
7095 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize();
7096 uint64_t PartSize = StoreSize;
7097 unsigned NumParts = 1;
7098 if (Outs[i].Flags.isInConsecutiveRegs()) {
7099 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
7100 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7101 ++NumParts;
7102 StoreSize *= NumParts;
7105 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
7106 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
7107 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
7108 if (isScalable)
7109 MFI.setStackID(FI, TargetStackID::ScalableVector);
7111 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
7112 SDValue Ptr = DAG.getFrameIndex(
7113 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7114 SDValue SpillSlot = Ptr;
7116 // Ensure we generate all stores for each tuple part, whilst updating the
7117 // pointer after each store correctly using vscale.
7118 while (NumParts) {
7119 Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
7120 NumParts--;
7121 if (NumParts > 0) {
7122 SDValue BytesIncrement;
7123 if (isScalable) {
7124 BytesIncrement = DAG.getVScale(
7125 DL, Ptr.getValueType(),
7126 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
7127 } else {
7128 BytesIncrement = DAG.getConstant(
7129 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize), DL,
7130 Ptr.getValueType());
7132 SDNodeFlags Flags;
7133 Flags.setNoUnsignedWrap(true);
7135 MPI = MachinePointerInfo(MPI.getAddrSpace());
7136 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7137 BytesIncrement, Flags);
7138 ExtraArgLocs++;
7139 i++;
7143 Arg = SpillSlot;
7144 break;
7147 if (VA.isRegLoc()) {
7148 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
7149 Outs[0].VT == MVT::i64) {
7150 assert(VA.getLocVT() == MVT::i64 &&
7151 "unexpected calling convention register assignment");
7152 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
7153 "unexpected use of 'returned'");
7154 IsThisReturn = true;
7156 if (RegsUsed.count(VA.getLocReg())) {
7157 // If this register has already been used then we're trying to pack
7158 // parts of an [N x i32] into an X-register. The extension type will
7159 // take care of putting the two halves in the right place but we have to
7160 // combine them.
7161 SDValue &Bits =
7162 llvm::find_if(RegsToPass,
7163 [=](const std::pair<unsigned, SDValue> &Elt) {
7164 return Elt.first == VA.getLocReg();
7166 ->second;
7167 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
7168       // Call site info is used for a function's parameter entry-value
7169 // tracking. For now we track only the simple cases where a parameter
7170 // is transferred through a whole register.
7171 llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) {
7172 return ArgReg.Reg == VA.getLocReg();
7174 } else {
7175         // Add an extra level of indirection for streaming mode changes by
7176 // using a pseudo copy node that the simple register coalescer cannot
7177 // rematerialise between a smstart/smstop and the call.
7178 if (RequiresSMChange && isa<FrameIndexSDNode>(Arg))
7179 Arg = DAG.getNode(AArch64ISD::OBSCURE_COPY, DL, MVT::i64, Arg);
7180 RegsToPass.emplace_back(VA.getLocReg(), Arg);
7181 RegsUsed.insert(VA.getLocReg());
7182 const TargetOptions &Options = DAG.getTarget().Options;
7183 if (Options.EmitCallSiteInfo)
7184 CSInfo.emplace_back(VA.getLocReg(), i);
7186 } else {
7187 assert(VA.isMemLoc());
7189 SDValue DstAddr;
7190 MachinePointerInfo DstInfo;
7192 // FIXME: This works on big-endian for composite byvals, which are the
7193 // common case. It should also work for fundamental types.
7194 uint32_t BEAlign = 0;
7195 unsigned OpSize;
7196 if (VA.getLocInfo() == CCValAssign::Indirect ||
7197 VA.getLocInfo() == CCValAssign::Trunc)
7198 OpSize = VA.getLocVT().getFixedSizeInBits();
7199 else
7200 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
7201 : VA.getValVT().getSizeInBits();
7202 OpSize = (OpSize + 7) / 8;
7203 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
7204 !Flags.isInConsecutiveRegs()) {
7205 if (OpSize < 8)
7206 BEAlign = 8 - OpSize;
7208 unsigned LocMemOffset = VA.getLocMemOffset();
7209 int32_t Offset = LocMemOffset + BEAlign;
7210 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
7211 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
7213 if (IsTailCall) {
7214 Offset = Offset + FPDiff;
7215 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
7217 DstAddr = DAG.getFrameIndex(FI, PtrVT);
7218 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
7220 // Make sure any stack arguments overlapping with where we're storing
7221 // are loaded before this eventual operation. Otherwise they'll be
7222 // clobbered.
7223 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
7224 } else {
7225 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
7227 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
7228 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
7231 if (Outs[i].Flags.isByVal()) {
7232 SDValue SizeNode =
7233 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
7234 SDValue Cpy = DAG.getMemcpy(
7235 Chain, DL, DstAddr, Arg, SizeNode,
7236 Outs[i].Flags.getNonZeroByValAlign(),
7237 /*isVol = */ false, /*AlwaysInline = */ false,
7238 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
7240 MemOpChains.push_back(Cpy);
7241 } else {
7242         // Since we pass i1/i8/i16 as i1/i8/i16 on the stack and Arg is already
7243 // promoted to a legal register type i32, we should truncate Arg back to
7244 // i1/i8/i16.
7245 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
7246 VA.getValVT() == MVT::i16)
7247 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
7249 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
7250 MemOpChains.push_back(Store);
7255 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
7256 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
7257 // describing the argument list. x4 contains the address of the
7258 // first stack parameter. x5 contains the size in bytes of all parameters
7259 // passed on the stack.
7260 RegsToPass.emplace_back(AArch64::X4, StackPtr);
7261 RegsToPass.emplace_back(AArch64::X5,
7262 DAG.getConstant(NumBytes, DL, MVT::i64));
7265 if (!MemOpChains.empty())
7266 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
7268 SDValue InFlag;
7269 if (RequiresSMChange) {
7270 SDValue NewChain = changeStreamingMode(DAG, DL, *RequiresSMChange, Chain,
7271 InFlag, PStateSM, true);
7272 Chain = NewChain.getValue(0);
7273 InFlag = NewChain.getValue(1);
7276 // Build a sequence of copy-to-reg nodes chained together with token chain
7277 // and flag operands which copy the outgoing args into the appropriate regs.
7278 for (auto &RegToPass : RegsToPass) {
7279 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
7280 RegToPass.second, InFlag);
7281 InFlag = Chain.getValue(1);
7284 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
7285 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
7286 // node so that legalize doesn't hack it.
7287 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
7288 auto GV = G->getGlobal();
7289 unsigned OpFlags =
7290 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
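    // Callees that must be referenced indirectly (for example dllimport'ed
    // functions on Windows and Arm64EC targets) are classified with MO_GOT,
    // so their address is loaded via LOADgot rather than taken directly.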
7291 if (OpFlags & AArch64II::MO_GOT) {
7292 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
7293 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
7294 } else {
7295 const GlobalValue *GV = G->getGlobal();
7296 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
7298 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
7299 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
7300 Subtarget->isTargetMachO()) {
7301 const char *Sym = S->getSymbol();
7302 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
7303 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
7304 } else {
7305 const char *Sym = S->getSymbol();
7306 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
7310   // We don't usually want to end the call-sequence here because we would tidy
7311 // the frame up *after* the call. However, in the ABI-changing tail-call case
7312 // we've carefully laid out the parameters so that when sp is reset they'll be
7313 // in the correct location.
7314 if (IsTailCall && !IsSibCall) {
7315 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InFlag, DL);
7316 InFlag = Chain.getValue(1);
7319 std::vector<SDValue> Ops;
7320 Ops.push_back(Chain);
7321 Ops.push_back(Callee);
7323 if (IsTailCall) {
7324 // Each tail call may have to adjust the stack by a different amount, so
7325 // this information must travel along with the operation for eventual
7326 // consumption by emitEpilogue.
7327 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
7330 // Add argument registers to the end of the list so that they are known live
7331 // into the call.
7332 for (auto &RegToPass : RegsToPass)
7333 Ops.push_back(DAG.getRegister(RegToPass.first,
7334 RegToPass.second.getValueType()));
7336 // Add a register mask operand representing the call-preserved registers.
7337 const uint32_t *Mask;
7338 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7339 if (IsThisReturn) {
7340 // For 'this' returns, use the X0-preserving mask if applicable
7341 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
7342 if (!Mask) {
7343 IsThisReturn = false;
7344 Mask = TRI->getCallPreservedMask(MF, CallConv);
7346 } else
7347 Mask = TRI->getCallPreservedMask(MF, CallConv);
7349 if (Subtarget->hasCustomCallingConv())
7350 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
7352 if (TRI->isAnyArgRegReserved(MF))
7353 TRI->emitReservedArgRegCallError(MF);
7355 assert(Mask && "Missing call preserved mask for calling convention");
7356 Ops.push_back(DAG.getRegisterMask(Mask));
7358 if (InFlag.getNode())
7359 Ops.push_back(InFlag);
7361 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7363   // If we're doing a tail call, use a TC_RETURN here rather than an
7364 // actual call instruction.
7365 if (IsTailCall) {
7366 MF.getFrameInfo().setHasTailCall();
7367 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
7369 if (IsCFICall)
7370 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
7372 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
7373 return Ret;
7376 unsigned CallOpc = AArch64ISD::CALL;
7377 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
7378 // be expanded to the call, directly followed by a special marker sequence and
7379 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
7380 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
7381 assert(!IsTailCall &&
7382 "tail calls cannot be marked with clang.arc.attachedcall");
7383 CallOpc = AArch64ISD::CALL_RVMARKER;
7385 // Add a target global address for the retainRV/claimRV runtime function
7386 // just before the call target.
7387 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
7388 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
7389 Ops.insert(Ops.begin() + 1, GA);
7390 } else if (GuardWithBTI)
7391 CallOpc = AArch64ISD::CALL_BTI;
7393 // Returns a chain and a flag for retval copy to use.
7394 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
7396 if (IsCFICall)
7397 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
7399 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
7400 InFlag = Chain.getValue(1);
7401 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
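  // When the calling convention makes the callee responsible for popping its
  // stack arguments (e.g. the tail-call-optimised conventions), report the
  // 16-byte-aligned argument size to CALLSEQ_END; otherwise the caller cleans
  // up and no callee-popped bytes are reported.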
7403 uint64_t CalleePopBytes =
7404 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
7406 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InFlag, DL);
7407 InFlag = Chain.getValue(1);
7409 // Handle result values, copying them out of physregs into vregs that we
7410 // return.
7411 SDValue Result = LowerCallResult(Chain, InFlag, CallConv, IsVarArg, RVLocs,
7412 DL, DAG, InVals, IsThisReturn,
7413 IsThisReturn ? OutVals[0] : SDValue());
7415 if (!Ins.empty())
7416 InFlag = Result.getValue(Result->getNumValues() - 1);
7418 if (RequiresSMChange) {
7419 assert(PStateSM && "Expected a PStateSM to be set");
7420 Result = changeStreamingMode(DAG, DL, !*RequiresSMChange, Result, InFlag,
7421 PStateSM, false);
7424 if (RequiresLazySave) {
7425 // Unconditionally resume ZA.
7426 Result = DAG.getNode(
7427 AArch64ISD::SMSTART, DL, MVT::Other, Result,
7428 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
7429 DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
7431 // Conditionally restore the lazy save using a pseudo node.
7432 unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
7433 SDValue RegMask = DAG.getRegisterMask(
7434 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
7435 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
7436 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
7437 SDValue TPIDR2_EL0 = DAG.getNode(
7438 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
7439 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
7441 // Copy the address of the TPIDR2 block into X0 before 'calling' the
7442 // RESTORE_ZA pseudo.
7443 SDValue Glue;
7444 SDValue TPIDR2Block = DAG.getFrameIndex(
7445 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7446 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
7447 Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
7448 {Result, TPIDR2_EL0,
7449 DAG.getRegister(AArch64::X0, MVT::i64),
7450 RestoreRoutine,
7451 RegMask,
7452 Result.getValue(1)});
7454 // Finally reset the TPIDR2_EL0 register to 0.
7455 Result = DAG.getNode(
7456 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
7457 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
7458 DAG.getConstant(0, DL, MVT::i64));
7461 if (RequiresSMChange || RequiresLazySave) {
7462 for (unsigned I = 0; I < InVals.size(); ++I) {
7463 // The smstart/smstop is chained as part of the call, but when the
7464 // resulting chain is discarded (which happens when the call is not part
7465 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
7466 // smstart/smstop is chained to the result value. We can do that by doing
7467 // a vreg -> vreg copy.
7468 Register Reg = MF.getRegInfo().createVirtualRegister(
7469 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
7470 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
7471 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
7472 InVals[I].getValueType());
7476 return Result;
7479 bool AArch64TargetLowering::CanLowerReturn(
7480 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
7481 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
7482 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7483 SmallVector<CCValAssign, 16> RVLocs;
7484 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
7485 return CCInfo.CheckReturn(Outs, RetCC);
7488 SDValue
7489 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
7490 bool isVarArg,
7491 const SmallVectorImpl<ISD::OutputArg> &Outs,
7492 const SmallVectorImpl<SDValue> &OutVals,
7493 const SDLoc &DL, SelectionDAG &DAG) const {
7494 auto &MF = DAG.getMachineFunction();
7495 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7497 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7498 SmallVector<CCValAssign, 16> RVLocs;
7499 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
7500 CCInfo.AnalyzeReturn(Outs, RetCC);
7502 // Copy the result values into the output registers.
7503 SDValue Flag;
7504 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
7505 SmallSet<unsigned, 4> RegsUsed;
7506 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
7507 ++i, ++realRVLocIdx) {
7508 CCValAssign &VA = RVLocs[i];
7509 assert(VA.isRegLoc() && "Can only return in registers!");
7510 SDValue Arg = OutVals[realRVLocIdx];
7512 switch (VA.getLocInfo()) {
7513 default:
7514 llvm_unreachable("Unknown loc info!");
7515 case CCValAssign::Full:
7516 if (Outs[i].ArgVT == MVT::i1) {
7517 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
7518 // value. This is strictly redundant on Darwin (which uses "zeroext
7519 // i1"), but will be optimised out before ISel.
7520 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
7521 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
7523 break;
7524 case CCValAssign::BCvt:
7525 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
7526 break;
7527 case CCValAssign::AExt:
7528 case CCValAssign::ZExt:
7529 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
7530 break;
7531 case CCValAssign::AExtUpper:
7532 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
7533 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
7534 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
7535 DAG.getConstant(32, DL, VA.getLocVT()));
7536 break;
7539 if (RegsUsed.count(VA.getLocReg())) {
7540 SDValue &Bits =
7541 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
7542 return Elt.first == VA.getLocReg();
7543 })->second;
7544 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
7545 } else {
7546 RetVals.emplace_back(VA.getLocReg(), Arg);
7547 RegsUsed.insert(VA.getLocReg());
7551 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7553 // Emit SMSTOP before returning from a locally streaming function
7554 SMEAttrs FuncAttrs(MF.getFunction());
7555 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
7556 Chain = DAG.getNode(
7557 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
7558 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32),
7559 DAG.getConstant(1, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64),
7560 DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()));
7561 Flag = Chain.getValue(1);
7564 SmallVector<SDValue, 4> RetOps(1, Chain);
7565 for (auto &RetVal : RetVals) {
7566 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
7567 Flag = Chain.getValue(1);
7568 RetOps.push_back(
7569 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
7572   // The Windows AArch64 ABIs require that, when returning a struct by value,
7573 // we copy the sret argument into X0 for the return.
7574 // We saved the argument into a virtual register in the entry block,
7575 // so now we copy the value out and into X0.
7576 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
7577 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
7578 getPointerTy(MF.getDataLayout()));
7580 unsigned RetValReg = AArch64::X0;
7581 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
7582 Flag = Chain.getValue(1);
7584 RetOps.push_back(
7585 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
7588 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
7589 if (I) {
7590 for (; *I; ++I) {
7591 if (AArch64::GPR64RegClass.contains(*I))
7592 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
7593 else if (AArch64::FPR64RegClass.contains(*I))
7594 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
7595 else
7596 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
7600 RetOps[0] = Chain; // Update chain.
7602 // Add the flag if we have it.
7603 if (Flag.getNode())
7604 RetOps.push_back(Flag);
7606 return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
7609 //===----------------------------------------------------------------------===//
7610 // Other Lowering Code
7611 //===----------------------------------------------------------------------===//
7613 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
7614 SelectionDAG &DAG,
7615 unsigned Flag) const {
7616 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
7617 N->getOffset(), Flag);
7620 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
7621 SelectionDAG &DAG,
7622 unsigned Flag) const {
7623 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
7626 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
7627 SelectionDAG &DAG,
7628 unsigned Flag) const {
7629 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
7630 N->getOffset(), Flag);
7633 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
7634 SelectionDAG &DAG,
7635 unsigned Flag) const {
7636 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
7639 // (loadGOT sym)
7640 template <class NodeTy>
7641 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
7642 unsigned Flags) const {
7643 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
7644 SDLoc DL(N);
7645 EVT Ty = getPointerTy(DAG.getDataLayout());
7646 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
7647 // FIXME: Once remat is capable of dealing with instructions with register
7648 // operands, expand this into two nodes instead of using a wrapper node.
7649 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
7652 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
7653 template <class NodeTy>
7654 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
7655 unsigned Flags) const {
7656 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
7657 SDLoc DL(N);
7658 EVT Ty = getPointerTy(DAG.getDataLayout());
7659 const unsigned char MO_NC = AArch64II::MO_NC;
7660 return DAG.getNode(
7661 AArch64ISD::WrapperLarge, DL, Ty,
7662 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
7663 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
7664 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
7665 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
7668 // (addlow (adrp %hi(sym)) %lo(sym))
7669 template <class NodeTy>
7670 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
7671 unsigned Flags) const {
7672 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
7673 SDLoc DL(N);
7674 EVT Ty = getPointerTy(DAG.getDataLayout());
7675 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
7676 SDValue Lo = getTargetNode(N, Ty, DAG,
7677 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
7678 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
7679 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
7682 // (adr sym)
7683 template <class NodeTy>
7684 SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
7685 unsigned Flags) const {
7686 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
7687 SDLoc DL(N);
7688 EVT Ty = getPointerTy(DAG.getDataLayout());
7689 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
7690 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
7693 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
7694 SelectionDAG &DAG) const {
7695 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
7696 const GlobalValue *GV = GN->getGlobal();
7697 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
7699 if (OpFlags != AArch64II::MO_NO_FLAG)
7700 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
7701 "unexpected offset in global node");
7703   // This also catches the large code model case for Darwin, and the tiny
7704 // code model with GOT relocations.
7705 if ((OpFlags & AArch64II::MO_GOT) != 0) {
7706 return getGOT(GN, DAG, OpFlags);
7709 SDValue Result;
7710 if (getTargetMachine().getCodeModel() == CodeModel::Large) {
7711 Result = getAddrLarge(GN, DAG, OpFlags);
7712 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
7713 Result = getAddrTiny(GN, DAG, OpFlags);
7714 } else {
7715 Result = getAddr(GN, DAG, OpFlags);
7717 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7718 SDLoc DL(GN);
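  // dllimport'ed and stub-accessed globals resolve to a pointer slot (such as
  // the __imp_ symbol for dllimport), so an extra load is required to obtain
  // the real address of the global.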
7719 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_DLLIMPORTAUX |
7720 AArch64II::MO_COFFSTUB))
7721 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
7722 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
7723 return Result;
7726 /// Convert a TLS address reference into the correct sequence of loads
7727 /// and calls to compute the variable's address (for Darwin, currently) and
7728 /// return an SDValue containing the final node.
7730 /// Darwin only has one TLS scheme which must be capable of dealing with the
7731 /// fully general situation, in the worst case. This means:
7732 /// + "extern __thread" declaration.
7733 /// + Defined in a possibly unknown dynamic library.
7735 /// The general system is that each __thread variable has a [3 x i64] descriptor
7736 /// which contains information used by the runtime to calculate the address. The
7737 /// only part of this the compiler needs to know about is the first xword, which
7738 /// contains a function pointer that must be called with the address of the
7739 /// entire descriptor in "x0".
7741 /// Since this descriptor may be in a different unit, in general even the
7742 /// descriptor must be accessed via an indirect load. The "ideal" code sequence
7743 /// is:
7744 /// adrp x0, _var@TLVPPAGE
7745 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
7746 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
7747 /// ; the function pointer
7748 /// blr x1 ; Uses descriptor address in x0
7749 /// ; Address of _var is now in x0.
7751 /// If the address of _var's descriptor *is* known to the linker, then it can
7752 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
7753 /// a slight efficiency gain.
7754 SDValue
7755 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
7756 SelectionDAG &DAG) const {
7757 assert(Subtarget->isTargetDarwin() &&
7758 "This function expects a Darwin target");
7760 SDLoc DL(Op);
7761 MVT PtrVT = getPointerTy(DAG.getDataLayout());
7762 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
7763 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
7765 SDValue TLVPAddr =
7766 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
7767 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
7769 // The first entry in the descriptor is a function pointer that we must call
7770 // to obtain the address of the variable.
7771 SDValue Chain = DAG.getEntryNode();
7772 SDValue FuncTLVGet = DAG.getLoad(
7773 PtrMemVT, DL, Chain, DescAddr,
7774 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
7775 Align(PtrMemVT.getSizeInBits() / 8),
7776 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
7777 Chain = FuncTLVGet.getValue(1);
7779 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
7780 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
7782 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7783 MFI.setAdjustsStack(true);
7785 // TLS calls preserve all registers except those that absolutely must be
7786 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
7787 // silly).
7788 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7789 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
7790 if (Subtarget->hasCustomCallingConv())
7791 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
7793 // Finally, we can make the call. This is just a degenerate version of a
7794 // normal AArch64 call node: x0 takes the address of the descriptor, and
7795 // returns the address of the variable in this thread.
7796 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
7797 Chain =
7798 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
7799 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
7800 DAG.getRegisterMask(Mask), Chain.getValue(1));
7801 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
7804 /// Convert a thread-local variable reference into a sequence of instructions to
7805 /// compute the variable's address for the local exec TLS model of ELF targets.
7806 /// The sequence depends on the maximum TLS area size.
7807 SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
7808 SDValue ThreadBase,
7809 const SDLoc &DL,
7810 SelectionDAG &DAG) const {
7811 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7812 SDValue TPOff, Addr;
7814 switch (DAG.getTarget().Options.TLSSize) {
7815 default:
7816 llvm_unreachable("Unexpected TLS size");
7818 case 12: {
7819 // mrs x0, TPIDR_EL0
7820 // add x0, x0, :tprel_lo12:a
7821 SDValue Var = DAG.getTargetGlobalAddress(
7822 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
7823 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
7824 Var,
7825 DAG.getTargetConstant(0, DL, MVT::i32)),
7829 case 24: {
7830 // mrs x0, TPIDR_EL0
7831 // add x0, x0, :tprel_hi12:a
7832 // add x0, x0, :tprel_lo12_nc:a
7833 SDValue HiVar = DAG.getTargetGlobalAddress(
7834 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
7835 SDValue LoVar = DAG.getTargetGlobalAddress(
7836 GV, DL, PtrVT, 0,
7837 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
7838 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
7839 HiVar,
7840 DAG.getTargetConstant(0, DL, MVT::i32)),
7842 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
7843 LoVar,
7844 DAG.getTargetConstant(0, DL, MVT::i32)),
7848 case 32: {
7849 // mrs x1, TPIDR_EL0
7850 // movz x0, #:tprel_g1:a
7851 // movk x0, #:tprel_g0_nc:a
7852 // add x0, x1, x0
7853 SDValue HiVar = DAG.getTargetGlobalAddress(
7854 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
7855 SDValue LoVar = DAG.getTargetGlobalAddress(
7856 GV, DL, PtrVT, 0,
7857 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
7858 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
7859 DAG.getTargetConstant(16, DL, MVT::i32)),
7861 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
7862 DAG.getTargetConstant(0, DL, MVT::i32)),
7864 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
7867 case 48: {
7868 // mrs x1, TPIDR_EL0
7869 // movz x0, #:tprel_g2:a
7870 // movk x0, #:tprel_g1_nc:a
7871 // movk x0, #:tprel_g0_nc:a
7872 // add x0, x1, x0
7873 SDValue HiVar = DAG.getTargetGlobalAddress(
7874 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
7875 SDValue MiVar = DAG.getTargetGlobalAddress(
7876 GV, DL, PtrVT, 0,
7877 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
7878 SDValue LoVar = DAG.getTargetGlobalAddress(
7879 GV, DL, PtrVT, 0,
7880 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
7881 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
7882 DAG.getTargetConstant(32, DL, MVT::i32)),
7884 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
7885 DAG.getTargetConstant(16, DL, MVT::i32)),
7887 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
7888 DAG.getTargetConstant(0, DL, MVT::i32)),
7890 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
7895 /// When accessing thread-local variables under either the general-dynamic or
7896 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
7897 /// have a descriptor, accessible via a PC-relative ADRP, whose first entry is
7898 /// a function pointer that carries out the resolution.
7900 /// The sequence is:
7901 /// adrp x0, :tlsdesc:var
7902 /// ldr x1, [x0, #:tlsdesc_lo12:var]
7903 /// add x0, x0, #:tlsdesc_lo12:var
7904 /// .tlsdesccall var
7905 /// blr x1
7906 /// (TPIDR_EL0 offset now in x0)
7908 /// The above sequence must be produced unscheduled, to enable the linker to
7909 /// optimize/relax this sequence.
7910 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
7911 /// above sequence, and is expanded very late in the compilation flow, to ensure
7912 /// the sequence is emitted exactly as shown above.
7913 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
7914 const SDLoc &DL,
7915 SelectionDAG &DAG) const {
7916 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7918 SDValue Chain = DAG.getEntryNode();
7919 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7921 Chain =
7922 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
7923 SDValue Glue = Chain.getValue(1);
7925 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
7928 SDValue
7929 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
7930 SelectionDAG &DAG) const {
7931 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
7933 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
7935 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
7937 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
7938 if (Model == TLSModel::LocalDynamic)
7939 Model = TLSModel::GeneralDynamic;
7942 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
7943 Model != TLSModel::LocalExec)
7944 report_fatal_error("ELF TLS only supported in small memory model or "
7945 "in local exec TLS model");
7946 // Different choices can be made for the maximum size of the TLS area for a
7947 // module. For the small address model, the default TLS size is 16MiB and the
7948 // maximum TLS size is 4GiB.
7949 // FIXME: add tiny and large code model support for TLS access models other
7950 // than local exec. We currently generate the same code as small for tiny,
7951 // which may be larger than needed.
7953 SDValue TPOff;
7954 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7955 SDLoc DL(Op);
7956 const GlobalValue *GV = GA->getGlobal();
7958 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
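  // THREAD_POINTER reads TPIDR_EL0; each access model below produces an offset
  // (TPOff) that is added to this thread base to form the variable's address.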
7960 if (Model == TLSModel::LocalExec) {
7961 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
7962 } else if (Model == TLSModel::InitialExec) {
7963 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
7964 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
7965 } else if (Model == TLSModel::LocalDynamic) {
7966     // Local-dynamic accesses proceed in two phases: a general-dynamic TLS
7967 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
7968 // the beginning of the module's TLS region, followed by a DTPREL offset
7969 // calculation.
7971 // These accesses will need deduplicating if there's more than one.
7972 AArch64FunctionInfo *MFI =
7973 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
7974 MFI->incNumLocalDynamicTLSAccesses();
7976 // The call needs a relocation too for linker relaxation. It doesn't make
7977 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
7978 // the address.
7979 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
7980 AArch64II::MO_TLS);
7982 // Now we can calculate the offset from TPIDR_EL0 to this module's
7983 // thread-local area.
7984 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
7986 // Now use :dtprel_whatever: operations to calculate this variable's offset
7987 // in its thread-storage area.
7988 SDValue HiVar = DAG.getTargetGlobalAddress(
7989 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
7990 SDValue LoVar = DAG.getTargetGlobalAddress(
7991 GV, DL, MVT::i64, 0,
7992 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
7994 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
7995 DAG.getTargetConstant(0, DL, MVT::i32)),
7997 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
7998 DAG.getTargetConstant(0, DL, MVT::i32)),
8000 } else if (Model == TLSModel::GeneralDynamic) {
8001 // The call needs a relocation too for linker relaxation. It doesn't make
8002 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
8003 // the address.
8004 SDValue SymAddr =
8005 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8007 // Finally we can make a call to calculate the offset from tpidr_el0.
8008 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
8009 } else
8010 llvm_unreachable("Unsupported ELF TLS access model");
8012 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8015 SDValue
8016 AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
8017 SelectionDAG &DAG) const {
8018 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
8020 SDValue Chain = DAG.getEntryNode();
8021 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8022 SDLoc DL(Op);
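  // On Windows, x18 is reserved as the platform register and holds a pointer
  // to the TEB (Thread Environment Block) of the current thread.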
8024 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
8026 // Load the ThreadLocalStoragePointer from the TEB
8027 // A pointer to the TLS array is located at offset 0x58 from the TEB.
8028 SDValue TLSArray =
8029 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
8030 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
8031 Chain = TLSArray.getValue(1);
8033   // Load the TLS index from the C runtime.
8034 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
8035 // This also does the same as LOADgot, but using a generic i32 load,
8036 // while LOADgot only loads i64.
8037 SDValue TLSIndexHi =
8038 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
8039 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
8040 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8041 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
8042 SDValue TLSIndex =
8043 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
8044 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
8045 Chain = TLSIndex.getValue(1);
8047   // The pointer to the thread's TLS data area is at offset TLSIndex * 8 into
8048 // the TLSArray.
8049 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
8050 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
8051 DAG.getConstant(3, DL, PtrVT));
8052 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
8053 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
8054 MachinePointerInfo());
8055 Chain = TLS.getValue(1);
8057 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
8058 const GlobalValue *GV = GA->getGlobal();
8059 SDValue TGAHi = DAG.getTargetGlobalAddress(
8060 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
8061 SDValue TGALo = DAG.getTargetGlobalAddress(
8062 GV, DL, PtrVT, 0,
8063 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8065 // Add the offset from the start of the .tls section (section base).
8066 SDValue Addr =
8067 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
8068 DAG.getTargetConstant(0, DL, MVT::i32)),
8070 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
8071 return Addr;
8074 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
8075 SelectionDAG &DAG) const {
8076 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
8077 if (DAG.getTarget().useEmulatedTLS())
8078 return LowerToTLSEmulatedModel(GA, DAG);
8080 if (Subtarget->isTargetDarwin())
8081 return LowerDarwinGlobalTLSAddress(Op, DAG);
8082 if (Subtarget->isTargetELF())
8083 return LowerELFGlobalTLSAddress(Op, DAG);
8084 if (Subtarget->isTargetWindows())
8085 return LowerWindowsGlobalTLSAddress(Op, DAG);
8087 llvm_unreachable("Unexpected platform trying to use TLS");
8090 // Looks through \param Val to determine the bit that can be used to
8091 // check the sign of the value. It returns the unextended value and
8092 // the sign bit position.
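// For example, (sign_extend_inreg x, i8) yields {x, 7}, and a plain i32 value
// yields {value, 31}.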
8093 std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
8094 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
8095 return {Val.getOperand(0),
8096 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
8099 if (Val.getOpcode() == ISD::SIGN_EXTEND)
8100 return {Val.getOperand(0),
8101 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
8103 return {Val, Val.getValueSizeInBits() - 1};
8106 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
8107 SDValue Chain = Op.getOperand(0);
8108 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
8109 SDValue LHS = Op.getOperand(2);
8110 SDValue RHS = Op.getOperand(3);
8111 SDValue Dest = Op.getOperand(4);
8112 SDLoc dl(Op);
8114 MachineFunction &MF = DAG.getMachineFunction();
8115 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
8116 // will not be produced, as they are conditional branch instructions that do
8117 // not set flags.
8118 bool ProduceNonFlagSettingCondBr =
8119 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
8121 // Handle f128 first, since lowering it will result in comparing the return
8122 // value of a libcall against zero, which is just what the rest of LowerBR_CC
8123 // is expecting to deal with.
8124 if (LHS.getValueType() == MVT::f128) {
8125 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
8127 // If softenSetCCOperands returned a scalar, we need to compare the result
8128 // against zero to select between true and false values.
8129 if (!RHS.getNode()) {
8130 RHS = DAG.getConstant(0, dl, LHS.getValueType());
8131 CC = ISD::SETNE;
8135 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
8136 // instruction.
8137 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
8138 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
8139 // Only lower legal XALUO ops.
8140 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
8141 return SDValue();
8143 // The actual operation with overflow check.
8144 AArch64CC::CondCode OFCC;
8145 SDValue Value, Overflow;
8146 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
8148 if (CC == ISD::SETNE)
8149 OFCC = getInvertedCondCode(OFCC);
8150 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
8152 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
8153 Overflow);
8156 if (LHS.getValueType().isInteger()) {
8157 assert((LHS.getValueType() == RHS.getValueType()) &&
8158 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
8160 // If the RHS of the comparison is zero, we can potentially fold this
8161 // to a specialized branch.
8162 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
8163 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
8164 if (CC == ISD::SETEQ) {
8165 // See if we can use a TBZ to fold in an AND as well.
8166 // TBZ has a smaller branch displacement than CBZ. If the offset is
8167 // out of bounds, a late MI-layer pass rewrites branches.
8168 // 403.gcc is an example that hits this case.
8169 if (LHS.getOpcode() == ISD::AND &&
8170 isa<ConstantSDNode>(LHS.getOperand(1)) &&
8171 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
8172 SDValue Test = LHS.getOperand(0);
8173 uint64_t Mask = LHS.getConstantOperandVal(1);
8174 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
8175 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
8176 Dest);
8179 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
8180 } else if (CC == ISD::SETNE) {
8181 // See if we can use a TBZ to fold in an AND as well.
8182 // TBZ has a smaller branch displacement than CBZ. If the offset is
8183 // out of bounds, a late MI-layer pass rewrites branches.
8184 // 403.gcc is an example that hits this case.
8185 if (LHS.getOpcode() == ISD::AND &&
8186 isa<ConstantSDNode>(LHS.getOperand(1)) &&
8187 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
8188 SDValue Test = LHS.getOperand(0);
8189 uint64_t Mask = LHS.getConstantOperandVal(1);
8190 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
8191 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
8192 Dest);
8195 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
8196 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
8197 // Don't combine AND since emitComparison converts the AND to an ANDS
8198 // (a.k.a. TST) and the test in the test bit and branch instruction
8199 // becomes redundant. This would also increase register pressure.
8200 uint64_t SignBitPos;
8201 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
8202 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
8203 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
8206 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
8207 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
8208 // Don't combine AND since emitComparison converts the AND to an ANDS
8209 // (a.k.a. TST) and the test in the test bit and branch instruction
8210 // becomes redundant. This would also increase register pressure.
8211 uint64_t SignBitPos;
8212 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
8213 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
8214 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
8217 SDValue CCVal;
8218 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
8219 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
8220 Cmp);
8223 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
8224 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
8226 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
8227 // clean. Some of them require two branches to implement.
8228 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
8229 AArch64CC::CondCode CC1, CC2;
8230 changeFPCCToAArch64CC(CC, CC1, CC2);
8231 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
8232 SDValue BR1 =
8233 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
8234 if (CC2 != AArch64CC::AL) {
8235 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
8236 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
8237 Cmp);
8240 return BR1;
8243 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
8244 SelectionDAG &DAG) const {
8245 if (!Subtarget->hasNEON())
8246 return SDValue();
8248 EVT VT = Op.getValueType();
8249 EVT IntVT = VT.changeTypeToInteger();
8250 SDLoc DL(Op);
8252 SDValue In1 = Op.getOperand(0);
8253 SDValue In2 = Op.getOperand(1);
8254 EVT SrcVT = In2.getValueType();
8256 if (!SrcVT.bitsEq(VT))
8257 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
8259 if (VT.isScalableVector())
8260 IntVT =
8261 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
8263 if (VT.isFixedLengthVector() && useSVEForFixedLengthVectorVT(VT)) {
8264 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
8266 In1 = convertToScalableVector(DAG, ContainerVT, In1);
8267 In2 = convertToScalableVector(DAG, ContainerVT, In2);
8269 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
8270 return convertFromScalableVector(DAG, VT, Res);
8273 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
8274 if (VT.isScalableVector())
8275 return getSVESafeBitCast(VT, Op, DAG);
8277 return DAG.getBitcast(VT, Op);
8280 SDValue VecVal1, VecVal2;
8281 EVT VecVT;
8282 auto SetVecVal = [&](int Idx = -1) {
8283 if (!VT.isVector()) {
8284 VecVal1 =
8285 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
8286 VecVal2 =
8287 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
8288 } else {
8289 VecVal1 = BitCast(VecVT, In1, DAG);
8290 VecVal2 = BitCast(VecVT, In2, DAG);
8293 if (VT.isVector()) {
8294 VecVT = IntVT;
8295 SetVecVal();
8296 } else if (VT == MVT::f64) {
8297 VecVT = MVT::v2i64;
8298 SetVecVal(AArch64::dsub);
8299 } else if (VT == MVT::f32) {
8300 VecVT = MVT::v4i32;
8301 SetVecVal(AArch64::ssub);
8302 } else if (VT == MVT::f16) {
8303 VecVT = MVT::v8i16;
8304 SetVecVal(AArch64::hsub);
8305 } else {
8306 llvm_unreachable("Invalid type for copysign!");
8309 unsigned BitWidth = In1.getScalarValueSizeInBits();
8310 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
8312 // We want to materialize a mask with every bit but the high bit set, but the
8313 // AdvSIMD immediate moves cannot materialize that in a single instruction for
8314 // 64-bit elements. Instead, materialize all bits set and then negate that.
8315 if (VT == MVT::f64 || VT == MVT::v2f64) {
8316 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
8317 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
8318 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
8319 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
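  // BSP selects bits from VecVal1 where the mask is set and from VecVal2 where
  // it is clear, so with a mask of all-but-the-sign-bit the result keeps the
  // magnitude of In1 and takes the sign of In2, i.e. copysign.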
8322 SDValue BSP =
8323 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
8324 if (VT == MVT::f16)
8325 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
8326 if (VT == MVT::f32)
8327 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
8328 if (VT == MVT::f64)
8329 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
8331 return BitCast(VT, BSP, DAG);
8334 SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
8335 SelectionDAG &DAG) const {
8336 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
8337 Attribute::NoImplicitFloat))
8338 return SDValue();
8340 if (!Subtarget->hasNEON())
8341 return SDValue();
8343 bool IsParity = Op.getOpcode() == ISD::PARITY;
8345   // Although there is no dedicated integer popcount instruction, CTPOP can be
8346 // lowered efficiently to the following sequence, which uses AdvSIMD
8347 // registers/instructions, as long as the copies to/from the AdvSIMD
8348 // registers are cheap.
8349 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
8350 // CNT V0.8B, V0.8B // 8xbyte pop-counts
8351 // ADDV B0, V0.8B // sum 8xbyte pop-counts
8352 // UMOV X0, V0.B[0] // copy byte result back to integer reg
8353 SDValue Val = Op.getOperand(0);
8354 SDLoc DL(Op);
8355 EVT VT = Op.getValueType();
8357 if (VT == MVT::i32 || VT == MVT::i64) {
8358 if (VT == MVT::i32)
8359 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
8360 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
8362 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
8363 SDValue UaddLV = DAG.getNode(
8364 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
8365 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
8367 if (IsParity)
8368 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
8369 DAG.getConstant(1, DL, MVT::i32));
8371 if (VT == MVT::i64)
8372 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
8373 return UaddLV;
8374 } else if (VT == MVT::i128) {
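    // For i128, reinterpret the value as sixteen bytes and sum the per-byte
    // counts with UADDLV, just like the i32/i64 case above.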
8375 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
8377 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
8378 SDValue UaddLV = DAG.getNode(
8379 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
8380 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
8382 if (IsParity)
8383 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
8384 DAG.getConstant(1, DL, MVT::i32));
8386 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
8389 assert(!IsParity && "ISD::PARITY of vector types not supported");
8391 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
8392 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
8394 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
8395 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
8396 "Unexpected type for custom ctpop lowering");
8398 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
8399 Val = DAG.getBitcast(VT8Bit, Val);
8400 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
8402 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
8403 unsigned EltSize = 8;
8404 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
8405 while (EltSize != VT.getScalarSizeInBits()) {
8406 EltSize *= 2;
8407 NumElts /= 2;
8408 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
8409 Val = DAG.getNode(
8410 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
8411 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
8414 return Val;
8417 SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
8418 EVT VT = Op.getValueType();
8419 assert(VT.isScalableVector() ||
8420 useSVEForFixedLengthVectorVT(
8421 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
8423 SDLoc DL(Op);
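  // There is no dedicated CTTZ instruction for these types; instead use the
  // identity cttz(x) == ctlz(bitreverse(x)).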
8424 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
8425 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
8428 SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
8429 SelectionDAG &DAG) const {
8431 EVT VT = Op.getValueType();
8432 SDLoc DL(Op);
8433 unsigned Opcode = Op.getOpcode();
8434 ISD::CondCode CC;
8435 switch (Opcode) {
8436 default:
8437 llvm_unreachable("Wrong instruction");
8438 case ISD::SMAX:
8439 CC = ISD::SETGT;
8440 break;
8441 case ISD::SMIN:
8442 CC = ISD::SETLT;
8443 break;
8444 case ISD::UMAX:
8445 CC = ISD::SETUGT;
8446 break;
8447 case ISD::UMIN:
8448 CC = ISD::SETULT;
8449 break;
8452 if (VT.isScalableVector() ||
8453 useSVEForFixedLengthVectorVT(
8454 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
8455 switch (Opcode) {
8456 default:
8457 llvm_unreachable("Wrong instruction");
8458 case ISD::SMAX:
8459 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
8460 case ISD::SMIN:
8461 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
8462 case ISD::UMAX:
8463 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
8464 case ISD::UMIN:
8465 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
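  // Otherwise expand to an explicit compare followed by a select on the
  // comparison result.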
8469 SDValue Op0 = Op.getOperand(0);
8470 SDValue Op1 = Op.getOperand(1);
8471 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
8472 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
8475 SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
8476 SelectionDAG &DAG) const {
8477 EVT VT = Op.getValueType();
8479 if (VT.isScalableVector() ||
8480 useSVEForFixedLengthVectorVT(
8481 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
8482 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
8484 SDLoc DL(Op);
8485 SDValue REVB;
8486 MVT VST;
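  // Reverse the bytes within each element with REV32/REV64, then reverse the
  // bits within each byte with a byte-wide BITREVERSE; together this reverses
  // the bits of each element.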
8488 switch (VT.getSimpleVT().SimpleTy) {
8489 default:
8490 llvm_unreachable("Invalid type for bitreverse!");
8492 case MVT::v2i32: {
8493 VST = MVT::v8i8;
8494 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
8496 break;
8499 case MVT::v4i32: {
8500 VST = MVT::v16i8;
8501 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
8503 break;
8506 case MVT::v1i64: {
8507 VST = MVT::v8i8;
8508 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
8510 break;
8513 case MVT::v2i64: {
8514 VST = MVT::v16i8;
8515 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
8517 break;
8521 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
8522 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
8525 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
8527 if (Op.getValueType().isVector())
8528 return LowerVSETCC(Op, DAG);
8530 bool IsStrict = Op->isStrictFPOpcode();
8531 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
8532 unsigned OpNo = IsStrict ? 1 : 0;
8533 SDValue Chain;
8534 if (IsStrict)
8535 Chain = Op.getOperand(0);
8536 SDValue LHS = Op.getOperand(OpNo + 0);
8537 SDValue RHS = Op.getOperand(OpNo + 1);
8538 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
8539 SDLoc dl(Op);
8541 // We chose ZeroOrOneBooleanContents, so use zero and one.
8542 EVT VT = Op.getValueType();
8543 SDValue TVal = DAG.getConstant(1, dl, VT);
8544 SDValue FVal = DAG.getConstant(0, dl, VT);
8546 // Handle f128 first, since one possible outcome is a normal integer
8547 // comparison which gets picked up by the next if statement.
8548 if (LHS.getValueType() == MVT::f128) {
8549 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
8550 IsSignaling);
8552 // If softenSetCCOperands returned a scalar, use it.
8553 if (!RHS.getNode()) {
8554 assert(LHS.getValueType() == Op.getValueType() &&
8555 "Unexpected setcc expansion!");
8556 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
8560 if (LHS.getValueType().isInteger()) {
8561 SDValue CCVal;
8562 SDValue Cmp = getAArch64Cmp(
8563 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
8565 // Note that we inverted the condition above, so we reverse the order of
8566 // the true and false operands here. This will allow the setcc to be
8567 // matched to a single CSINC instruction.
8568 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
8569 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
8572 // Now we know we're dealing with FP values.
8573 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
8574 LHS.getValueType() == MVT::f64);
8576   // For FP values we need to perform an FCMP + CSEL sequence. Go ahead
8577 // and do the comparison.
8578 SDValue Cmp;
8579 if (IsStrict)
8580 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
8581 else
8582 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
8584 AArch64CC::CondCode CC1, CC2;
8585 changeFPCCToAArch64CC(CC, CC1, CC2);
8586 SDValue Res;
8587 if (CC2 == AArch64CC::AL) {
8588 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
8589 CC2);
8590 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
8592 // Note that we inverted the condition above, so we reverse the order of
8593 // the true and false operands here. This will allow the setcc to be
8594 // matched to a single CSINC instruction.
8595 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
8596 } else {
8597 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
8598     // totally clean. Some of them require two CSELs to implement. As in this
8599 // case, we emit the first CSEL and then emit a second using the output
8600 // of the first as the RHS. We're effectively OR'ing the two CC's together.
8602 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
8603 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
8604 SDValue CS1 =
8605 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
8607 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
8608 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
8610 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
8613 SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
8614 SelectionDAG &DAG) const {
8616 SDValue LHS = Op.getOperand(0);
8617 SDValue RHS = Op.getOperand(1);
8618 EVT VT = LHS.getValueType();
8619 if (VT != MVT::i32 && VT != MVT::i64)
8620 return SDValue();
8622 SDLoc DL(Op);
8623 SDValue Carry = Op.getOperand(2);
8624 // SBCS uses a carry, not a borrow, so the carry flag should be inverted first.
8625 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
8626 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
8627 LHS, RHS, InvCarry);
8629 EVT OpVT = Op.getValueType();
8630 SDValue TVal = DAG.getConstant(1, DL, OpVT);
8631 SDValue FVal = DAG.getConstant(0, DL, OpVT);
8633 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
8634 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
8635 SDValue CCVal =
8636 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
8637 // Inputs are swapped because the condition is inverted. This will allow
8638 // matching with a single CSINC instruction.
8639 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
8640 Cmp.getValue(1));
8643 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
8644 SDValue RHS, SDValue TVal,
8645 SDValue FVal, const SDLoc &dl,
8646 SelectionDAG &DAG) const {
8647 // Handle f128 first, because it will result in a comparison of some RTLIB
8648 // call result against zero.
8649 if (LHS.getValueType() == MVT::f128) {
8650 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
8652 // If softenSetCCOperands returned a scalar, we need to compare the result
8653 // against zero to select between true and false values.
8654 if (!RHS.getNode()) {
8655 RHS = DAG.getConstant(0, dl, LHS.getValueType());
8656 CC = ISD::SETNE;
8660 // Also handle f16, for which we need to do a f32 comparison.
8661 if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
8662 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
8663 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
8666 // Next, handle integers.
8667 if (LHS.getValueType().isInteger()) {
8668 assert((LHS.getValueType() == RHS.getValueType()) &&
8669 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
8671 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
8672 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
8673 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
8674 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
8675 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
8676 // supported types.
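// For example, with i64 operands, (select_cc setgt, x, -1, 1, -1) becomes
//   (or (sra x, 63), 1)
// which evaluates to 1 when x >= 0 and to -1 when x < 0.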
8677 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
8678 CTVal->isOne() && CFVal->isAllOnes() &&
8679 LHS.getValueType() == TVal.getValueType()) {
8680 EVT VT = LHS.getValueType();
8681 SDValue Shift =
8682 DAG.getNode(ISD::SRA, dl, VT, LHS,
8683 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
8684 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
8687 unsigned Opcode = AArch64ISD::CSEL;
8689 // If both the TVal and the FVal are constants, see if we can swap them in
8690 // order to form a CSINV or CSINC out of them.
8691 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
8692 std::swap(TVal, FVal);
8693 std::swap(CTVal, CFVal);
8694 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
8695 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
8696 std::swap(TVal, FVal);
8697 std::swap(CTVal, CFVal);
8698 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
8699 } else if (TVal.getOpcode() == ISD::XOR) {
8700 // If TVal is a NOT we want to swap TVal and FVal so that we can match
8701 // with a CSINV rather than a CSEL.
8702 if (isAllOnesConstant(TVal.getOperand(1))) {
8703 std::swap(TVal, FVal);
8704 std::swap(CTVal, CFVal);
8705 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
8707 } else if (TVal.getOpcode() == ISD::SUB) {
8708 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
8709 // that we can match with a CSNEG rather than a CSEL.
8710 if (isNullConstant(TVal.getOperand(0))) {
8711 std::swap(TVal, FVal);
8712 std::swap(CTVal, CFVal);
8713 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
8715 } else if (CTVal && CFVal) {
8716 const int64_t TrueVal = CTVal->getSExtValue();
8717 const int64_t FalseVal = CFVal->getSExtValue();
8718 bool Swap = false;
8720 // If both TVal and FVal are constants, see if FVal is the
8721 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
8722 // instead of a CSEL in that case.
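// Illustrative constant pairs (TVal, FVal): (C, ~C) allows CSINV, (C, -C)
// allows CSNEG, and values that differ by one, e.g. (5, 4), allow CSINC;
// in each case FVal is dropped below (after a possible swap) and recomputed
// from TVal by the instruction itself.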
8723 if (TrueVal == ~FalseVal) {
8724 Opcode = AArch64ISD::CSINV;
8725 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
8726 TrueVal == -FalseVal) {
8727 Opcode = AArch64ISD::CSNEG;
8728 } else if (TVal.getValueType() == MVT::i32) {
8729 // If our operands are only 32-bit wide, make sure we use 32-bit
8730 // arithmetic for the check whether we can use CSINC. This ensures that
8731 // the addition in the check will wrap around properly in case there is
8732 // an overflow (which would not be the case if we do the check with
8733 // 64-bit arithmetic).
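// For example, with i32 constants INT32_MAX and INT32_MIN the two values
// differ by one only under 32-bit wraparound (0x7fffffff + 1 == 0x80000000),
// a CSINC opportunity that a 64-bit check would miss.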
8734 const uint32_t TrueVal32 = CTVal->getZExtValue();
8735 const uint32_t FalseVal32 = CFVal->getZExtValue();
8737 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
8738 Opcode = AArch64ISD::CSINC;
8740 if (TrueVal32 > FalseVal32) {
8741 Swap = true;
8744 } else {
8745 // 64-bit check whether we can use CSINC.
8746 const uint64_t TrueVal64 = TrueVal;
8747 const uint64_t FalseVal64 = FalseVal;
8749 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
8750 Opcode = AArch64ISD::CSINC;
8752 if (TrueVal > FalseVal) {
8753 Swap = true;
8758 // Swap TVal and FVal if necessary.
8759 if (Swap) {
8760 std::swap(TVal, FVal);
8761 std::swap(CTVal, CFVal);
8762 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
8765 if (Opcode != AArch64ISD::CSEL) {
8766 // Drop FVal since we can get its value by simply inverting/negating
8767 // TVal.
8768 FVal = TVal;
8772 // Avoid materializing a constant when possible by reusing a known value in
8773 // a register. However, don't perform this optimization if the known value
8774 // is one, zero or negative one in the case of a CSEL. We can always
8775 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
8776 // FVal, respectively.
8777 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
8778 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
8779 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
8780 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
8781 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
8782 // "a != C ? x : a" to avoid materializing C.
8783 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
8784 TVal = LHS;
8785 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
8786 FVal = LHS;
8787 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
8788 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
8789 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
8790 // avoid materializing C.
8791 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
8792 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
8793 Opcode = AArch64ISD::CSINV;
8794 TVal = LHS;
8795 FVal = DAG.getConstant(0, dl, FVal.getValueType());
8799 SDValue CCVal;
8800 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
8801 EVT VT = TVal.getValueType();
8802 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
8805 // Now we know we're dealing with FP values.
8806 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
8807 LHS.getValueType() == MVT::f64);
8808 assert(LHS.getValueType() == RHS.getValueType());
8809 EVT VT = TVal.getValueType();
8810 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
8812 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
8813 // clean. Some of them require two CSELs to implement.
8814 AArch64CC::CondCode CC1, CC2;
8815 changeFPCCToAArch64CC(CC, CC1, CC2);
8817 if (DAG.getTarget().Options.UnsafeFPMath) {
8818 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
8819 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
8820 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
8821 if (RHSVal && RHSVal->isZero()) {
8822 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
8823 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
8825 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
8826 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
8827 TVal = LHS;
8828 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
8829 CFVal && CFVal->isZero() &&
8830 FVal.getValueType() == LHS.getValueType())
8831 FVal = LHS;
8835 // Emit first, and possibly only, CSEL.
8836 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
8837 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
8839 // If we need a second CSEL, emit it, using the output of the first as the
8840 // RHS. We're effectively OR'ing the two CC's together.
8841 if (CC2 != AArch64CC::AL) {
8842 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
8843 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
8846 // Otherwise, return the output of the first CSEL.
8847 return CS1;
8850 SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
8851 SelectionDAG &DAG) const {
8852 EVT Ty = Op.getValueType();
8853 auto Idx = Op.getConstantOperandAPInt(2);
8854 int64_t IdxVal = Idx.getSExtValue();
8855 assert(Ty.isScalableVector() &&
8856 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
8858 // We can use the splice instruction for certain index values where we are
8859 // able to efficiently generate the correct predicate. The index will be
8860 // inverted and used directly as the input to the ptrue instruction, i.e.
8861 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
8862 // splice predicate. However, we can only do this if we can guarantee that
8863 // there are enough elements in the vector, hence we check the index <= min
8864 // number of elements.
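// For example (illustrative), for Idx == -2 we emit PTRUE with pattern VL2,
// reverse it so that only the last two lanes are active, and the SPLICE then
// produces the last two elements of Op0 followed by the leading elements of
// Op1.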
8865 Optional<unsigned> PredPattern;
8866 if (Ty.isScalableVector() && IdxVal < 0 &&
8867 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
8868 None) {
8869 SDLoc DL(Op);
8871 // Create a predicate where all but the last -IdxVal elements are false.
8872 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
8873 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
8874 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
8876 // Now splice the two inputs together using the predicate.
8877 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
8878 Op.getOperand(1));
8881 // This will select to an EXT instruction, which has a maximum immediate
8882 // value of 255, hence 2048 bits is the maximum value we can lower.
8883 if (IdxVal >= 0 &&
8884 IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
8885 return Op;
8887 return SDValue();
8890 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
8891 SelectionDAG &DAG) const {
8892 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
8893 SDValue LHS = Op.getOperand(0);
8894 SDValue RHS = Op.getOperand(1);
8895 SDValue TVal = Op.getOperand(2);
8896 SDValue FVal = Op.getOperand(3);
8897 SDLoc DL(Op);
8898 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
8901 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
8902 SelectionDAG &DAG) const {
8903 SDValue CCVal = Op->getOperand(0);
8904 SDValue TVal = Op->getOperand(1);
8905 SDValue FVal = Op->getOperand(2);
8906 SDLoc DL(Op);
8908 EVT Ty = Op.getValueType();
8909 if (Ty.isScalableVector()) {
8910 SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
8911 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
8912 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
8913 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
8916 if (useSVEForFixedLengthVectorVT(Ty)) {
8917 // FIXME: Ideally this would be the same as above using i1 types, however
8918 // for the moment we can't deal with fixed i1 vector types properly, so
8919 // instead extend the predicate to a result type sized integer vector.
8920 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
8921 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
8922 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
8923 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
8924 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
8927 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
8928 // instruction.
8929 if (ISD::isOverflowIntrOpRes(CCVal)) {
8930 // Only lower legal XALUO ops.
8931 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
8932 return SDValue();
8934 AArch64CC::CondCode OFCC;
8935 SDValue Value, Overflow;
8936 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
8937 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
8939 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
8940 CCVal, Overflow);
8943 // Lower it the same way as we would lower a SELECT_CC node.
8944 ISD::CondCode CC;
8945 SDValue LHS, RHS;
8946 if (CCVal.getOpcode() == ISD::SETCC) {
8947 LHS = CCVal.getOperand(0);
8948 RHS = CCVal.getOperand(1);
8949 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
8950 } else {
8951 LHS = CCVal;
8952 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
8953 CC = ISD::SETNE;
8956 // If we are lowering an f16 and we do not have full fp16, convert to an f32
8957 // in order to use FCSELSrrr.
8958 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
8959 TVal = SDValue(
8960 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
8961 DAG.getUNDEF(MVT::f32), TVal,
8962 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
8964 FVal = SDValue(
8965 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
8966 DAG.getUNDEF(MVT::f32), FVal,
8967 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
8971 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
8973 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
8974 Res = SDValue(
8975 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, Ty, Res,
8976 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
8980 return Res;
8983 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
8984 SelectionDAG &DAG) const {
8985 // Jump table entries are PC-relative offsets. No additional tweaking
8986 // is necessary here. Just get the address of the jump table.
8987 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
8989 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8990 !Subtarget->isTargetMachO()) {
8991 return getAddrLarge(JT, DAG);
8992 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8993 return getAddrTiny(JT, DAG);
8995 return getAddr(JT, DAG);
8998 SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
8999 SelectionDAG &DAG) const {
9000 // Jump table entries are PC-relative offsets. No additional tweaking
9001 // is necessary here. Just get the address of the jump table.
9002 SDLoc DL(Op);
9003 SDValue JT = Op.getOperand(1);
9004 SDValue Entry = Op.getOperand(2);
9005 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
9007 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9008 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
9010 SDNode *Dest =
9011 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
9012 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
9013 return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
9014 SDValue(Dest, 0));
9017 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
9018 SelectionDAG &DAG) const {
9019 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
9021 if (getTargetMachine().getCodeModel() == CodeModel::Large) {
9022 // Use the GOT for the large code model on iOS.
9023 if (Subtarget->isTargetMachO()) {
9024 return getGOT(CP, DAG);
9026 return getAddrLarge(CP, DAG);
9027 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9028 return getAddrTiny(CP, DAG);
9029 } else {
9030 return getAddr(CP, DAG);
9034 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
9035 SelectionDAG &DAG) const {
9036 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
9037 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9038 !Subtarget->isTargetMachO()) {
9039 return getAddrLarge(BA, DAG);
9040 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9041 return getAddrTiny(BA, DAG);
9043 return getAddr(BA, DAG);
9046 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
9047 SelectionDAG &DAG) const {
9048 AArch64FunctionInfo *FuncInfo =
9049 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9051 SDLoc DL(Op);
9052 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
9053 getPointerTy(DAG.getDataLayout()));
9054 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
9055 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9056 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
9057 MachinePointerInfo(SV));
9060 SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
9061 SelectionDAG &DAG) const {
9062 MachineFunction &MF = DAG.getMachineFunction();
9063 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9065 SDLoc DL(Op);
9066 SDValue FR;
9067 if (Subtarget->isWindowsArm64EC()) {
9068 // With the Arm64EC ABI, we compute the address of the varargs save area
9069 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
9070 // but calls from an entry thunk can pass in a different address.
9071 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
9072 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
9073 uint64_t StackOffset;
9074 if (FuncInfo->getVarArgsGPRSize() > 0)
9075 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
9076 else
9077 StackOffset = FuncInfo->getVarArgsStackOffset();
9078 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
9079 DAG.getConstant(StackOffset, DL, MVT::i64));
9080 } else {
9081 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
9082 ? FuncInfo->getVarArgsGPRIndex()
9083 : FuncInfo->getVarArgsStackIndex(),
9084 getPointerTy(DAG.getDataLayout()));
9086 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9087 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
9088 MachinePointerInfo(SV));
9091 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
9092 SelectionDAG &DAG) const {
9093 // The layout of the va_list struct is specified in the AArch64 Procedure Call
9094 // Standard, section B.3.
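// Roughly, the structure being initialized below is:
//   struct va_list {
//     void *__stack;   // offset 0
//     void *__gr_top;  // offset 8  (4 on ILP32)
//     void *__vr_top;  // offset 16 (8 on ILP32)
//     int   __gr_offs; // offset 24 (12 on ILP32)
//     int   __vr_offs; // offset 28 (16 on ILP32)
//   };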
9095 MachineFunction &MF = DAG.getMachineFunction();
9096 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9097 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
9098 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
9099 auto PtrVT = getPointerTy(DAG.getDataLayout());
9100 SDLoc DL(Op);
9102 SDValue Chain = Op.getOperand(0);
9103 SDValue VAList = Op.getOperand(1);
9104 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9105 SmallVector<SDValue, 4> MemOps;
9107 // void *__stack at offset 0
9108 unsigned Offset = 0;
9109 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
9110 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
9111 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
9112 MachinePointerInfo(SV), Align(PtrSize)));
9114 // void *__gr_top at offset 8 (4 on ILP32)
9115 Offset += PtrSize;
9116 int GPRSize = FuncInfo->getVarArgsGPRSize();
9117 if (GPRSize > 0) {
9118 SDValue GRTop, GRTopAddr;
9120 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9121 DAG.getConstant(Offset, DL, PtrVT));
9123 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
9124 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
9125 DAG.getConstant(GPRSize, DL, PtrVT));
9126 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
9128 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
9129 MachinePointerInfo(SV, Offset),
9130 Align(PtrSize)));
9133 // void *__vr_top at offset 16 (8 on ILP32)
9134 Offset += PtrSize;
9135 int FPRSize = FuncInfo->getVarArgsFPRSize();
9136 if (FPRSize > 0) {
9137 SDValue VRTop, VRTopAddr;
9138 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9139 DAG.getConstant(Offset, DL, PtrVT));
9141 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
9142 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
9143 DAG.getConstant(FPRSize, DL, PtrVT));
9144 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
9146 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
9147 MachinePointerInfo(SV, Offset),
9148 Align(PtrSize)));
9151 // int __gr_offs at offset 24 (12 on ILP32)
9152 Offset += PtrSize;
9153 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9154 DAG.getConstant(Offset, DL, PtrVT));
9155 MemOps.push_back(
9156 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
9157 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
9159 // int __vr_offs at offset 28 (16 on ILP32)
9160 Offset += 4;
9161 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9162 DAG.getConstant(Offset, DL, PtrVT));
9163 MemOps.push_back(
9164 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
9165 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
9167 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
9170 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
9171 SelectionDAG &DAG) const {
9172 MachineFunction &MF = DAG.getMachineFunction();
9174 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
9175 return LowerWin64_VASTART(Op, DAG);
9176 else if (Subtarget->isTargetDarwin())
9177 return LowerDarwin_VASTART(Op, DAG);
9178 else
9179 return LowerAAPCS_VASTART(Op, DAG);
9182 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
9183 SelectionDAG &DAG) const {
9184 // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
9185 // pointer.
9186 SDLoc DL(Op);
9187 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
9188 unsigned VaListSize =
9189 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
9190 ? PtrSize
9191 : Subtarget->isTargetILP32() ? 20 : 32;
9192 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
9193 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
9195 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
9196 DAG.getConstant(VaListSize, DL, MVT::i32),
9197 Align(PtrSize), false, false, false,
9198 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
9201 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
9202 assert(Subtarget->isTargetDarwin() &&
9203 "automatic va_arg instruction only works on Darwin");
9205 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9206 EVT VT = Op.getValueType();
9207 SDLoc DL(Op);
9208 SDValue Chain = Op.getOperand(0);
9209 SDValue Addr = Op.getOperand(1);
9210 MaybeAlign Align(Op.getConstantOperandVal(3));
9211 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
9212 auto PtrVT = getPointerTy(DAG.getDataLayout());
9213 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
9214 SDValue VAList =
9215 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
9216 Chain = VAList.getValue(1);
9217 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
9219 if (VT.isScalableVector())
9220 report_fatal_error("Passing SVE types to variadic functions is "
9221 "currently not supported");
9223 if (Align && *Align > MinSlotSize) {
9224 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9225 DAG.getConstant(Align->value() - 1, DL, PtrVT));
9226 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
9227 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
9230 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
9231 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
9233 // Scalar integer and FP values smaller than 64 bits are implicitly extended
9234 // up to 64 bits. At the very least, we have to increase the striding of the
9235 // vaargs list to match this, and for FP values we need to introduce
9236 // FP_ROUND nodes as well.
9237 if (VT.isInteger() && !VT.isVector())
9238 ArgSize = std::max(ArgSize, MinSlotSize);
9239 bool NeedFPTrunc = false;
9240 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
9241 ArgSize = 8;
9242 NeedFPTrunc = true;
9245 // Increment the pointer, VAList, to the next vaarg
9246 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9247 DAG.getConstant(ArgSize, DL, PtrVT));
9248 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
9250 // Store the incremented VAList to the legalized pointer
9251 SDValue APStore =
9252 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
9254 // Load the actual argument out of the pointer VAList
9255 if (NeedFPTrunc) {
9256 // Load the value as an f64.
9257 SDValue WideFP =
9258 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
9259 // Round the value down to an f32.
9260 SDValue NarrowFP =
9261 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
9262 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
9263 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
9264 // Merge the rounded value with the chain output of the load.
9265 return DAG.getMergeValues(Ops, DL);
9268 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
9271 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
9272 SelectionDAG &DAG) const {
9273 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9274 MFI.setFrameAddressIsTaken(true);
9276 EVT VT = Op.getValueType();
9277 SDLoc DL(Op);
9278 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9279 SDValue FrameAddr =
9280 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
9281 while (Depth--)
9282 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
9283 MachinePointerInfo());
9285 if (Subtarget->isTargetILP32())
9286 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
9287 DAG.getValueType(VT));
9289 return FrameAddr;
9292 SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
9293 SelectionDAG &DAG) const {
9294 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9296 EVT VT = getPointerTy(DAG.getDataLayout());
9297 SDLoc DL(Op);
9298 int FI = MFI.CreateFixedObject(4, 0, false);
9299 return DAG.getFrameIndex(FI, VT);
9302 #define GET_REGISTER_MATCHER
9303 #include "AArch64GenAsmMatcher.inc"
9305 // FIXME? Maybe this could be a TableGen attribute on some registers and
9306 // this table could be generated automatically from RegInfo.
9307 Register AArch64TargetLowering::
9308 getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
9309 Register Reg = MatchRegisterName(RegName);
9310 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
9311 const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
9312 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
9313 if (!Subtarget->isXRegisterReserved(DwarfRegNum))
9314 Reg = 0;
9316 if (Reg)
9317 return Reg;
9318 report_fatal_error(Twine("Invalid register name \""
9319 + StringRef(RegName) + "\"."));
9322 SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
9323 SelectionDAG &DAG) const {
9324 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
9326 EVT VT = Op.getValueType();
9327 SDLoc DL(Op);
9329 SDValue FrameAddr =
9330 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
9331 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
9333 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
9336 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
9337 SelectionDAG &DAG) const {
9338 MachineFunction &MF = DAG.getMachineFunction();
9339 MachineFrameInfo &MFI = MF.getFrameInfo();
9340 MFI.setReturnAddressIsTaken(true);
9342 EVT VT = Op.getValueType();
9343 SDLoc DL(Op);
9344 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9345 SDValue ReturnAddress;
9346 if (Depth) {
9347 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
9348 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
9349 ReturnAddress = DAG.getLoad(
9350 VT, DL, DAG.getEntryNode(),
9351 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
9352 } else {
9353 // Return LR, which contains the return address. Mark it an implicit
9354 // live-in.
9355 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
9356 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
9359 // The XPACLRI instruction assembles to a hint-space instruction before
9360 // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A
9361 // architecture. On Armv8.3-A and onwards XPACI is available, so use
9362 // that instead.
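// Illustratively, the pre-PAuth path below ends up emitting "hint #7" (the
// XPACLRI encoding) operating on LR, whereas the PAuth path emits an
// "xpaci" on the register holding the return address.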
9363 SDNode *St;
9364 if (Subtarget->hasPAuth()) {
9365 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
9366 } else {
9367 // XPACLRI operates on LR therefore we must move the operand accordingly.
9368 SDValue Chain =
9369 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
9370 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
9372 return SDValue(St, 0);
9375 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
9376 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
9377 SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
9378 SelectionDAG &DAG) const {
9379 SDValue Lo, Hi;
9380 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
9381 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
9384 bool AArch64TargetLowering::isOffsetFoldingLegal(
9385 const GlobalAddressSDNode *GA) const {
9386 // Offsets are folded in the DAG combine rather than here so that we can
9387 // intelligently choose an offset based on the uses.
9388 return false;
9391 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
9392 bool OptForSize) const {
9393 bool IsLegal = false;
9394 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
9395 // and for the 16-bit case when the target has full fp16 support.
9396 // FIXME: We should be able to handle f128 as well with a clever lowering.
9397 const APInt ImmInt = Imm.bitcastToAPInt();
9398 if (VT == MVT::f64)
9399 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
9400 else if (VT == MVT::f32)
9401 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
9402 else if (VT == MVT::f16 && Subtarget->hasFullFP16())
9403 IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
9404 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
9405 // generate that fmov.
9407 // If we cannot materialize the value in the fmov immediate field, check if
9408 // it can be encoded as the immediate operand of a logical instruction.
9409 // The immediate value will be created with either MOVZ, MOVN, or ORR.
9410 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
9411 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
9412 // however the mov+fmov sequence is always better because of the reduced
9413 // cache pressure. The timings are still the same if you consider
9414 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
9415 // movw+movk is fused). So we limit the sequence to at most 2 instructions.
9416 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
9417 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
9418 Insn);
9419 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
9420 IsLegal = Insn.size() <= Limit;
9423 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
9424 << " imm value: "; Imm.dump(););
9425 return IsLegal;
9428 //===----------------------------------------------------------------------===//
9429 // AArch64 Optimization Hooks
9430 //===----------------------------------------------------------------------===//
9432 static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
9433 SDValue Operand, SelectionDAG &DAG,
9434 int &ExtraSteps) {
9435 EVT VT = Operand.getValueType();
9436 if ((ST->hasNEON() &&
9437 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
9438 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
9439 VT == MVT::v4f32)) ||
9440 (ST->hasSVE() &&
9441 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
9442 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
9443 // For the reciprocal estimates, convergence is quadratic, so the number
9444 // of digits is doubled after each iteration. In ARMv8, the accuracy of
9445 // the initial estimate is 2^-8. Thus the number of extra steps to refine
9446 // the result for float (23 mantissa bits) is 2 and for double (52
9447 // mantissa bits) is 3.
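// e.g. starting from ~8 correct bits: 8 -> 16 -> 32 covers float's 23
// mantissa bits after 2 steps, and 8 -> 16 -> 32 -> 64 covers double's 52
// mantissa bits after 3 steps.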
9448 ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
9450 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
9453 return SDValue();
9456 SDValue
9457 AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
9458 const DenormalMode &Mode) const {
9459 SDLoc DL(Op);
9460 EVT VT = Op.getValueType();
9461 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
9462 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
9463 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
9466 SDValue
9467 AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
9468 SelectionDAG &DAG) const {
9469 return Op;
9472 SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
9473 SelectionDAG &DAG, int Enabled,
9474 int &ExtraSteps,
9475 bool &UseOneConst,
9476 bool Reciprocal) const {
9477 if (Enabled == ReciprocalEstimate::Enabled ||
9478 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
9479 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
9480 DAG, ExtraSteps)) {
9481 SDLoc DL(Operand);
9482 EVT VT = Operand.getValueType();
9484 SDNodeFlags Flags;
9485 Flags.setAllowReassociation(true);
9487 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
9488 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
9489 for (int i = ExtraSteps; i > 0; --i) {
9490 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
9491 Flags);
9492 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
9493 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
9495 if (!Reciprocal)
9496 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
9498 ExtraSteps = 0;
9499 return Estimate;
9502 return SDValue();
9505 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
9506 SelectionDAG &DAG, int Enabled,
9507 int &ExtraSteps) const {
9508 if (Enabled == ReciprocalEstimate::Enabled)
9509 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
9510 DAG, ExtraSteps)) {
9511 SDLoc DL(Operand);
9512 EVT VT = Operand.getValueType();
9514 SDNodeFlags Flags;
9515 Flags.setAllowReassociation(true);
9517 // Newton reciprocal iteration: E * (2 - X * E)
9518 // AArch64 reciprocal iteration instruction: (2 - M * N)
9519 for (int i = ExtraSteps; i > 0; --i) {
9520 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
9521 Estimate, Flags);
9522 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
9525 ExtraSteps = 0;
9526 return Estimate;
9529 return SDValue();
9532 //===----------------------------------------------------------------------===//
9533 // AArch64 Inline Assembly Support
9534 //===----------------------------------------------------------------------===//
9536 // Table of Constraints
9537 // TODO: This is the current set of constraints supported by ARM for the
9538 // compiler; not all of them may make sense.
9540 // r - A general register
9541 // w - An FP/SIMD register of some size in the range v0-v31
9542 // x - An FP/SIMD register of some size in the range v0-v15
9543 // I - Constant that can be used with an ADD instruction
9544 // J - Constant that can be used with a SUB instruction
9545 // K - Constant that can be used with a 32-bit logical instruction
9546 // L - Constant that can be used with a 64-bit logical instruction
9547 // M - Constant that can be used as a 32-bit MOV immediate
9548 // N - Constant that can be used as a 64-bit MOV immediate
9549 // Q - A memory reference with base register and no offset
9550 // S - A symbolic address
9551 // Y - Floating point constant zero
9552 // Z - Integer constant zero
9554 // Note that general register operands will be output using their 64-bit x
9555 // register name, whatever the size of the variable, unless the asm operand
9556 // is prefixed by the %w modifier. Floating-point and SIMD register operands
9557 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
9558 // %q modifier.
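// A few illustrative uses of these constraints and modifiers (variable and
// register names are arbitrary):
//   asm("add %0, %1, %2"     : "=r"(res64) : "r"(a), "I"(4095)); // x registers
//   asm("add %w0, %w1, %2"   : "=r"(res32) : "r"(b), "I"(1));    // w registers
//   asm("fadd %s0, %s1, %s2" : "=w"(fres)  : "w"(x), "w"(y));    // s registers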
9559 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
9560 // At this point, we have to lower this constraint to something else, so we
9561 // lower it to an "r" or "w". However, by doing this we will force the result
9562 // to be in a register, while the X constraint is much more permissive.
9564 // Although we are correct (we are free to emit anything, without
9565 // constraints), we might break use cases that would expect us to be more
9566 // efficient and emit something else.
9567 if (!Subtarget->hasFPARMv8())
9568 return "r";
9570 if (ConstraintVT.isFloatingPoint())
9571 return "w";
9573 if (ConstraintVT.isVector() &&
9574 (ConstraintVT.getSizeInBits() == 64 ||
9575 ConstraintVT.getSizeInBits() == 128))
9576 return "w";
9578 return "r";
9581 enum PredicateConstraint {
9582 Upl,
9583 Upa,
9584 Invalid
9587 static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
9588 PredicateConstraint P = PredicateConstraint::Invalid;
9589 if (Constraint == "Upa")
9590 P = PredicateConstraint::Upa;
9591 if (Constraint == "Upl")
9592 P = PredicateConstraint::Upl;
9593 return P;
9596 /// getConstraintType - Given a constraint letter, return the type of
9597 /// constraint it is for this target.
9598 AArch64TargetLowering::ConstraintType
9599 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
9600 if (Constraint.size() == 1) {
9601 switch (Constraint[0]) {
9602 default:
9603 break;
9604 case 'x':
9605 case 'w':
9606 case 'y':
9607 return C_RegisterClass;
9608 // An address with a single base register. Due to the way we
9609 // currently handle addresses it is the same as 'r'.
9610 case 'Q':
9611 return C_Memory;
9612 case 'I':
9613 case 'J':
9614 case 'K':
9615 case 'L':
9616 case 'M':
9617 case 'N':
9618 case 'Y':
9619 case 'Z':
9620 return C_Immediate;
9621 case 'z':
9622 case 'S': // A symbolic address
9623 return C_Other;
9625 } else if (parsePredicateConstraint(Constraint) !=
9626 PredicateConstraint::Invalid)
9627 return C_RegisterClass;
9628 return TargetLowering::getConstraintType(Constraint);
9631 /// Examine constraint type and operand type and determine a weight value.
9632 /// This object must already have been set up with the operand type
9633 /// and the current alternative constraint selected.
9634 TargetLowering::ConstraintWeight
9635 AArch64TargetLowering::getSingleConstraintMatchWeight(
9636 AsmOperandInfo &info, const char *constraint) const {
9637 ConstraintWeight weight = CW_Invalid;
9638 Value *CallOperandVal = info.CallOperandVal;
9639 // If we don't have a value, we can't do a match,
9640 // but allow it at the lowest weight.
9641 if (!CallOperandVal)
9642 return CW_Default;
9643 Type *type = CallOperandVal->getType();
9644 // Look at the constraint type.
9645 switch (*constraint) {
9646 default:
9647 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
9648 break;
9649 case 'x':
9650 case 'w':
9651 case 'y':
9652 if (type->isFloatingPointTy() || type->isVectorTy())
9653 weight = CW_Register;
9654 break;
9655 case 'z':
9656 weight = CW_Constant;
9657 break;
9658 case 'U':
9659 if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid)
9660 weight = CW_Register;
9661 break;
9663 return weight;
9666 std::pair<unsigned, const TargetRegisterClass *>
9667 AArch64TargetLowering::getRegForInlineAsmConstraint(
9668 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
9669 if (Constraint.size() == 1) {
9670 switch (Constraint[0]) {
9671 case 'r':
9672 if (VT.isScalableVector())
9673 return std::make_pair(0U, nullptr);
9674 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
9675 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
9676 if (VT.getFixedSizeInBits() == 64)
9677 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
9678 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
9679 case 'w': {
9680 if (!Subtarget->hasFPARMv8())
9681 break;
9682 if (VT.isScalableVector()) {
9683 if (VT.getVectorElementType() != MVT::i1)
9684 return std::make_pair(0U, &AArch64::ZPRRegClass);
9685 return std::make_pair(0U, nullptr);
9687 uint64_t VTSize = VT.getFixedSizeInBits();
9688 if (VTSize == 16)
9689 return std::make_pair(0U, &AArch64::FPR16RegClass);
9690 if (VTSize == 32)
9691 return std::make_pair(0U, &AArch64::FPR32RegClass);
9692 if (VTSize == 64)
9693 return std::make_pair(0U, &AArch64::FPR64RegClass);
9694 if (VTSize == 128)
9695 return std::make_pair(0U, &AArch64::FPR128RegClass);
9696 break;
9698 // The instructions that this constraint is designed for can
9699 // only take 128-bit registers so just use that regclass.
9700 case 'x':
9701 if (!Subtarget->hasFPARMv8())
9702 break;
9703 if (VT.isScalableVector())
9704 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
9705 if (VT.getSizeInBits() == 128)
9706 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
9707 break;
9708 case 'y':
9709 if (!Subtarget->hasFPARMv8())
9710 break;
9711 if (VT.isScalableVector())
9712 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
9713 break;
9715 } else {
9716 PredicateConstraint PC = parsePredicateConstraint(Constraint);
9717 if (PC != PredicateConstraint::Invalid) {
9718 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
9719 return std::make_pair(0U, nullptr);
9720 bool restricted = (PC == PredicateConstraint::Upl);
9721 return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
9722 : std::make_pair(0U, &AArch64::PPRRegClass);
9725 if (StringRef("{cc}").equals_insensitive(Constraint))
9726 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
9728 // Use the default implementation in TargetLowering to convert the register
9729 // constraint into a member of a register class.
9730 std::pair<unsigned, const TargetRegisterClass *> Res;
9731 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9733 // Not found as a standard register?
9734 if (!Res.second) {
9735 unsigned Size = Constraint.size();
9736 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
9737 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
9738 int RegNo;
9739 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
9740 if (!Failed && RegNo >= 0 && RegNo <= 31) {
9741 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
9742 // By default we'll emit v0-v31 for this unless there's a modifier where
9743 // we'll emit the correct register as well.
9744 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
9745 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
9746 Res.second = &AArch64::FPR64RegClass;
9747 } else {
9748 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
9749 Res.second = &AArch64::FPR128RegClass;
9755 if (Res.second && !Subtarget->hasFPARMv8() &&
9756 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
9757 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
9758 return std::make_pair(0U, nullptr);
9760 return Res;
9763 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
9764 llvm::Type *Ty,
9765 bool AllowUnknown) const {
9766 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
9767 return EVT(MVT::i64x8);
9769 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
9772 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
9773 /// vector. If it is invalid, don't add anything to Ops.
9774 void AArch64TargetLowering::LowerAsmOperandForConstraint(
9775 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
9776 SelectionDAG &DAG) const {
9777 SDValue Result;
9779 // Currently only support length 1 constraints.
9780 if (Constraint.length() != 1)
9781 return;
9783 char ConstraintLetter = Constraint[0];
9784 switch (ConstraintLetter) {
9785 default:
9786 break;
9788 // This set of constraints deals with valid constants for various instructions.
9789 // Validate and return a target constant for them if we can.
9790 case 'z': {
9791 // 'z' maps to xzr or wzr so it needs an input of 0.
9792 if (!isNullConstant(Op))
9793 return;
9795 if (Op.getValueType() == MVT::i64)
9796 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
9797 else
9798 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
9799 break;
9801 case 'S': {
9802 // An absolute symbolic address or label reference.
9803 if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
9804 Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
9805 GA->getValueType(0));
9806 } else if (const BlockAddressSDNode *BA =
9807 dyn_cast<BlockAddressSDNode>(Op)) {
9808 Result =
9809 DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
9810 } else
9811 return;
9812 break;
9815 case 'I':
9816 case 'J':
9817 case 'K':
9818 case 'L':
9819 case 'M':
9820 case 'N':
9821 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
9822 if (!C)
9823 return;
9825 // Grab the value and do some validation.
9826 uint64_t CVal = C->getZExtValue();
9827 switch (ConstraintLetter) {
9828 // The I constraint applies only to simple ADD or SUB immediate operands:
9829 // i.e. 0 to 4095 with optional shift by 12
9830 // The J constraint applies only to ADD or SUB immediates that would be
9831 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
9832 // instruction [or vice versa], in other words -1 to -4095 with optional
9833 // left shift by 12.
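// For example, "add x0, x1, #4095" and "add x0, x1, #15, lsl #12" both use
// I-legal immediates, while an addition of -3 is matched by J and can be
// emitted as "sub x0, x1, #3".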
9834 case 'I':
9835 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
9836 break;
9837 return;
9838 case 'J': {
9839 uint64_t NVal = -C->getSExtValue();
9840 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
9841 CVal = C->getSExtValue();
9842 break;
9844 return;
9846 // The K and L constraints apply *only* to logical immediates, including
9847 // what used to be the MOVI alias for ORR (though the MOVI alias has now
9848 // been removed and MOV should be used). So these constraints have to
9849 // distinguish between bit patterns that are valid 32-bit or 64-bit
9850 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
9851 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
9852 // versa.
9853 case 'K':
9854 if (AArch64_AM::isLogicalImmediate(CVal, 32))
9855 break;
9856 return;
9857 case 'L':
9858 if (AArch64_AM::isLogicalImmediate(CVal, 64))
9859 break;
9860 return;
9861 // The M and N constraints are a superset of K and L respectively, for use
9862 // with the MOV (immediate) alias. As well as the logical immediates they
9863 // also match 32 or 64-bit immediates that can be loaded either using a
9864 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
9865 // (M) or 64-bit 0x1234000000000000 (N) etc.
9866 // As a note, some of this code is liberally stolen from the asm parser.
9867 case 'M': {
9868 if (!isUInt<32>(CVal))
9869 return;
9870 if (AArch64_AM::isLogicalImmediate(CVal, 32))
9871 break;
9872 if ((CVal & 0xFFFF) == CVal)
9873 break;
9874 if ((CVal & 0xFFFF0000ULL) == CVal)
9875 break;
9876 uint64_t NCVal = ~(uint32_t)CVal;
9877 if ((NCVal & 0xFFFFULL) == NCVal)
9878 break;
9879 if ((NCVal & 0xFFFF0000ULL) == NCVal)
9880 break;
9881 return;
9883 case 'N': {
9884 if (AArch64_AM::isLogicalImmediate(CVal, 64))
9885 break;
9886 if ((CVal & 0xFFFFULL) == CVal)
9887 break;
9888 if ((CVal & 0xFFFF0000ULL) == CVal)
9889 break;
9890 if ((CVal & 0xFFFF00000000ULL) == CVal)
9891 break;
9892 if ((CVal & 0xFFFF000000000000ULL) == CVal)
9893 break;
9894 uint64_t NCVal = ~CVal;
9895 if ((NCVal & 0xFFFFULL) == NCVal)
9896 break;
9897 if ((NCVal & 0xFFFF0000ULL) == NCVal)
9898 break;
9899 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
9900 break;
9901 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
9902 break;
9903 return;
9905 default:
9906 return;
9909 // All assembler immediates are 64-bit integers.
9910 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
9911 break;
9914 if (Result.getNode()) {
9915 Ops.push_back(Result);
9916 return;
9919 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
9922 //===----------------------------------------------------------------------===//
9923 // AArch64 Advanced SIMD Support
9924 //===----------------------------------------------------------------------===//
9926 /// WidenVector - Given a value in the V64 register class, produce the
9927 /// equivalent value in the V128 register class.
9928 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
9929 EVT VT = V64Reg.getValueType();
9930 unsigned NarrowSize = VT.getVectorNumElements();
9931 MVT EltTy = VT.getVectorElementType().getSimpleVT();
9932 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
9933 SDLoc DL(V64Reg);
9935 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
9936 V64Reg, DAG.getConstant(0, DL, MVT::i64));
9939 /// getExtFactor - Determine the adjustment factor for the position when
9940 /// generating an "extract from vector registers" instruction.
9941 static unsigned getExtFactor(SDValue &V) {
9942 EVT EltType = V.getValueType().getVectorElementType();
9943 return EltType.getSizeInBits() / 8;
9946 /// NarrowVector - Given a value in the V128 register class, produce the
9947 /// equivalent value in the V64 register class.
9948 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
9949 EVT VT = V128Reg.getValueType();
9950 unsigned WideSize = VT.getVectorNumElements();
9951 MVT EltTy = VT.getVectorElementType().getSimpleVT();
9952 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
9953 SDLoc DL(V128Reg);
9955 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
9958 // Gather data to see if the operation can be modelled as a
9959 // shuffle in combination with VEXTs.
9960 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
9961 SelectionDAG &DAG) const {
9962 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
9963 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
9964 SDLoc dl(Op);
9965 EVT VT = Op.getValueType();
9966 assert(!VT.isScalableVector() &&
9967 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
9968 unsigned NumElts = VT.getVectorNumElements();
9970 struct ShuffleSourceInfo {
9971 SDValue Vec;
9972 unsigned MinElt;
9973 unsigned MaxElt;
9975 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
9976 // be compatible with the shuffle we intend to construct. As a result
9977 // ShuffleVec will be some sliding window into the original Vec.
9978 SDValue ShuffleVec;
9980 // Code should guarantee that element i in Vec starts at element "WindowBase
9981 // + i * WindowScale" in ShuffleVec.
9982 int WindowBase;
9983 int WindowScale;
9985 ShuffleSourceInfo(SDValue Vec)
9986 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
9987 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
9989 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
9992 // First gather all vectors used as an immediate source for this BUILD_VECTOR
9993 // node.
9994 SmallVector<ShuffleSourceInfo, 2> Sources;
9995 for (unsigned i = 0; i < NumElts; ++i) {
9996 SDValue V = Op.getOperand(i);
9997 if (V.isUndef())
9998 continue;
9999 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10000 !isa<ConstantSDNode>(V.getOperand(1)) ||
10001 V.getOperand(0).getValueType().isScalableVector()) {
10002 LLVM_DEBUG(
10003 dbgs() << "Reshuffle failed: "
10004 "a shuffle can only come from building a vector from "
10005 "various elements of other fixed-width vectors, provided "
10006 "their indices are constant\n");
10007 return SDValue();
10010 // Add this element source to the list if it's not already there.
10011 SDValue SourceVec = V.getOperand(0);
10012 auto Source = find(Sources, SourceVec);
10013 if (Source == Sources.end())
10014 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
10016 // Update the minimum and maximum lane number seen.
10017 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
10018 Source->MinElt = std::min(Source->MinElt, EltNo);
10019 Source->MaxElt = std::max(Source->MaxElt, EltNo);
10022 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
10023 // better than moving to/from gpr registers for larger vectors.
10024 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
10025 // Construct a mask for the tbl. We may need to adjust the index for types
10026 // larger than i8.
10027 SmallVector<unsigned, 16> Mask;
10028 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
10029 for (unsigned I = 0; I < NumElts; ++I) {
10030 SDValue V = Op.getOperand(I);
10031 if (V.isUndef()) {
10032 for (unsigned OF = 0; OF < OutputFactor; OF++)
10033 Mask.push_back(-1);
10034 continue;
10036 // Set the Mask lanes adjusted for the size of the input and output
10037 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
10038 // output element, adjusted in their positions per input and output types.
10039 unsigned Lane = V.getConstantOperandVal(1);
10040 for (unsigned S = 0; S < Sources.size(); S++) {
10041 if (V.getOperand(0) == Sources[S].Vec) {
10042 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
10043 unsigned InputBase = 16 * S + Lane * InputSize / 8;
10044 for (unsigned OF = 0; OF < OutputFactor; OF++)
10045 Mask.push_back(InputBase + OF);
10046 break;
10051 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
10052 // v16i8, and the TBLMask
10053 SmallVector<SDValue, 16> TBLOperands;
10054 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
10055 ? Intrinsic::aarch64_neon_tbl3
10056 : Intrinsic::aarch64_neon_tbl4,
10057 dl, MVT::i32));
10058 for (unsigned i = 0; i < Sources.size(); i++) {
10059 SDValue Src = Sources[i].Vec;
10060 EVT SrcVT = Src.getValueType();
10061 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
10062 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
10063 "Expected a legally typed vector");
10064 if (SrcVT.is64BitVector())
10065 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
10066 DAG.getUNDEF(MVT::v8i8));
10067 TBLOperands.push_back(Src);
10070 SmallVector<SDValue, 16> TBLMask;
10071 for (unsigned i = 0; i < Mask.size(); i++)
10072 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
10073 assert((Mask.size() == 8 || Mask.size() == 16) &&
10074 "Expected a v8i8 or v16i8 Mask");
10075 TBLOperands.push_back(
10076 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
10078 SDValue Shuffle =
10079 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
10080 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
10081 return DAG.getBitcast(VT, Shuffle);
10084 if (Sources.size() > 2) {
10085 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
10086 << "sensible when at most two source vectors are "
10087 << "involved\n");
10088 return SDValue();
10091 // Find out the smallest element size among result and two sources, and use
10092 // it as element size to build the shuffle_vector.
10093 EVT SmallestEltTy = VT.getVectorElementType();
10094 for (auto &Source : Sources) {
10095 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
10096 if (SrcEltTy.bitsLT(SmallestEltTy)) {
10097 SmallestEltTy = SrcEltTy;
10100 unsigned ResMultiplier =
10101 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
10102 uint64_t VTSize = VT.getFixedSizeInBits();
10103 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
10104 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
10106 // If the source vector is too wide or too narrow, we may nevertheless be able
10107 // to construct a compatible shuffle either by concatenating it with UNDEF or
10108 // extracting a suitable range of elements.
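// For example (illustrative): a v2i32 source feeding a v4i32 result is padded
// with UNDEF via CONCAT_VECTORS, while a v8i16 source feeding a v4i16 result
// is narrowed to one of its halves (or to an EXT of the two halves) below.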
10109 for (auto &Src : Sources) {
10110 EVT SrcVT = Src.ShuffleVec.getValueType();
10112 TypeSize SrcVTSize = SrcVT.getSizeInBits();
10113 if (SrcVTSize == TypeSize::Fixed(VTSize))
10114 continue;
10116 // This stage of the search produces a source with the same element type as
10117 // the original, but with a total width matching the BUILD_VECTOR output.
10118 EVT EltVT = SrcVT.getVectorElementType();
10119 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
10120 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
10122 if (SrcVTSize.getFixedValue() < VTSize) {
10123 assert(2 * SrcVTSize == VTSize);
10124 // We can pad out the smaller vector for free, so if it's part of a
10125 // shuffle...
10126 Src.ShuffleVec =
10127 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
10128 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
10129 continue;
10132 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
10133 LLVM_DEBUG(
10134 dbgs() << "Reshuffle failed: result vector too small to extract\n");
10135 return SDValue();
10138 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
10139 LLVM_DEBUG(
10140 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
10141 return SDValue();
10144 if (Src.MinElt >= NumSrcElts) {
10145 // The extraction can just take the second half
10146 Src.ShuffleVec =
10147 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
10148 DAG.getConstant(NumSrcElts, dl, MVT::i64));
10149 Src.WindowBase = -NumSrcElts;
10150 } else if (Src.MaxElt < NumSrcElts) {
10151 // The extraction can just take the first half
10152 Src.ShuffleVec =
10153 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
10154 DAG.getConstant(0, dl, MVT::i64));
10155 } else {
10156 // An actual VEXT is needed
10157 SDValue VEXTSrc1 =
10158 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
10159 DAG.getConstant(0, dl, MVT::i64));
10160 SDValue VEXTSrc2 =
10161 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
10162 DAG.getConstant(NumSrcElts, dl, MVT::i64));
10163 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
10165 if (!SrcVT.is64BitVector()) {
10166 LLVM_DEBUG(
10167 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
10168 "for SVE vectors.");
10169 return SDValue();
10172 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
10173 VEXTSrc2,
10174 DAG.getConstant(Imm, dl, MVT::i32));
10175 Src.WindowBase = -Src.MinElt;
10179 // Another possible incompatibility occurs from the vector element types. We
10180 // can fix this by bitcasting the source vectors to the same type we intend
10181 // for the shuffle.
10182 for (auto &Src : Sources) {
10183 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
10184 if (SrcEltTy == SmallestEltTy)
10185 continue;
10186 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
10187 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
10188 Src.WindowScale =
10189 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
10190 Src.WindowBase *= Src.WindowScale;
10193 // Final check before we try to actually produce a shuffle.
10194 LLVM_DEBUG(for (auto Src
10195 : Sources)
10196 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
10198 // The stars all align, our next step is to produce the mask for the shuffle.
10199 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
10200 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
10201 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
10202 SDValue Entry = Op.getOperand(i);
10203 if (Entry.isUndef())
10204 continue;
10206 auto Src = find(Sources, Entry.getOperand(0));
10207 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
10209 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
10210 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
10211 // segment.
10212 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
10213 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
10214 VT.getScalarSizeInBits());
10215 int LanesDefined = BitsDefined / BitsPerShuffleLane;
10217 // This source is expected to fill ResMultiplier lanes of the final shuffle,
10218 // starting at the appropriate offset.
10219 int *LaneMask = &Mask[i * ResMultiplier];
10221 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
10222 ExtractBase += NumElts * (Src - Sources.begin());
10223 for (int j = 0; j < LanesDefined; ++j)
10224 LaneMask[j] = ExtractBase + j;
10227 // Final check before we try to produce nonsense...
10228 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
10229 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
10230 return SDValue();
10233 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
10234 for (unsigned i = 0; i < Sources.size(); ++i)
10235 ShuffleOps[i] = Sources[i].ShuffleVec;
10237 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
10238 ShuffleOps[1], Mask);
10239 SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
10241 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
10242 dbgs() << "Reshuffle, creating node: "; V.dump(););
10244 return V;
10247 // Check if an EXT instruction can handle the shuffle mask when the
10248 // vector sources of the shuffle are the same.
10249 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
10250 unsigned NumElts = VT.getVectorNumElements();
10252 // Assume that the first shuffle index is not UNDEF. Fail if it is.
10253 if (M[0] < 0)
10254 return false;
10256 Imm = M[0];
10258 // If this is a VEXT shuffle, the immediate value is the index of the first
10259 // element. The other shuffle indices must be the successive elements after
10260 // the first one.
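// For example (illustrative): on a v8i8 single-source shuffle, the mask
// <6, 7, 0, 1, 2, 3, 4, 5> yields Imm == 6, i.e. "ext v, v, #6" in bytes.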
10261 unsigned ExpectedElt = Imm;
10262 for (unsigned i = 1; i < NumElts; ++i) {
10263 // Increment the expected index. If it wraps around, just follow it
10264 // back to index zero and keep going.
10265 ++ExpectedElt;
10266 if (ExpectedElt == NumElts)
10267 ExpectedElt = 0;
10269 if (M[i] < 0)
10270 continue; // ignore UNDEF indices
10271 if (ExpectedElt != static_cast<unsigned>(M[i]))
10272 return false;
10275 return true;
10278 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
10279 // v4i32s. This is really a truncate, which we can construct out of (legal)
10280 // concats and truncate nodes.
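// An illustrative IR shape this matches (assuming %a..%d are v4i32 or v4i16):
//   buildvector (extractelement %a, 0) ... (extractelement %a, 3),
//               (extractelement %b, 0) ... (extractelement %d, 3) : v16i8
// i.e. each input contributes four consecutive bytes of the result.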
10281 static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
10282 if (V.getValueType() != MVT::v16i8)
10283 return SDValue();
10284 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
10286 for (unsigned X = 0; X < 4; X++) {
10287 // Check the first item in each group is an extract from lane 0 of a v4i32
10288 // or v4i16.
10289 SDValue BaseExt = V.getOperand(X * 4);
10290 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10291 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
10292 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
10293 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
10294 BaseExt.getConstantOperandVal(1) != 0)
10295 return SDValue();
10296 SDValue Base = BaseExt.getOperand(0);
10297 // And check the other items are extracts from the same vector.
10298 for (unsigned Y = 1; Y < 4; Y++) {
10299 SDValue Ext = V.getOperand(X * 4 + Y);
10300 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10301 Ext.getOperand(0) != Base ||
10302 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
10303 Ext.getConstantOperandVal(1) != Y)
10304 return SDValue();
10308 // Turn the buildvector into a series of truncates and concats, which will
10309 // become uzp1's. Any v4i32s we found get truncated to v4i16, which are
10310 // concatenated together to produce 2 v8i16. These are both truncated and
10311 // concatenated together.
10312 SDLoc DL(V);
10313 SDValue Trunc[4] = {
10314 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
10315 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
10316 for (SDValue &V : Trunc)
10317 if (V.getValueType() == MVT::v4i32)
10318 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
10319 SDValue Concat0 =
10320 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
10321 SDValue Concat1 =
10322 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
10323 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
10324 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
10325 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
10328 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
10329 /// element width than the vector lane type. If that is the case, the function
10330 /// returns true and writes the value of the DUP instruction lane operand into
10331 /// DupLaneOp.
10332 static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
10333 unsigned &DupLaneOp) {
10334 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
10335 "Only possible block sizes for wide DUP are: 16, 32, 64");
10337 if (BlockSize <= VT.getScalarSizeInBits())
10338 return false;
10339 if (BlockSize % VT.getScalarSizeInBits() != 0)
10340 return false;
10341 if (VT.getSizeInBits() % BlockSize != 0)
10342 return false;
10344 size_t SingleVecNumElements = VT.getVectorNumElements();
10345 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
10346 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
10348 // We are looking for masks like
10349 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
10350 // might be replaced by 'undefined'. BlockElts will eventually contain the
10351 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
10352 // for the above examples).
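// For example (illustrative): for v4i32 with BlockSize == 64, the mask
// [2, 3, 2, 3] is accepted with DupLaneOp == 1, so the caller can emit a
// 64-bit DUPLANE of lane 1.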
10353 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
10354 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
10355 for (size_t I = 0; I < NumEltsPerBlock; I++) {
10356 int Elt = M[BlockIndex * NumEltsPerBlock + I];
10357 if (Elt < 0)
10358 continue;
10359 // For now we don't support shuffles that use the second operand
10360 if ((unsigned)Elt >= SingleVecNumElements)
10361 return false;
10362 if (BlockElts[I] < 0)
10363 BlockElts[I] = Elt;
10364 else if (BlockElts[I] != Elt)
10365 return false;
10368 // We found a candidate block (possibly with some undefs). It must be a
10369 // sequence of consecutive integers starting with a value divisible by
10370 // NumEltsPerBlock, with some values possibly replaced by undefs.
10372 // Find first non-undef element
10373 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
10374 assert(FirstRealEltIter != BlockElts.end() &&
10375 "Shuffle with all-undefs must have been caught by previous cases, "
10376 "e.g. isSplat()");
10377 if (FirstRealEltIter == BlockElts.end()) {
10378 DupLaneOp = 0;
10379 return true;
10382 // Index of FirstRealElt in BlockElts
10383 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
10385 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
10386 return false;
10387 // BlockElts[0] must have the following value if it isn't undef:
10388 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
10390 // Check the first element
10391 if (Elt0 % NumEltsPerBlock != 0)
10392 return false;
10393 // Check that the sequence indeed consists of consecutive integers (modulo
10394 // undefs)
10395 for (size_t I = 0; I < NumEltsPerBlock; I++)
10396 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
10397 return false;
10399 DupLaneOp = Elt0 / NumEltsPerBlock;
10400 return true;
10403 // Check if an EXT instruction can handle the shuffle mask when the
10404 // vector sources of the shuffle are different.
10405 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
10406 unsigned &Imm) {
10407 // Look for the first non-undef element.
10408 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
10410 // Use APInt to handle overflow when calculating the expected element.
10411 unsigned NumElts = VT.getVectorNumElements();
10412 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
10413 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
10414 // The following shuffle indices must be the successive elements after the
10415 // first real element.
10416 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
10417 return Elt != ExpectedElt++ && Elt != -1;
10419 if (FoundWrongElt)
10420 return false;
10422 // The index of an EXT is the first element if it is not UNDEF.
10423 // Watch out for the beginning UNDEFs. The EXT index should be the expected
10424 // value of the first element. E.g.
10425 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
10426 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
10427 // ExpectedElt is the last mask index plus 1.
10428 Imm = ExpectedElt.getZExtValue();
10430 // There are two different cases that require reversing the input vectors.
10431 // For example, for vector <4 x i32> we have the following cases,
10432 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
10433 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
10434 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
10435 // to reverse two input vectors.
10436 if (Imm < NumElts)
10437 ReverseEXT = true;
10438 else
10439 Imm -= NumElts;
10441 return true;
10444 /// isREVMask - Check if a vector shuffle corresponds to a REV
10445 /// instruction with the specified blocksize. (The order of the elements
10446 /// within each block of the vector is reversed.)
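// For example (illustrative): on v8i8, BlockSize 16 matches the mask
// <1, 0, 3, 2, 5, 4, 7, 6> (REV16), and BlockSize 32 matches
// <3, 2, 1, 0, 7, 6, 5, 4> (REV32).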
10447 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
10448 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
10449 "Only possible block sizes for REV are: 16, 32, 64");
10451 unsigned EltSz = VT.getScalarSizeInBits();
10452 if (EltSz == 64)
10453 return false;
10455 unsigned NumElts = VT.getVectorNumElements();
10456 unsigned BlockElts = M[0] + 1;
10457 // If the first shuffle index is UNDEF, be optimistic.
10458 if (M[0] < 0)
10459 BlockElts = BlockSize / EltSz;
10461 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
10462 return false;
10464 for (unsigned i = 0; i < NumElts; ++i) {
10465 if (M[i] < 0)
10466 continue; // ignore UNDEF indices
10467 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
10468 return false;
10471 return true;
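// Check for a ZIP1/ZIP2 mask. For example (illustrative), on v4i32 the mask
// <0, 4, 1, 5> selects ZIP1 (WhichResult == 0) and <2, 6, 3, 7> selects ZIP2
// (WhichResult == 1); undef lanes are accepted anywhere.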
10474 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10475 unsigned NumElts = VT.getVectorNumElements();
10476 if (NumElts % 2 != 0)
10477 return false;
10478 WhichResult = (M[0] == 0 ? 0 : 1);
10479 unsigned Idx = WhichResult * NumElts / 2;
10480 for (unsigned i = 0; i != NumElts; i += 2) {
10481 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
10482 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
10483 return false;
10484 Idx += 1;
10487 return true;
10490 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10491 unsigned NumElts = VT.getVectorNumElements();
10492 WhichResult = (M[0] == 0 ? 0 : 1);
10493 for (unsigned i = 0; i != NumElts; ++i) {
10494 if (M[i] < 0)
10495 continue; // ignore UNDEF indices
10496 if ((unsigned)M[i] != 2 * i + WhichResult)
10497 return false;
10500 return true;
10503 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10504 unsigned NumElts = VT.getVectorNumElements();
10505 if (NumElts % 2 != 0)
10506 return false;
10507 WhichResult = (M[0] == 0 ? 0 : 1);
10508 for (unsigned i = 0; i < NumElts; i += 2) {
10509 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
10510 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
10511 return false;
10513 return true;
10516 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
10517 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
10518 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
10519 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10520 unsigned NumElts = VT.getVectorNumElements();
10521 if (NumElts % 2 != 0)
10522 return false;
10523 WhichResult = (M[0] == 0 ? 0 : 1);
10524 unsigned Idx = WhichResult * NumElts / 2;
10525 for (unsigned i = 0; i != NumElts; i += 2) {
10526 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
10527 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
10528 return false;
10529 Idx += 1;
10532 return true;
10535 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
10536 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
10537 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
10538 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10539 unsigned Half = VT.getVectorNumElements() / 2;
10540 WhichResult = (M[0] == 0 ? 0 : 1);
10541 for (unsigned j = 0; j != 2; ++j) {
10542 unsigned Idx = WhichResult;
10543 for (unsigned i = 0; i != Half; ++i) {
10544 int MIdx = M[i + j * Half];
10545 if (MIdx >= 0 && (unsigned)MIdx != Idx)
10546 return false;
10547 Idx += 2;
10551 return true;
10554 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
10555 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
10556 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
10557 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10558 unsigned NumElts = VT.getVectorNumElements();
10559 if (NumElts % 2 != 0)
10560 return false;
10561 WhichResult = (M[0] == 0 ? 0 : 1);
10562 for (unsigned i = 0; i < NumElts; i += 2) {
10563 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
10564 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
10565 return false;
10567 return true;
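// Check for an INS-style mask: an identity copy of one input with exactly one
// lane (the "Anomaly") replaced. For example (illustrative), on 4 elements the
// mask <0, 1, 6, 3> gives DstIsLeft == true and Anomaly == 2, i.e. insert
// element 2 of the RHS into lane 2 of the LHS.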
10570 static bool isINSMask(ArrayRef<int> M, int NumInputElements,
10571 bool &DstIsLeft, int &Anomaly) {
10572 if (M.size() != static_cast<size_t>(NumInputElements))
10573 return false;
10575 int NumLHSMatch = 0, NumRHSMatch = 0;
10576 int LastLHSMismatch = -1, LastRHSMismatch = -1;
10578 for (int i = 0; i < NumInputElements; ++i) {
10579 if (M[i] == -1) {
10580 ++NumLHSMatch;
10581 ++NumRHSMatch;
10582 continue;
10585 if (M[i] == i)
10586 ++NumLHSMatch;
10587 else
10588 LastLHSMismatch = i;
10590 if (M[i] == i + NumInputElements)
10591 ++NumRHSMatch;
10592 else
10593 LastRHSMismatch = i;
10596 if (NumLHSMatch == NumInputElements - 1) {
10597 DstIsLeft = true;
10598 Anomaly = LastLHSMismatch;
10599 return true;
10600 } else if (NumRHSMatch == NumInputElements - 1) {
10601 DstIsLeft = false;
10602 Anomaly = LastRHSMismatch;
10603 return true;
10606 return false;
10609 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
10610 if (VT.getSizeInBits() != 128)
10611 return false;
10613 unsigned NumElts = VT.getVectorNumElements();
10615 for (int I = 0, E = NumElts / 2; I != E; I++) {
10616 if (Mask[I] != I)
10617 return false;
10620 int Offset = NumElts / 2;
10621 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
10622 if (Mask[I] != I + SplitLHS * Offset)
10623 return false;
10626 return true;
10629 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
10630 SDLoc DL(Op);
10631 EVT VT = Op.getValueType();
10632 SDValue V0 = Op.getOperand(0);
10633 SDValue V1 = Op.getOperand(1);
10634 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
10636 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
10637 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
10638 return SDValue();
10640 bool SplitV0 = V0.getValueSizeInBits() == 128;
10642 if (!isConcatMask(Mask, VT, SplitV0))
10643 return SDValue();
10645 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
10646 if (SplitV0) {
10647 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
10648 DAG.getConstant(0, DL, MVT::i64));
10650 if (V1.getValueSizeInBits() == 128) {
10651 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
10652 DAG.getConstant(0, DL, MVT::i64));
10654 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
10657 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
10658 /// the specified operations to build the shuffle. ID is the perfect-shuffle
10659 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
10660 /// table entry and LHS/RHS are the immediate inputs for this stage of the
10661 /// shuffle.
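// Each perfect-shuffle ID packs four result lanes as base-9 digits (most
// significant digit first, 8 meaning undef). For example (illustrative),
// ID 102 == (1*9+2)*9+3 encodes <0, 1, 2, 3> (copy LHS) and ID 3382 encodes
// <4, 5, 6, 7> (copy RHS), which is what the OP_COPY handling below checks.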
10662 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
10663 SDValue V2, unsigned PFEntry, SDValue LHS,
10664 SDValue RHS, SelectionDAG &DAG,
10665 const SDLoc &dl) {
10666 unsigned OpNum = (PFEntry >> 26) & 0x0F;
10667 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
10668 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
10670 enum {
10671 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
10672 OP_VREV,
10673 OP_VDUP0,
10674 OP_VDUP1,
10675 OP_VDUP2,
10676 OP_VDUP3,
10677 OP_VEXT1,
10678 OP_VEXT2,
10679 OP_VEXT3,
10680 OP_VUZPL, // VUZP, left result
10681 OP_VUZPR, // VUZP, right result
10682 OP_VZIPL, // VZIP, left result
10683 OP_VZIPR, // VZIP, right result
10684 OP_VTRNL, // VTRN, left result
10685 OP_VTRNR, // VTRN, right result
10686 OP_MOVLANE // Move lane. RHSID is the lane to move into
10689 if (OpNum == OP_COPY) {
10690 if (LHSID == (1 * 9 + 2) * 9 + 3)
10691 return LHS;
10692 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
10693 return RHS;
10696 if (OpNum == OP_MOVLANE) {
10697 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
10698 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
10699 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
10700 Elt = 3 - Elt;
10701 while (Elt > 0) {
10702 ID /= 9;
10703 Elt--;
10705 return (ID % 9 == 8) ? -1 : ID % 9;
10708 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. The
10709 // lane to move from is taken from the PFID, which is always from the
10710 // original vectors (V1 or V2).
10711 SDValue OpLHS = GeneratePerfectShuffle(
10712 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
10713 EVT VT = OpLHS.getValueType();
10714 assert(RHSID < 8 && "Expected a lane index for RHSID!");
10715 unsigned ExtLane = 0;
10716 SDValue Input;
10718 // OP_MOVLANE shuffles are either D movs (if bit 0x4 is set) or S movs. D
10719 // movs convert into a higher type.
10720 if (RHSID & 0x4) {
10721 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
10722 if (MaskElt == -1)
10723 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
10724 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
10725 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
10726 Input = MaskElt < 2 ? V1 : V2;
10727 if (VT.getScalarSizeInBits() == 16) {
10728 Input = DAG.getBitcast(MVT::v2f32, Input);
10729 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
10730 } else {
10731 assert(VT.getScalarSizeInBits() == 32 &&
10732 "Expected 16 or 32 bit shuffle elemements");
10733 Input = DAG.getBitcast(MVT::v2f64, Input);
10734 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
10736 } else {
10737 int MaskElt = getPFIDLane(ID, RHSID);
10738 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
10739 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
10740 Input = MaskElt < 4 ? V1 : V2;
10741 // Be careful about creating illegal types. Use f16 instead of i16.
10742 if (VT == MVT::v4i16) {
10743 Input = DAG.getBitcast(MVT::v4f16, Input);
10744 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
10747 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
10748 Input.getValueType().getVectorElementType(),
10749 Input, DAG.getVectorIdxConstant(ExtLane, dl));
10750 SDValue Ins =
10751 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
10752 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
10753 return DAG.getBitcast(VT, Ins);
10756 SDValue OpLHS, OpRHS;
10757 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
10758 RHS, DAG, dl);
10759 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
10760 RHS, DAG, dl);
10761 EVT VT = OpLHS.getValueType();
10763 switch (OpNum) {
10764 default:
10765 llvm_unreachable("Unknown shuffle opcode!");
10766 case OP_VREV:
10767 // VREV divides the vector in half and swaps within the half.
10768 if (VT.getVectorElementType() == MVT::i32 ||
10769 VT.getVectorElementType() == MVT::f32)
10770 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
10771 // vrev <4 x i16> -> REV32
10772 if (VT.getVectorElementType() == MVT::i16 ||
10773 VT.getVectorElementType() == MVT::f16 ||
10774 VT.getVectorElementType() == MVT::bf16)
10775 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
10776 // vrev <4 x i8> -> REV16
10777 assert(VT.getVectorElementType() == MVT::i8);
10778 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
10779 case OP_VDUP0:
10780 case OP_VDUP1:
10781 case OP_VDUP2:
10782 case OP_VDUP3: {
10783 EVT EltTy = VT.getVectorElementType();
10784 unsigned Opcode;
10785 if (EltTy == MVT::i8)
10786 Opcode = AArch64ISD::DUPLANE8;
10787 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
10788 Opcode = AArch64ISD::DUPLANE16;
10789 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
10790 Opcode = AArch64ISD::DUPLANE32;
10791 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
10792 Opcode = AArch64ISD::DUPLANE64;
10793 else
10794 llvm_unreachable("Invalid vector element type?");
10796 if (VT.getSizeInBits() == 64)
10797 OpLHS = WidenVector(OpLHS, DAG);
10798 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
10799 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
10801 case OP_VEXT1:
10802 case OP_VEXT2:
10803 case OP_VEXT3: {
10804 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
10805 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
10806 DAG.getConstant(Imm, dl, MVT::i32));
10808 case OP_VUZPL:
10809 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
10810 OpRHS);
10811 case OP_VUZPR:
10812 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
10813 OpRHS);
10814 case OP_VZIPL:
10815 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
10816 OpRHS);
10817 case OP_VZIPR:
10818 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
10819 OpRHS);
10820 case OP_VTRNL:
10821 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
10822 OpRHS);
10823 case OP_VTRNR:
10824 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
10825 OpRHS);
10829 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
10830 SelectionDAG &DAG) {
10831 // Check to see if we can use the TBL instruction.
10832 SDValue V1 = Op.getOperand(0);
10833 SDValue V2 = Op.getOperand(1);
10834 SDLoc DL(Op);
10836 EVT EltVT = Op.getValueType().getVectorElementType();
10837 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
10839 bool Swap = false;
10840 if (V1.isUndef() || isZerosVector(V1.getNode())) {
10841 std::swap(V1, V2);
10842 Swap = true;
10845 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
10846 // out of range values with 0s. We do need to make sure that any out-of-range
10847 // values are really out-of-range for a v16i8 vector.
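// For example (illustrative): a v8i8 shuffle with mask <0, 1, 2, 3, 8, 8, 8, 8>
// and a zero/undef V2 becomes a tbl1 with byte indices
// { 0, 1, 2, 3, 255, 255, 255, 255 }, so the out-of-range lanes read as zero.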
10848 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
10849 MVT IndexVT = MVT::v8i8;
10850 unsigned IndexLen = 8;
10851 if (Op.getValueSizeInBits() == 128) {
10852 IndexVT = MVT::v16i8;
10853 IndexLen = 16;
10856 SmallVector<SDValue, 8> TBLMask;
10857 for (int Val : ShuffleMask) {
10858 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
10859 unsigned Offset = Byte + Val * BytesPerElt;
10860 if (Swap)
10861 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
10862 if (IsUndefOrZero && Offset >= IndexLen)
10863 Offset = 255;
10864 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
10868 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
10869 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
10871 SDValue Shuffle;
10872 if (IsUndefOrZero) {
10873 if (IndexLen == 8)
10874 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
10875 Shuffle = DAG.getNode(
10876 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
10877 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
10878 DAG.getBuildVector(IndexVT, DL,
10879 makeArrayRef(TBLMask.data(), IndexLen)));
10880 } else {
10881 if (IndexLen == 8) {
10882 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
10883 Shuffle = DAG.getNode(
10884 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
10885 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
10886 DAG.getBuildVector(IndexVT, DL,
10887 makeArrayRef(TBLMask.data(), IndexLen)));
10888 } else {
10889 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
10890 // cannot currently represent the register constraints on the input
10891 // table registers.
10892 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
10893 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
10894 // IndexLen));
10895 Shuffle = DAG.getNode(
10896 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
10897 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
10898 V2Cst, DAG.getBuildVector(IndexVT, DL,
10899 makeArrayRef(TBLMask.data(), IndexLen)));
10902 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
10905 static unsigned getDUPLANEOp(EVT EltType) {
10906 if (EltType == MVT::i8)
10907 return AArch64ISD::DUPLANE8;
10908 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
10909 return AArch64ISD::DUPLANE16;
10910 if (EltType == MVT::i32 || EltType == MVT::f32)
10911 return AArch64ISD::DUPLANE32;
10912 if (EltType == MVT::i64 || EltType == MVT::f64)
10913 return AArch64ISD::DUPLANE64;
10915 llvm_unreachable("Invalid vector element type?");
10918 static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
10919 unsigned Opcode, SelectionDAG &DAG) {
10920 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
10921 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
10922 // Match: dup (bitcast (extract_subv X, C)), LaneC
10923 if (BitCast.getOpcode() != ISD::BITCAST ||
10924 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
10925 return false;
10927 // The extract index must align in the destination type. That may not
10928 // happen if the bitcast is from a narrow to a wide type.
10929 SDValue Extract = BitCast.getOperand(0);
10930 unsigned ExtIdx = Extract.getConstantOperandVal(1);
10931 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
10932 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
10933 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
10934 if (ExtIdxInBits % CastedEltBitWidth != 0)
10935 return false;
10937 // Can't handle cases where vector size is not 128-bit
10938 if (!Extract.getOperand(0).getValueType().is128BitVector())
10939 return false;
10941 // Update the lane value by offsetting with the scaled extract index.
10942 LaneC += ExtIdxInBits / CastedEltBitWidth;
10944 // Determine the casted vector type of the wide vector input.
10945 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
10946 // Examples:
10947 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
10948 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
10949 unsigned SrcVecNumElts =
10950 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
10951 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
10952 SrcVecNumElts);
10953 return true;
10955 MVT CastVT;
10956 if (getScaledOffsetDup(V, Lane, CastVT)) {
10957 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
10958 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
10959 V.getOperand(0).getValueType().is128BitVector()) {
10960 // The lane is incremented by the index of the extract.
10961 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
10962 Lane += V.getConstantOperandVal(1);
10963 V = V.getOperand(0);
10964 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
10965 // The lane is decremented if we are splatting from the 2nd operand.
10966 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
10967 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
10968 Lane -= Idx * VT.getVectorNumElements() / 2;
10969 V = WidenVector(V.getOperand(Idx), DAG);
10970 } else if (VT.getSizeInBits() == 64) {
10971 // Widen the operand to 128-bit register with undef.
10972 V = WidenVector(V, DAG);
10974 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
10977 // Return true if we can get a new shuffle mask by checking the parameter mask
10978 // array to test whether every two adjacent mask values are consecutive and
10979 // start from an even number.
10980 static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
10981 SmallVectorImpl<int> &NewMask) {
10982 unsigned NumElts = VT.getVectorNumElements();
10983 if (NumElts % 2 != 0)
10984 return false;
10986 NewMask.clear();
10987 for (unsigned i = 0; i < NumElts; i += 2) {
10988 int M0 = M[i];
10989 int M1 = M[i + 1];
10991 // If both elements are undef, new mask is undef too.
10992 if (M0 == -1 && M1 == -1) {
10993 NewMask.push_back(-1);
10994 continue;
10997 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
10998 NewMask.push_back(M1 / 2);
10999 continue;
11002 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
11003 NewMask.push_back(M0 / 2);
11004 continue;
11007 NewMask.clear();
11008 return false;
11011 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
11012 return true;
11015 // Try to widen the element type to get a new mask value for a better
11016 // permutation sequence, so that we can use NEON shuffle instructions, such as
11017 // ZIP1/2, UZP1/2, TRN1/2, REV, INS, etc.
11018 // For example:
11019 // shufflevector <4 x i32> %a, <4 x i32> %b,
11020 // <4 x i32> <i32 6, i32 7, i32 2, i32 3>
11021 // is equivalent to:
11022 // shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
11023 // Finally, we can get:
11024 // mov v0.d[0], v1.d[1]
11025 static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
11026 SDLoc DL(Op);
11027 EVT VT = Op.getValueType();
11028 EVT ScalarVT = VT.getVectorElementType();
11029 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
11030 SDValue V0 = Op.getOperand(0);
11031 SDValue V1 = Op.getOperand(1);
11032 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
11034 // When combining adjacent elements, like two i16's -> i32, two i32's -> i64,
11035 // we need to make sure the wider element type is legal. Thus, ElementSize
11036 // should not be larger than 32 bits, and the i1 type should also be excluded.
11037 if (ElementSize > 32 || ElementSize == 1)
11038 return SDValue();
11040 SmallVector<int, 8> NewMask;
11041 if (isWideTypeMask(Mask, VT, NewMask)) {
11042 MVT NewEltVT = VT.isFloatingPoint()
11043 ? MVT::getFloatingPointVT(ElementSize * 2)
11044 : MVT::getIntegerVT(ElementSize * 2);
11045 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
11046 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
11047 V0 = DAG.getBitcast(NewVT, V0);
11048 V1 = DAG.getBitcast(NewVT, V1);
11049 return DAG.getBitcast(VT,
11050 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
11054 return SDValue();
11057 // Try to fold shuffle (tbl2, tbl2) into a single tbl4.
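// Roughly: shuffle(tbl2(A, B, M1), tbl2(C, D, M2), mask) becomes
// tbl4(A, B, C, D, M), where result lanes taken from the first tbl2 copy the
// corresponding byte of M1 and lanes taken from the second use M2's byte
// plus 32, since C and D are the third and fourth tbl4 table registers.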
11058 static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
11059 ArrayRef<int> ShuffleMask,
11060 SelectionDAG &DAG) {
11061 SDValue Tbl1 = Op->getOperand(0);
11062 SDValue Tbl2 = Op->getOperand(1);
11063 SDLoc dl(Op);
11064 SDValue Tbl2ID =
11065 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
11067 EVT VT = Op.getValueType();
11068 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
11069 Tbl1->getOperand(0) != Tbl2ID ||
11070 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
11071 Tbl2->getOperand(0) != Tbl2ID)
11072 return SDValue();
11074 if (Tbl1->getValueType(0) != MVT::v16i8 ||
11075 Tbl2->getValueType(0) != MVT::v16i8)
11076 return SDValue();
11078 SDValue Mask1 = Tbl1->getOperand(3);
11079 SDValue Mask2 = Tbl2->getOperand(3);
11080 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
11081 for (unsigned I = 0; I < 16; I++) {
11082 if (ShuffleMask[I] < 16)
11083 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
11084 else {
11085 auto *C =
11086 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
11087 if (!C)
11088 return SDValue();
11089 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
11093 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
11094 SDValue ID =
11095 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
11097 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
11098 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
11099 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
11102 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
11103 SelectionDAG &DAG) const {
11104 SDLoc dl(Op);
11105 EVT VT = Op.getValueType();
11107 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
11109 if (useSVEForFixedLengthVectorVT(VT,
11110 Subtarget->forceStreamingCompatibleSVE()))
11111 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
11113 // Convert shuffles that are directly supported on NEON to target-specific
11114 // DAG nodes, instead of keeping them as shuffles and matching them again
11115 // during code selection. This is more efficient and avoids the possibility
11116 // of inconsistencies between legalization and selection.
11117 ArrayRef<int> ShuffleMask = SVN->getMask();
11119 SDValue V1 = Op.getOperand(0);
11120 SDValue V2 = Op.getOperand(1);
11122 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
11123 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
11124 "Unexpected VECTOR_SHUFFLE mask size!");
11126 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
11127 return Res;
11129 if (SVN->isSplat()) {
11130 int Lane = SVN->getSplatIndex();
11131 // If this is an undef splat, generate it via "just" vdup, if possible.
11132 if (Lane == -1)
11133 Lane = 0;
11135 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
11136 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
11137 V1.getOperand(0));
11138 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
11139 // constant. If so, we can just reference the lane's definition directly.
11140 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
11141 !isa<ConstantSDNode>(V1.getOperand(Lane)))
11142 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
11144 // Otherwise, duplicate from the lane of the input vector.
11145 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
11146 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
11149 // Check if the mask matches a DUP for a wider element
11150 for (unsigned LaneSize : {64U, 32U, 16U}) {
11151 unsigned Lane = 0;
11152 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
11153 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
11154 : LaneSize == 32 ? AArch64ISD::DUPLANE32
11155 : AArch64ISD::DUPLANE16;
11156 // Cast V1 to an integer vector with the required lane size.
11157 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
11158 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
11159 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
11160 V1 = DAG.getBitcast(NewVecTy, V1);
11161 // Construct the DUP instruction.
11162 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
11163 // Cast back to the original type
11164 return DAG.getBitcast(VT, V1);
11168 if (isREVMask(ShuffleMask, VT, 64))
11169 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
11170 if (isREVMask(ShuffleMask, VT, 32))
11171 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
11172 if (isREVMask(ShuffleMask, VT, 16))
11173 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
11175 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
11176 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
11177 ShuffleVectorInst::isReverseMask(ShuffleMask)) {
11178 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
11179 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
11180 DAG.getConstant(8, dl, MVT::i32));
11183 bool ReverseEXT = false;
11184 unsigned Imm;
11185 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
11186 if (ReverseEXT)
11187 std::swap(V1, V2);
11188 Imm *= getExtFactor(V1);
11189 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
11190 DAG.getConstant(Imm, dl, MVT::i32));
11191 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
11192 Imm *= getExtFactor(V1);
11193 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
11194 DAG.getConstant(Imm, dl, MVT::i32));
11197 unsigned WhichResult;
11198 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
11199 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
11200 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
11202 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
11203 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
11204 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
11206 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
11207 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
11208 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
11211 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
11212 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
11213 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
11215 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
11216 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
11217 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
11219 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
11220 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
11221 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
11224 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
11225 return Concat;
11227 bool DstIsLeft;
11228 int Anomaly;
11229 int NumInputElements = V1.getValueType().getVectorNumElements();
11230 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
11231 SDValue DstVec = DstIsLeft ? V1 : V2;
11232 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
11234 SDValue SrcVec = V1;
11235 int SrcLane = ShuffleMask[Anomaly];
11236 if (SrcLane >= NumInputElements) {
11237 SrcVec = V2;
11238 SrcLane -= VT.getVectorNumElements();
11240 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
11242 EVT ScalarVT = VT.getVectorElementType();
11244 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
11245 ScalarVT = MVT::i32;
11247 return DAG.getNode(
11248 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
11249 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
11250 DstLaneV);
11253 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
11254 return NewSD;
11256 // If the shuffle is not directly supported and it has 4 elements, use
11257 // the PerfectShuffle-generated table to synthesize it from other shuffles.
11258 unsigned NumElts = VT.getVectorNumElements();
11259 if (NumElts == 4) {
11260 unsigned PFIndexes[4];
11261 for (unsigned i = 0; i != 4; ++i) {
11262 if (ShuffleMask[i] < 0)
11263 PFIndexes[i] = 8;
11264 else
11265 PFIndexes[i] = ShuffleMask[i];
11268 // Compute the index in the perfect shuffle table.
11269 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
11270 PFIndexes[2] * 9 + PFIndexes[3];
11271 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
11272 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
11273 dl);
11276 return GenerateTBL(Op, ShuffleMask, DAG);
11279 SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
11280 SelectionDAG &DAG) const {
11281 EVT VT = Op.getValueType();
11283 if (useSVEForFixedLengthVectorVT(VT))
11284 return LowerToScalableOp(Op, DAG);
11286 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
11287 "Unexpected vector type!");
11289 // We can handle the constant cases during isel.
11290 if (isa<ConstantSDNode>(Op.getOperand(0)))
11291 return Op;
11293 // There isn't a natural way to handle the general i1 case, so we use some
11294 // trickery with whilelo.
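// Roughly: the i1 is sign-extended to i64 (0 or -1) and used as the upper
// bound of an unsigned whilelo starting at 0, which yields an all-false
// predicate for 0 and an all-true predicate for -1 (treated as a huge
// unsigned value).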
11295 SDLoc DL(Op);
11296 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
11297 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
11298 DAG.getValueType(MVT::i1));
11299 SDValue ID =
11300 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
11301 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11302 if (VT == MVT::nxv1i1)
11303 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
11304 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
11305 Zero, SplatVal),
11306 Zero);
11307 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
11310 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
11311 SelectionDAG &DAG) const {
11312 SDLoc DL(Op);
11314 EVT VT = Op.getValueType();
11315 if (!isTypeLegal(VT) || !VT.isScalableVector())
11316 return SDValue();
11318 // Current lowering only supports the SVE-ACLE types.
11319 if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
11320 return SDValue();
11322 // The DUPQ operation is independent of the element type, so normalise to i64s.
11323 SDValue Idx128 = Op.getOperand(2);
11325 // DUPQ can be used when idx is in range.
11326 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
11327 if (CIdx && (CIdx->getZExtValue() <= 3)) {
11328 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
11329 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
11332 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
11334 // The ACLE says this must produce the same result as:
11335 // svtbl(data, svadd_x(svptrue_b64(),
11336 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
11337 // index * 2))
11338 SDValue One = DAG.getConstant(1, DL, MVT::i64);
11339 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
11341 // create the vector 0,1,0,1,...
11342 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
11343 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
11345 // create the vector idx64,idx64+1,idx64,idx64+1,...
11346 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
11347 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
11348 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
11350 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
11351 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
11352 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
11356 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
11357 APInt &UndefBits) {
11358 EVT VT = BVN->getValueType(0);
11359 APInt SplatBits, SplatUndef;
11360 unsigned SplatBitSize;
11361 bool HasAnyUndefs;
11362 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
11363 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
11365 for (unsigned i = 0; i < NumSplats; ++i) {
11366 CnstBits <<= SplatBitSize;
11367 UndefBits <<= SplatBitSize;
11368 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
11369 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
11372 return true;
11375 return false;
11378 // Try 64-bit splatted SIMD immediate.
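// For example (illustrative): a splat of 0x00FF00FF00FF00FF, where every byte
// is 0x00 or 0xFF, is representable as modified-immediate type 10 and becomes
// a single MOVI.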
11379 static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
11380 const APInt &Bits) {
11381 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11382 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11383 EVT VT = Op.getValueType();
11384 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
11386 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
11387 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
11389 SDLoc dl(Op);
11390 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
11391 DAG.getConstant(Value, dl, MVT::i32));
11392 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11396 return SDValue();
11399 // Try 32-bit splatted SIMD immediate.
11400 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
11401 const APInt &Bits,
11402 const SDValue *LHS = nullptr) {
11403 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11404 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11405 EVT VT = Op.getValueType();
11406 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
11407 bool isAdvSIMDModImm = false;
11408 uint64_t Shift;
11410 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
11411 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
11412 Shift = 0;
11414 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
11415 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
11416 Shift = 8;
11418 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
11419 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
11420 Shift = 16;
11422 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
11423 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
11424 Shift = 24;
11427 if (isAdvSIMDModImm) {
11428 SDLoc dl(Op);
11429 SDValue Mov;
11431 if (LHS)
11432 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
11433 DAG.getConstant(Value, dl, MVT::i32),
11434 DAG.getConstant(Shift, dl, MVT::i32));
11435 else
11436 Mov = DAG.getNode(NewOp, dl, MovTy,
11437 DAG.getConstant(Value, dl, MVT::i32),
11438 DAG.getConstant(Shift, dl, MVT::i32));
11440 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11444 return SDValue();
11447 // Try 16-bit splatted SIMD immediate.
11448 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
11449 const APInt &Bits,
11450 const SDValue *LHS = nullptr) {
11451 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11452 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11453 EVT VT = Op.getValueType();
11454 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
11455 bool isAdvSIMDModImm = false;
11456 uint64_t Shift;
11458 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
11459 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
11460 Shift = 0;
11462 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
11463 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
11464 Shift = 8;
11467 if (isAdvSIMDModImm) {
11468 SDLoc dl(Op);
11469 SDValue Mov;
11471 if (LHS)
11472 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
11473 DAG.getConstant(Value, dl, MVT::i32),
11474 DAG.getConstant(Shift, dl, MVT::i32));
11475 else
11476 Mov = DAG.getNode(NewOp, dl, MovTy,
11477 DAG.getConstant(Value, dl, MVT::i32),
11478 DAG.getConstant(Shift, dl, MVT::i32));
11480 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11484 return SDValue();
11487 // Try 32-bit splatted SIMD immediate with shifted ones.
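// Note: the Shift values 264 and 272 used below appear to be the encoded forms
// of MSL #8 and MSL #16 (shift type MSL == 4, so (4 << 6) | 8 and
// (4 << 6) | 16), matching the "shifted ones" MOVI/MVNI variants.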
11488 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
11489 SelectionDAG &DAG, const APInt &Bits) {
11490 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11491 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11492 EVT VT = Op.getValueType();
11493 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
11494 bool isAdvSIMDModImm = false;
11495 uint64_t Shift;
11497 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
11498 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
11499 Shift = 264;
11501 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
11502 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
11503 Shift = 272;
11506 if (isAdvSIMDModImm) {
11507 SDLoc dl(Op);
11508 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
11509 DAG.getConstant(Value, dl, MVT::i32),
11510 DAG.getConstant(Shift, dl, MVT::i32));
11511 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11515 return SDValue();
11518 // Try 8-bit splatted SIMD immediate.
11519 static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
11520 const APInt &Bits) {
11521 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11522 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11523 EVT VT = Op.getValueType();
11524 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
11526 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
11527 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
11529 SDLoc dl(Op);
11530 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
11531 DAG.getConstant(Value, dl, MVT::i32));
11532 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11536 return SDValue();
11539 // Try FP splatted SIMD immediate.
11540 static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
11541 const APInt &Bits) {
11542 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11543 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11544 EVT VT = Op.getValueType();
11545 bool isWide = (VT.getSizeInBits() == 128);
11546 MVT MovTy;
11547 bool isAdvSIMDModImm = false;
11549 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
11550 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
11551 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
11553 else if (isWide &&
11554 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
11555 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
11556 MovTy = MVT::v2f64;
11559 if (isAdvSIMDModImm) {
11560 SDLoc dl(Op);
11561 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
11562 DAG.getConstant(Value, dl, MVT::i32));
11563 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11567 return SDValue();
11570 // Specialized code to quickly find if PotentialBVec is a BuildVector that
11571 // consists of only the same constant int value, returned in the reference arg
11572 // ConstVal.
11573 static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
11574 uint64_t &ConstVal) {
11575 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
11576 if (!Bvec)
11577 return false;
11578 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
11579 if (!FirstElt)
11580 return false;
11581 EVT VT = Bvec->getValueType(0);
11582 unsigned NumElts = VT.getVectorNumElements();
11583 for (unsigned i = 1; i < NumElts; ++i)
11584 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
11585 return false;
11586 ConstVal = FirstElt->getZExtValue();
11587 return true;
11590 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
11591 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
11592 // BUILD_VECTORs with constant element C1, C2 is a constant, and:
11593 // - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
11594 // - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
11595 // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
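// For example (illustrative): with v4i16 elements,
//   (or (and X, splat(0x00ff)), (AArch64ISD::VSHL Y, 8))
// satisfies C1 == ~(0xffff << 8) and is rewritten to (AArch64ISD::VSLI X, Y, 8).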
11596 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
11597 EVT VT = N->getValueType(0);
11599 if (!VT.isVector())
11600 return SDValue();
11602 SDLoc DL(N);
11604 SDValue And;
11605 SDValue Shift;
11607 SDValue FirstOp = N->getOperand(0);
11608 unsigned FirstOpc = FirstOp.getOpcode();
11609 SDValue SecondOp = N->getOperand(1);
11610 unsigned SecondOpc = SecondOp.getOpcode();
11612 // Is one of the operands an AND or a BICi? The AND may have been optimised to
11613 // a BICi in order to use an immediate instead of a register.
11614 // Is the other operand a shl or lshr? This will have been turned into:
11615 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
11616 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
11617 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
11618 And = FirstOp;
11619 Shift = SecondOp;
11621 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
11622 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
11623 And = SecondOp;
11624 Shift = FirstOp;
11625 } else
11626 return SDValue();
11628 bool IsAnd = And.getOpcode() == ISD::AND;
11629 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
11631 // Is the shift amount constant?
11632 ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
11633 if (!C2node)
11634 return SDValue();
11636 uint64_t C1;
11637 if (IsAnd) {
11638 // Is the and mask vector all constant?
11639 if (!isAllConstantBuildVector(And.getOperand(1), C1))
11640 return SDValue();
11641 } else {
11642 // Reconstruct the corresponding AND immediate from the two BICi immediates.
11643 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
11644 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
11645 assert(C1nodeImm && C1nodeShift);
11646 C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
11649 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
11650 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
11651 // how much one can shift elements of a particular size?
11652 uint64_t C2 = C2node->getZExtValue();
11653 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
11654 if (C2 > ElemSizeInBits)
11655 return SDValue();
11657 APInt C1AsAPInt(ElemSizeInBits, C1);
11658 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
11659 : APInt::getLowBitsSet(ElemSizeInBits, C2);
11660 if (C1AsAPInt != RequiredC1)
11661 return SDValue();
11663 SDValue X = And.getOperand(0);
11664 SDValue Y = Shift.getOperand(0);
11666 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
11667 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
11669 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
11670 LLVM_DEBUG(N->dump(&DAG));
11671 LLVM_DEBUG(dbgs() << "into: \n");
11672 LLVM_DEBUG(ResultSLI->dump(&DAG));
11674 ++NumShiftInserts;
11675 return ResultSLI;
11678 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
11679 SelectionDAG &DAG) const {
11680 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
11681 return LowerToScalableOp(Op, DAG);
11683 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
11684 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
11685 return Res;
11687 EVT VT = Op.getValueType();
11689 SDValue LHS = Op.getOperand(0);
11690 BuildVectorSDNode *BVN =
11691 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
11692 if (!BVN) {
11693 // OR commutes, so try swapping the operands.
11694 LHS = Op.getOperand(1);
11695 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
11697 if (!BVN)
11698 return Op;
11700 APInt DefBits(VT.getSizeInBits(), 0);
11701 APInt UndefBits(VT.getSizeInBits(), 0);
11702 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
11703 SDValue NewOp;
11705 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
11706 DefBits, &LHS)) ||
11707 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
11708 DefBits, &LHS)))
11709 return NewOp;
11711 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
11712 UndefBits, &LHS)) ||
11713 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
11714 UndefBits, &LHS)))
11715 return NewOp;
11718 // We can always fall back to a non-immediate OR.
11719 return Op;
11722 // Normalize the operands of BUILD_VECTOR. The value of constant operands will
11723 // be truncated to fit element width.
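// For example, a v4i16 BUILD_VECTOR lane holding the i32 constant 0x12345 is
// rewritten as the i32 constant 0x2345, i.e. the value truncated to the
// 16-bit element width.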
11724 static SDValue NormalizeBuildVector(SDValue Op,
11725 SelectionDAG &DAG) {
11726 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11727 SDLoc dl(Op);
11728 EVT VT = Op.getValueType();
11729 EVT EltTy = VT.getVectorElementType();
11731 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
11732 return Op;
11734 SmallVector<SDValue, 16> Ops;
11735 for (SDValue Lane : Op->ops()) {
11736 // For integer vectors, type legalization would have promoted the
11737 // operands already. Otherwise, if Op is a floating-point splat
11738 // (with operands cast to integers), then the only possibilities
11739 // are constants and UNDEFs.
11740 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
11741 APInt LowBits(EltTy.getSizeInBits(),
11742 CstLane->getZExtValue());
11743 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
11744 } else if (Lane.getNode()->isUndef()) {
11745 Lane = DAG.getUNDEF(MVT::i32);
11746 } else {
11747 assert(Lane.getValueType() == MVT::i32 &&
11748 "Unexpected BUILD_VECTOR operand type");
11750 Ops.push_back(Lane);
11752 return DAG.getBuildVector(VT, dl, Ops);
11755 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
11756 EVT VT = Op.getValueType();
11758 APInt DefBits(VT.getSizeInBits(), 0);
11759 APInt UndefBits(VT.getSizeInBits(), 0);
11760 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
11761 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
11762 SDValue NewOp;
11763 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
11764 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
11765 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
11766 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
11767 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
11768 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
11769 return NewOp;
11771 DefBits = ~DefBits;
11772 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
11773 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
11774 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
11775 return NewOp;
11777 DefBits = UndefBits;
11778 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
11779 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
11780 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
11781 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
11782 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
11783 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
11784 return NewOp;
11786 DefBits = ~UndefBits;
11787 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
11788 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
11789 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
11790 return NewOp;
11793 return SDValue();
11796 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
11797 SelectionDAG &DAG) const {
11798 EVT VT = Op.getValueType();
11800 if (useSVEForFixedLengthVectorVT(VT,
11801 Subtarget->forceStreamingCompatibleSVE())) {
11802 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
11803 SDLoc DL(Op);
11804 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
11805 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
11806 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
11807 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
11808 return convertFromScalableVector(DAG, Op.getValueType(), Seq);
11811 // Revert to common legalisation for all other variants.
11812 return SDValue();
11815 // Try to build a simple constant vector.
11816 Op = NormalizeBuildVector(Op, DAG);
11817 if (VT.isInteger()) {
11818 // Certain vector constants, used to express things like logical NOT and
11819 // arithmetic NEG, are passed through unmodified. This allows special
11820 // patterns for these operations to match, which will lower these constants
11821 // to whatever is proven necessary.
11822 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
11823 if (BVN->isConstant())
11824 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
11825 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
11826 APInt Val(BitSize,
11827 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
11828 if (Val.isZero() || Val.isAllOnes())
11829 return Op;
11833 if (SDValue V = ConstantBuildVector(Op, DAG))
11834 return V;
11836 // Scan through the operands to find some interesting properties we can
11837 // exploit:
11838 // 1) If only one value is used, we can use a DUP, or
11839 // 2) if only the low element is not undef, we can just insert that, or
11840 // 3) if only one constant value is used (w/ some non-constant lanes),
11841 // we can splat the constant value into the whole vector then fill
11842 // in the non-constant lanes.
11843 // 4) FIXME: If different constant values are used, but we can intelligently
11844 // select the values we'll be overwriting for the non-constant
11845 // lanes such that we can directly materialize the vector
11846 // some other way (MOVI, e.g.), we can be sneaky.
11847 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
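// For example, a splat (X, X, X, X) of a single non-constant value is case 1
// and is lowered to one DUP below, while (X, undef, undef, undef) is case 2
// and becomes a single SCALAR_TO_VECTOR node.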
11848 SDLoc dl(Op);
11849 unsigned NumElts = VT.getVectorNumElements();
11850 bool isOnlyLowElement = true;
11851 bool usesOnlyOneValue = true;
11852 bool usesOnlyOneConstantValue = true;
11853 bool isConstant = true;
11854 bool AllLanesExtractElt = true;
11855 unsigned NumConstantLanes = 0;
11856 unsigned NumDifferentLanes = 0;
11857 unsigned NumUndefLanes = 0;
11858 SDValue Value;
11859 SDValue ConstantValue;
11860 for (unsigned i = 0; i < NumElts; ++i) {
11861 SDValue V = Op.getOperand(i);
11862 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11863 AllLanesExtractElt = false;
11864 if (V.isUndef()) {
11865 ++NumUndefLanes;
11866 continue;
11868 if (i > 0)
11869 isOnlyLowElement = false;
11870 if (!isIntOrFPConstant(V))
11871 isConstant = false;
11873 if (isIntOrFPConstant(V)) {
11874 ++NumConstantLanes;
11875 if (!ConstantValue.getNode())
11876 ConstantValue = V;
11877 else if (ConstantValue != V)
11878 usesOnlyOneConstantValue = false;
11881 if (!Value.getNode())
11882 Value = V;
11883 else if (V != Value) {
11884 usesOnlyOneValue = false;
11885 ++NumDifferentLanes;
11889 if (!Value.getNode()) {
11890 LLVM_DEBUG(
11891 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
11892 return DAG.getUNDEF(VT);
11895 // Convert BUILD_VECTOR where all elements but the lowest are undef into
11896 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
11897 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
11898 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
11899 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
11900 "SCALAR_TO_VECTOR node\n");
11901 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
11904 if (AllLanesExtractElt) {
11905 SDNode *Vector = nullptr;
11906 bool Even = false;
11907 bool Odd = false;
11908 // Check whether the extract elements match the Even pattern <0,2,4,...> or
11909 // the Odd pattern <1,3,5,...>.
11910 for (unsigned i = 0; i < NumElts; ++i) {
11911 SDValue V = Op.getOperand(i);
11912 const SDNode *N = V.getNode();
11913 if (!isa<ConstantSDNode>(N->getOperand(1)))
11914 break;
11915 SDValue N0 = N->getOperand(0);
11917 // All elements are extracted from the same vector.
11918 if (!Vector) {
11919 Vector = N0.getNode();
11920 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
11921 // BUILD_VECTOR.
11922 if (VT.getVectorElementType() !=
11923 N0.getValueType().getVectorElementType())
11924 break;
11925 } else if (Vector != N0.getNode()) {
11926 Odd = false;
11927 Even = false;
11928 break;
11931 // Extracted values are either at Even indices <0,2,4,...> or at Odd
11932 // indices <1,3,5,...>.
11933 uint64_t Val = N->getConstantOperandVal(1);
11934 if (Val == 2 * i) {
11935 Even = true;
11936 continue;
11938 if (Val - 1 == 2 * i) {
11939 Odd = true;
11940 continue;
11943 // Something does not match: abort.
11944 Odd = false;
11945 Even = false;
11946 break;
11948 if (Even || Odd) {
11949 SDValue LHS =
11950 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
11951 DAG.getConstant(0, dl, MVT::i64));
11952 SDValue RHS =
11953 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
11954 DAG.getConstant(NumElts, dl, MVT::i64));
11956 if (Even && !Odd)
11957 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
11958 RHS);
11959 if (Odd && !Even)
11960 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
11961 RHS);
11965 // Use DUP for non-constant splats. For f32 constant splats, reduce to
11966 // i32 and try again.
11967 if (usesOnlyOneValue) {
11968 if (!isConstant) {
11969 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11970 Value.getValueType() != VT) {
11971 LLVM_DEBUG(
11972 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
11973 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
11976 // This is actually a DUPLANExx operation, which keeps everything vectory.
11978 SDValue Lane = Value.getOperand(1);
11979 Value = Value.getOperand(0);
11980 if (Value.getValueSizeInBits() == 64) {
11981 LLVM_DEBUG(
11982 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
11983 "widening it\n");
11984 Value = WidenVector(Value, DAG);
11987 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
11988 return DAG.getNode(Opcode, dl, VT, Value, Lane);
11991 if (VT.getVectorElementType().isFloatingPoint()) {
11992 SmallVector<SDValue, 8> Ops;
11993 EVT EltTy = VT.getVectorElementType();
11994 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
11995 EltTy == MVT::f64) && "Unsupported floating-point vector type");
11996 LLVM_DEBUG(
11997 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
11998 "BITCASTS, and try again\n");
11999 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
12000 for (unsigned i = 0; i < NumElts; ++i)
12001 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
12002 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
12003 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
12004 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
12005 Val.dump(););
12006 Val = LowerBUILD_VECTOR(Val, DAG);
12007 if (Val.getNode())
12008 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
12012 // If we need to insert a small number of different non-constant elements and
12013 // the vector width is sufficiently large, prefer using DUP with the common
12014 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
12015 // skip the constant lane handling below.
12016 bool PreferDUPAndInsert =
12017 !isConstant && NumDifferentLanes >= 1 &&
12018 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
12019 NumDifferentLanes >= NumConstantLanes;
12021 // If only one constant value was used, across more than one lane, start by
12022 // splatting that value, then replace the non-constant lanes. This is better
12023 // than the default, which will perform a separate initialization for each
12024 // lane.
12025 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
12026 // Firstly, try to materialize the splat constant.
12027 SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
12028 Val = ConstantBuildVector(Vec, DAG);
12029 if (!Val) {
12030 // Otherwise, materialize the constant and splat it.
12031 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
12032 DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
12035 // Now insert the non-constant lanes.
12036 for (unsigned i = 0; i < NumElts; ++i) {
12037 SDValue V = Op.getOperand(i);
12038 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
12039 if (!isIntOrFPConstant(V))
12040 // Note that type legalization likely mucked about with the VT of the
12041 // source operand, so we may have to convert it here before inserting.
12042 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
12044 return Val;
12047 // This will generate a load from the constant pool.
12048 if (isConstant) {
12049 LLVM_DEBUG(
12050 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
12051 "expansion\n");
12052 return SDValue();
12055 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
12056 // v4i32s. This is really a truncate, which we can construct out of (legal)
12057 // concats and truncate nodes.
12058 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
12059 return M;
12061 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
12062 if (NumElts >= 4) {
12063 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
12064 return shuffle;
12067 if (PreferDUPAndInsert) {
12068 // First, build a constant vector with the common element.
12069 SmallVector<SDValue, 8> Ops(NumElts, Value);
12070 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
12071 // Next, insert the elements that do not match the common value.
12072 for (unsigned I = 0; I < NumElts; ++I)
12073 if (Op.getOperand(I) != Value)
12074 NewVector =
12075 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
12076 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
12078 return NewVector;
12081 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
12082 // know the default expansion would otherwise fall back on something even
12083 // worse. For a vector with one or two non-undef values, that would be
12084 // scalar_to_vector for the elements followed by a shuffle (provided the
12085 // shuffle is valid for the target); for everything else, it would be
12086 // materialization element by element on the stack followed by a load.
12087 if (!isConstant && !usesOnlyOneValue) {
12088 LLVM_DEBUG(
12089 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
12090 "of INSERT_VECTOR_ELT\n");
12092 SDValue Vec = DAG.getUNDEF(VT);
12093 SDValue Op0 = Op.getOperand(0);
12094 unsigned i = 0;
12096 // Use SCALAR_TO_VECTOR for lane zero to
12097 // a) Avoid a RMW dependency on the full vector register, and
12098 // b) Allow the register coalescer to fold away the copy if the
12099 // value is already in an S or D register, and we're forced to emit an
12100 // INSERT_SUBREG that we can't fold anywhere.
12102 // We also allow types like i8 and i16 which are illegal scalar but legal
12103 // vector element types. After type-legalization the inserted value is
12104 // extended (i32) and it is safe to cast them to the vector type by ignoring
12105 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
12106 if (!Op0.isUndef()) {
12107 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
12108 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
12109 ++i;
12111 LLVM_DEBUG(if (i < NumElts) dbgs()
12112 << "Creating nodes for the other vector elements:\n";);
12113 for (; i < NumElts; ++i) {
12114 SDValue V = Op.getOperand(i);
12115 if (V.isUndef())
12116 continue;
12117 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
12118 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
12120 return Vec;
12123 LLVM_DEBUG(
12124 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
12125 "better alternative\n");
12126 return SDValue();
12129 SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
12130 SelectionDAG &DAG) const {
12131 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
12132 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
12134 assert(Op.getValueType().isScalableVector() &&
12135 isTypeLegal(Op.getValueType()) &&
12136 "Expected legal scalable vector type!");
12138 if (isTypeLegal(Op.getOperand(0).getValueType())) {
12139 unsigned NumOperands = Op->getNumOperands();
12140 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
12141 "Unexpected number of operands in CONCAT_VECTORS");
12143 if (NumOperands == 2)
12144 return Op;
12146 // Concat each pair of subvectors and pack into the lower half of the array.
12147 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
12148 while (ConcatOps.size() > 1) {
12149 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
12150 SDValue V1 = ConcatOps[I];
12151 SDValue V2 = ConcatOps[I + 1];
12152 EVT SubVT = V1.getValueType();
12153 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
12154 ConcatOps[I / 2] =
12155 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
12157 ConcatOps.resize(ConcatOps.size() / 2);
12159 return ConcatOps[0];
12162 return SDValue();
12165 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
12166 SelectionDAG &DAG) const {
12167 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
12169 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
12170 return LowerFixedLengthInsertVectorElt(Op, DAG);
12172 // Check for non-constant or out of range lane.
12173 EVT VT = Op.getOperand(0).getValueType();
12175 if (VT.getScalarType() == MVT::i1) {
12176 EVT VectorVT = getPromotedVTForPredicate(VT);
12177 SDLoc DL(Op);
12178 SDValue ExtendedVector =
12179 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
12180 SDValue ExtendedValue =
12181 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
12182 VectorVT.getScalarType().getSizeInBits() < 32
12183 ? MVT::i32
12184 : VectorVT.getScalarType());
12185 ExtendedVector =
12186 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
12187 ExtendedValue, Op.getOperand(2));
12188 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
12191 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
12192 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
12193 return SDValue();
12195 // Insertion/extraction are legal for V128 types.
12196 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
12197 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
12198 VT == MVT::v8f16 || VT == MVT::v8bf16)
12199 return Op;
12201 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
12202 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
12203 VT != MVT::v4bf16)
12204 return SDValue();
12206 // For V64 types, we perform insertion by expanding the value
12207 // to a V128 type and performing the insertion on that.
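// For example, an insert into a v4i16 vector is widened to v8i16, the element
// is inserted there, and the result is then narrowed back to v4i16.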
12208 SDLoc DL(Op);
12209 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
12210 EVT WideTy = WideVec.getValueType();
12212 SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
12213 Op.getOperand(1), Op.getOperand(2));
12214 // Re-narrow the resultant vector.
12215 return NarrowVector(Node, DAG);
12218 SDValue
12219 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
12220 SelectionDAG &DAG) const {
12221 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
12222 EVT VT = Op.getOperand(0).getValueType();
12224 if (VT.getScalarType() == MVT::i1) {
12225 // We can't directly extract from an SVE predicate; extend it first.
12226 // (This isn't the only possible lowering, but it's straightforward.)
12227 EVT VectorVT = getPromotedVTForPredicate(VT);
12228 SDLoc DL(Op);
12229 SDValue Extend =
12230 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
12231 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
12232 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
12233 Extend, Op.getOperand(1));
12234 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
12237 if (useSVEForFixedLengthVectorVT(VT))
12238 return LowerFixedLengthExtractVectorElt(Op, DAG);
12240 // Check for non-constant or out of range lane.
12241 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
12242 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
12243 return SDValue();
12245 // Insertion/extraction are legal for V128 types.
12246 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
12247 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
12248 VT == MVT::v8f16 || VT == MVT::v8bf16)
12249 return Op;
12251 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
12252 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
12253 VT != MVT::v4bf16)
12254 return SDValue();
12256 // For V64 types, we perform extraction by expanding the value
12257 // to a V128 type and performing the extraction on that.
12258 SDLoc DL(Op);
12259 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
12260 EVT WideTy = WideVec.getValueType();
12262 EVT ExtrTy = WideTy.getVectorElementType();
12263 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
12264 ExtrTy = MVT::i32;
12266 // For extractions, we just return the result directly.
12267 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
12268 Op.getOperand(1));
12271 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
12272 SelectionDAG &DAG) const {
12273 assert(Op.getValueType().isFixedLengthVector() &&
12274 "Only cases that extract a fixed length vector are supported!");
12276 EVT InVT = Op.getOperand(0).getValueType();
12277 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12278 unsigned Size = Op.getValueSizeInBits();
12280 // If we don't have legal types yet, do nothing
12281 if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
12282 return SDValue();
12284 if (InVT.isScalableVector()) {
12285 // This will be matched by custom code during ISelDAGToDAG.
12286 if (Idx == 0 && isPackedVectorType(InVT, DAG))
12287 return Op;
12289 return SDValue();
12292 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
12293 if (Idx == 0 && InVT.getSizeInBits() <= 128)
12294 return Op;
12296 // If this is extracting the upper 64-bits of a 128-bit vector, we match
12297 // that directly.
12298 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
12299 InVT.getSizeInBits() == 128)
12300 return Op;
12302 if (useSVEForFixedLengthVectorVT(InVT)) {
12303 SDLoc DL(Op);
12305 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
12306 SDValue NewInVec =
12307 convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
12309 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
12310 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
12311 return convertFromScalableVector(DAG, Op.getValueType(), Splice);
12314 return SDValue();
12317 SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
12318 SelectionDAG &DAG) const {
12319 assert(Op.getValueType().isScalableVector() &&
12320 "Only expect to lower inserts into scalable vectors!");
12322 EVT InVT = Op.getOperand(1).getValueType();
12323 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
12325 SDValue Vec0 = Op.getOperand(0);
12326 SDValue Vec1 = Op.getOperand(1);
12327 SDLoc DL(Op);
12328 EVT VT = Op.getValueType();
12330 if (InVT.isScalableVector()) {
12331 if (!isTypeLegal(VT))
12332 return SDValue();
12334 // Break down insert_subvector into simpler parts.
12335 if (VT.getVectorElementType() == MVT::i1) {
12336 unsigned NumElts = VT.getVectorMinNumElements();
12337 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
12339 SDValue Lo, Hi;
12340 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
12341 DAG.getVectorIdxConstant(0, DL));
12342 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
12343 DAG.getVectorIdxConstant(NumElts / 2, DL));
12344 if (Idx < (NumElts / 2)) {
12345 SDValue NewLo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
12346 DAG.getVectorIdxConstant(Idx, DL));
12347 return DAG.getNode(AArch64ISD::UZP1, DL, VT, NewLo, Hi);
12348 } else {
12349 SDValue NewHi =
12350 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
12351 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
12352 return DAG.getNode(AArch64ISD::UZP1, DL, VT, Lo, NewHi);
12356 // Ensure the subvector is half the size of the main vector.
12357 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
12358 return SDValue();
12360 // Here narrow and wide refer to the vector element types. After "casting",
12361 // both vectors must have the same bit length, so because the subvector has
12362 // fewer elements, those elements need to be bigger.
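// For example, when inserting an nxv2f32 subvector into an nxv4f32 vector,
// NarrowVT is nxv4i32 and WideVT is nxv2i64 (assuming getPackedSVEVectorVT
// yields the packed type with the requested element count).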
12363 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
12364 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
12366 // NOP cast operands to the largest legal vector of the same element count.
12367 if (VT.isFloatingPoint()) {
12368 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
12369 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
12370 } else {
12371 // Legal integer vectors are already their largest so Vec0 is fine as is.
12372 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
12375 // To replace the top/bottom half of vector V with vector SubV we widen the
12376 // preserved half of V, concatenate this to SubV (the order depending on the
12377 // half being replaced) and then narrow the result.
12378 SDValue Narrow;
12379 if (Idx == 0) {
12380 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
12381 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
12382 } else {
12383 assert(Idx == InVT.getVectorMinNumElements() &&
12384 "Invalid subvector index!");
12385 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
12386 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
12389 return getSVESafeBitCast(VT, Narrow, DAG);
12392 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
12393 // This will be matched by custom code during ISelDAGToDAG.
12394 if (Vec0.isUndef())
12395 return Op;
12397 Optional<unsigned> PredPattern =
12398 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
12399 auto PredTy = VT.changeVectorElementType(MVT::i1);
12400 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
12401 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
12402 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
12405 return SDValue();
12408 static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
12409 if (Op.getOpcode() != AArch64ISD::DUP &&
12410 Op.getOpcode() != ISD::SPLAT_VECTOR &&
12411 Op.getOpcode() != ISD::BUILD_VECTOR)
12412 return false;
12414 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
12415 !isAllConstantBuildVector(Op, SplatVal))
12416 return false;
12418 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
12419 !isa<ConstantSDNode>(Op->getOperand(0)))
12420 return false;
12422 SplatVal = Op->getConstantOperandVal(0);
12423 if (Op.getValueType().getVectorElementType() != MVT::i64)
12424 SplatVal = (int32_t)SplatVal;
12426 Negated = false;
12427 if (isPowerOf2_64(SplatVal))
12428 return true;
12430 Negated = true;
12431 if (isPowerOf2_64(-SplatVal)) {
12432 SplatVal = -SplatVal;
12433 return true;
12436 return false;
12439 SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
12440 EVT VT = Op.getValueType();
12441 SDLoc dl(Op);
12443 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
12444 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
12446 assert(VT.isScalableVector() && "Expected a scalable vector.");
12448 bool Signed = Op.getOpcode() == ISD::SDIV;
12449 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
12451 bool Negated;
12452 uint64_t SplatVal;
12453 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
12454 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
12455 SDValue Res =
12456 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
12457 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
12458 if (Negated)
12459 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
12461 return Res;
12464 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
12465 return LowerToPredicatedOp(Op, DAG, PredOpcode);
12467 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
12468 // operations, and truncate the result.
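// For example, an nxv8i16 division is performed as two nxv4i32 divisions on
// the unpacked halves of the operands; UZP1 then packs the low 16 bits of
// each 32-bit result back into an nxv8i16.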
12469 EVT WidenedVT;
12470 if (VT == MVT::nxv16i8)
12471 WidenedVT = MVT::nxv8i16;
12472 else if (VT == MVT::nxv8i16)
12473 WidenedVT = MVT::nxv4i32;
12474 else
12475 llvm_unreachable("Unexpected Custom DIV operation");
12477 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
12478 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
12479 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
12480 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
12481 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
12482 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
12483 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
12484 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
12485 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
12488 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
12489 // Currently no fixed length shuffles that require SVE are legal.
12490 if (useSVEForFixedLengthVectorVT(VT))
12491 return false;
12493 if (VT.getVectorNumElements() == 4 &&
12494 (VT.is128BitVector() || VT.is64BitVector())) {
12495 unsigned Cost = getPerfectShuffleCost(M);
12496 if (Cost <= 1)
12497 return true;
12500 bool DummyBool;
12501 int DummyInt;
12502 unsigned DummyUnsigned;
12504 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
12505 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
12506 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
12507 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
12508 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
12509 isZIPMask(M, VT, DummyUnsigned) ||
12510 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
12511 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
12512 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
12513 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
12514 isConcatMask(M, VT, VT.getSizeInBits() == 128));
12517 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
12518 EVT VT) const {
12519 // Just delegate to the generic legality, clear masks aren't special.
12520 return isShuffleMaskLegal(M, VT);
12523 /// getVShiftImm - Check if this is a valid build_vector for the immediate
12524 /// operand of a vector shift operation, where all the elements of the
12525 /// build_vector must have the same constant integer value.
12526 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
12527 // Ignore bit_converts.
12528 while (Op.getOpcode() == ISD::BITCAST)
12529 Op = Op.getOperand(0);
12530 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
12531 APInt SplatBits, SplatUndef;
12532 unsigned SplatBitSize;
12533 bool HasAnyUndefs;
12534 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
12535 HasAnyUndefs, ElementBits) ||
12536 SplatBitSize > ElementBits)
12537 return false;
12538 Cnt = SplatBits.getSExtValue();
12539 return true;
12542 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
12543 /// operand of a vector shift left operation. That value must be in the range:
12544 /// 0 <= Value < ElementBits for a left shift; or
12545 /// 0 <= Value <= ElementBits for a long left shift.
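// For example, for v8i16 (ElementBits == 16) a splat shift amount of 15 is
// valid for a regular left shift, whereas 16 is only valid for a long
// (widening) left shift.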
12546 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
12547 assert(VT.isVector() && "vector shift count is not a vector type");
12548 int64_t ElementBits = VT.getScalarSizeInBits();
12549 if (!getVShiftImm(Op, ElementBits, Cnt))
12550 return false;
12551 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
12554 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
12555 /// operand of a vector shift right operation. The value must be in the range:
12556 /// 1 <= Value <= ElementBits for a right shift; or
/// 1 <= Value <= ElementBits/2 for a narrowing right shift.
12557 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
12558 assert(VT.isVector() && "vector shift count is not a vector type");
12559 int64_t ElementBits = VT.getScalarSizeInBits();
12560 if (!getVShiftImm(Op, ElementBits, Cnt))
12561 return false;
12562 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
12565 SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
12566 SelectionDAG &DAG) const {
12567 EVT VT = Op.getValueType();
12569 if (VT.getScalarType() == MVT::i1) {
12570 // Lower i1 truncate to `(x & 1) != 0`.
12571 SDLoc dl(Op);
12572 EVT OpVT = Op.getOperand(0).getValueType();
12573 SDValue Zero = DAG.getConstant(0, dl, OpVT);
12574 SDValue One = DAG.getConstant(1, dl, OpVT);
12575 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
12576 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
12579 if (!VT.isVector() || VT.isScalableVector())
12580 return SDValue();
12582 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
12583 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
12585 return SDValue();
12588 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
12589 SelectionDAG &DAG) const {
12590 EVT VT = Op.getValueType();
12591 SDLoc DL(Op);
12592 int64_t Cnt;
12594 if (!Op.getOperand(1).getValueType().isVector())
12595 return Op;
12596 unsigned EltSize = VT.getScalarSizeInBits();
12598 switch (Op.getOpcode()) {
12599 case ISD::SHL:
12600 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
12601 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
12603 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
12604 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
12605 DAG.getConstant(Cnt, DL, MVT::i32));
12606 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
12607 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
12608 MVT::i32),
12609 Op.getOperand(0), Op.getOperand(1));
12610 case ISD::SRA:
12611 case ISD::SRL:
12612 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) {
12613 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
12614 : AArch64ISD::SRL_PRED;
12615 return LowerToPredicatedOp(Op, DAG, Opc);
12618 // Right shift immediate
12619 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
12620 unsigned Opc =
12621 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
12622 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
12623 DAG.getConstant(Cnt, DL, MVT::i32));
12626 // Right shift register. Note that there is no shift-right-register
12627 // instruction, but the shift-left-register instruction takes a signed
12628 // value, where negative amounts specify a right shift.
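// For example, (srl X, Y) with a non-immediate Y is emitted as
// ushl(X, 0 - Y) using the aarch64_neon_ushl intrinsic below.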
12629 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
12630 : Intrinsic::aarch64_neon_ushl;
12631 // negate the shift amount
12632 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
12633 Op.getOperand(1));
12634 SDValue NegShiftLeft =
12635 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
12636 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
12637 NegShift);
12638 return NegShiftLeft;
12641 llvm_unreachable("unexpected shift opcode");
12644 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
12645 AArch64CC::CondCode CC, bool NoNans, EVT VT,
12646 const SDLoc &dl, SelectionDAG &DAG) {
12647 EVT SrcVT = LHS.getValueType();
12648 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
12649 "function only supposed to emit natural comparisons");
12651 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
12652 APInt CnstBits(VT.getSizeInBits(), 0);
12653 APInt UndefBits(VT.getSizeInBits(), 0);
12654 bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
12655 bool IsZero = IsCnst && (CnstBits == 0);
12657 if (SrcVT.getVectorElementType().isFloatingPoint()) {
12658 switch (CC) {
12659 default:
12660 return SDValue();
12661 case AArch64CC::NE: {
12662 SDValue Fcmeq;
12663 if (IsZero)
12664 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
12665 else
12666 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
12667 return DAG.getNOT(dl, Fcmeq, VT);
12669 case AArch64CC::EQ:
12670 if (IsZero)
12671 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
12672 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
12673 case AArch64CC::GE:
12674 if (IsZero)
12675 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
12676 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
12677 case AArch64CC::GT:
12678 if (IsZero)
12679 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
12680 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
12681 case AArch64CC::LE:
12682 if (!NoNans)
12683 return SDValue();
12684 // If we ignore NaNs then we can use the LS implementation.
12685 [[fallthrough]];
12686 case AArch64CC::LS:
12687 if (IsZero)
12688 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
12689 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
12690 case AArch64CC::LT:
12691 if (!NoNans)
12692 return SDValue();
12693 // If we ignore NaNs then we can use the MI implementation.
12694 [[fallthrough]];
12695 case AArch64CC::MI:
12696 if (IsZero)
12697 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
12698 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
12702 switch (CC) {
12703 default:
12704 return SDValue();
12705 case AArch64CC::NE: {
12706 SDValue Cmeq;
12707 if (IsZero)
12708 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
12709 else
12710 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
12711 return DAG.getNOT(dl, Cmeq, VT);
12713 case AArch64CC::EQ:
12714 if (IsZero)
12715 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
12716 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
12717 case AArch64CC::GE:
12718 if (IsZero)
12719 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
12720 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
12721 case AArch64CC::GT:
12722 if (IsZero)
12723 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
12724 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
12725 case AArch64CC::LE:
12726 if (IsZero)
12727 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
12728 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
12729 case AArch64CC::LS:
12730 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
12731 case AArch64CC::LO:
12732 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
12733 case AArch64CC::LT:
12734 if (IsZero)
12735 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
12736 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
12737 case AArch64CC::HI:
12738 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
12739 case AArch64CC::HS:
12740 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
12744 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
12745 SelectionDAG &DAG) const {
12746 if (Op.getValueType().isScalableVector())
12747 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
12749 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
12750 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
12752 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
12753 SDValue LHS = Op.getOperand(0);
12754 SDValue RHS = Op.getOperand(1);
12755 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
12756 SDLoc dl(Op);
12758 if (LHS.getValueType().getVectorElementType().isInteger()) {
12759 assert(LHS.getValueType() == RHS.getValueType());
12760 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
12761 SDValue Cmp =
12762 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
12763 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
12766 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
12768 // Make v4f16 (only) fcmp operations utilise vector instructions;
12769 // v8f16 support will be a little more complicated.
12770 if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
12771 if (LHS.getValueType().getVectorNumElements() == 4) {
12772 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
12773 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
12774 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
12775 DAG.ReplaceAllUsesWith(Op, NewSetcc);
12776 CmpVT = MVT::v4i32;
12777 } else
12778 return SDValue();
12781 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
12782 LHS.getValueType().getVectorElementType() != MVT::f128);
12784 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
12785 // clean. Some of them require two branches to implement.
12786 AArch64CC::CondCode CC1, CC2;
12787 bool ShouldInvert;
12788 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
12790 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
12791 SDValue Cmp =
12792 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
12793 if (!Cmp.getNode())
12794 return SDValue();
12796 if (CC2 != AArch64CC::AL) {
12797 SDValue Cmp2 =
12798 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
12799 if (!Cmp2.getNode())
12800 return SDValue();
12802 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
12805 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
12807 if (ShouldInvert)
12808 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
12810 return Cmp;
12813 static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
12814 SelectionDAG &DAG) {
12815 SDValue VecOp = ScalarOp.getOperand(0);
12816 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
12817 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
12818 DAG.getConstant(0, DL, MVT::i64));
12821 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
12822 SelectionDAG &DAG) const {
12823 SDValue Src = Op.getOperand(0);
12825 // Try to lower fixed length reductions to SVE.
12826 EVT SrcVT = Src.getValueType();
12827 bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND ||
12828 Op.getOpcode() == ISD::VECREDUCE_OR ||
12829 Op.getOpcode() == ISD::VECREDUCE_XOR ||
12830 Op.getOpcode() == ISD::VECREDUCE_FADD ||
12831 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
12832 SrcVT.getVectorElementType() == MVT::i64);
12833 if (SrcVT.isScalableVector() ||
12834 useSVEForFixedLengthVectorVT(
12835 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
12837 if (SrcVT.getVectorElementType() == MVT::i1)
12838 return LowerPredReductionToSVE(Op, DAG);
12840 switch (Op.getOpcode()) {
12841 case ISD::VECREDUCE_ADD:
12842 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
12843 case ISD::VECREDUCE_AND:
12844 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
12845 case ISD::VECREDUCE_OR:
12846 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
12847 case ISD::VECREDUCE_SMAX:
12848 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
12849 case ISD::VECREDUCE_SMIN:
12850 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
12851 case ISD::VECREDUCE_UMAX:
12852 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
12853 case ISD::VECREDUCE_UMIN:
12854 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
12855 case ISD::VECREDUCE_XOR:
12856 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
12857 case ISD::VECREDUCE_FADD:
12858 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
12859 case ISD::VECREDUCE_FMAX:
12860 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
12861 case ISD::VECREDUCE_FMIN:
12862 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
12863 default:
12864 llvm_unreachable("Unhandled fixed length reduction");
12868 // Lower NEON reductions.
12869 SDLoc dl(Op);
12870 switch (Op.getOpcode()) {
12871 case ISD::VECREDUCE_ADD:
12872 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
12873 case ISD::VECREDUCE_SMAX:
12874 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
12875 case ISD::VECREDUCE_SMIN:
12876 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
12877 case ISD::VECREDUCE_UMAX:
12878 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
12879 case ISD::VECREDUCE_UMIN:
12880 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
12881 case ISD::VECREDUCE_FMAX: {
12882 return DAG.getNode(
12883 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
12884 DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
12885 Src);
12887 case ISD::VECREDUCE_FMIN: {
12888 return DAG.getNode(
12889 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
12890 DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
12891 Src);
12893 default:
12894 llvm_unreachable("Unhandled reduction");
12898 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
12899 SelectionDAG &DAG) const {
12900 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
12901 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
12902 return SDValue();
12904 // LSE has an atomic load-add instruction, but not a load-sub.
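// That is, (atomic_load_sub addr, x) is rewritten below as
// (atomic_load_add addr, 0 - x).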
12905 SDLoc dl(Op);
12906 MVT VT = Op.getSimpleValueType();
12907 SDValue RHS = Op.getOperand(2);
12908 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
12909 RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
12910 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
12911 Op.getOperand(0), Op.getOperand(1), RHS,
12912 AN->getMemOperand());
12915 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
12916 SelectionDAG &DAG) const {
12917 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
12918 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
12919 return SDValue();
12921 // LSE has an atomic load-clear instruction, but not a load-and.
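// That is, (atomic_load_and addr, x) is rewritten below as
// (atomic_load_clr addr, xor(x, -1)).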
12922 SDLoc dl(Op);
12923 MVT VT = Op.getSimpleValueType();
12924 SDValue RHS = Op.getOperand(2);
12925 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
12926 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
12927 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
12928 Op.getOperand(0), Op.getOperand(1), RHS,
12929 AN->getMemOperand());
12932 SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
12933 SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
12934 SDLoc dl(Op);
12935 EVT PtrVT = getPointerTy(DAG.getDataLayout());
12936 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
12937 PtrVT, 0);
12939 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
12940 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
12941 if (Subtarget->hasCustomCallingConv())
12942 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
12944 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
12945 DAG.getConstant(4, dl, MVT::i64));
12946 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
12947 Chain =
12948 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
12949 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
12950 DAG.getRegisterMask(Mask), Chain.getValue(1));
12951 // To match the actual intent better, we should read the output from X15 here
12952 // again (instead of potentially spilling it to the stack), but rereading Size
12953 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
12954 // here.
12956 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
12957 DAG.getConstant(4, dl, MVT::i64));
12958 return Chain;
12961 SDValue
12962 AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
12963 SelectionDAG &DAG) const {
12964 assert(Subtarget->isTargetWindows() &&
12965 "Only Windows alloca probing supported");
12966 SDLoc dl(Op);
12967 // Get the inputs.
12968 SDNode *Node = Op.getNode();
12969 SDValue Chain = Op.getOperand(0);
12970 SDValue Size = Op.getOperand(1);
12971 MaybeAlign Align =
12972 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
12973 EVT VT = Node->getValueType(0);
12975 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
12976 "no-stack-arg-probe")) {
12977 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
12978 Chain = SP.getValue(1);
12979 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
12980 if (Align)
12981 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
12982 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
12983 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
12984 SDValue Ops[2] = {SP, Chain};
12985 return DAG.getMergeValues(Ops, dl);
12988 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
12990 Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
12992 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
12993 Chain = SP.getValue(1);
12994 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
12995 if (Align)
12996 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
12997 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
12998 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
13000 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
13002 SDValue Ops[2] = {SP, Chain};
13003 return DAG.getMergeValues(Ops, dl);
13006 SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
13007 SelectionDAG &DAG) const {
13008 EVT VT = Op.getValueType();
13009 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
13011 SDLoc DL(Op);
13012 APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
13013 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
13014 VT);
13017 /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
13018 template <unsigned NumVecs>
13019 static bool
13020 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
13021 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
13022 Info.opc = ISD::INTRINSIC_VOID;
13023 // Retrieve EC from first vector argument.
13024 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
13025 ElementCount EC = VT.getVectorElementCount();
13026 #ifndef NDEBUG
13027 // Check the assumption that all input vectors are the same type.
13028 for (unsigned I = 0; I < NumVecs; ++I)
13029 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
13030 "Invalid type.");
13031 #endif
13032 // memVT is `NumVecs * VT`.
13033 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
13034 EC * NumVecs);
13035 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
13036 Info.offset = 0;
13037 Info.align.reset();
13038 Info.flags = MachineMemOperand::MOStore;
13039 return true;
13042 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
13043 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
13044 /// specified in the intrinsic calls.
13045 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
13046 const CallInst &I,
13047 MachineFunction &MF,
13048 unsigned Intrinsic) const {
13049 auto &DL = I.getModule()->getDataLayout();
13050 switch (Intrinsic) {
13051 case Intrinsic::aarch64_sve_st2:
13052 return setInfoSVEStN<2>(*this, DL, Info, I);
13053 case Intrinsic::aarch64_sve_st3:
13054 return setInfoSVEStN<3>(*this, DL, Info, I);
13055 case Intrinsic::aarch64_sve_st4:
13056 return setInfoSVEStN<4>(*this, DL, Info, I);
13057 case Intrinsic::aarch64_neon_ld2:
13058 case Intrinsic::aarch64_neon_ld3:
13059 case Intrinsic::aarch64_neon_ld4:
13060 case Intrinsic::aarch64_neon_ld1x2:
13061 case Intrinsic::aarch64_neon_ld1x3:
13062 case Intrinsic::aarch64_neon_ld1x4:
13063 case Intrinsic::aarch64_neon_ld2lane:
13064 case Intrinsic::aarch64_neon_ld3lane:
13065 case Intrinsic::aarch64_neon_ld4lane:
13066 case Intrinsic::aarch64_neon_ld2r:
13067 case Intrinsic::aarch64_neon_ld3r:
13068 case Intrinsic::aarch64_neon_ld4r: {
13069 Info.opc = ISD::INTRINSIC_W_CHAIN;
13070 // Conservatively set memVT to the entire set of vectors loaded.
13071 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
13072 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
13073 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
13074 Info.offset = 0;
13075 Info.align.reset();
13076 // volatile loads with NEON intrinsics not supported
13077 Info.flags = MachineMemOperand::MOLoad;
13078 return true;
13080 case Intrinsic::aarch64_neon_st2:
13081 case Intrinsic::aarch64_neon_st3:
13082 case Intrinsic::aarch64_neon_st4:
13083 case Intrinsic::aarch64_neon_st1x2:
13084 case Intrinsic::aarch64_neon_st1x3:
13085 case Intrinsic::aarch64_neon_st1x4:
13086 case Intrinsic::aarch64_neon_st2lane:
13087 case Intrinsic::aarch64_neon_st3lane:
13088 case Intrinsic::aarch64_neon_st4lane: {
13089 Info.opc = ISD::INTRINSIC_VOID;
13090 // Conservatively set memVT to the entire set of vectors stored.
13091 unsigned NumElts = 0;
13092 for (const Value *Arg : I.args()) {
13093 Type *ArgTy = Arg->getType();
13094 if (!ArgTy->isVectorTy())
13095 break;
13096 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
13098 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
13099 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
13100 Info.offset = 0;
13101 Info.align.reset();
13102 // volatile stores with NEON intrinsics not supported
13103 Info.flags = MachineMemOperand::MOStore;
13104 return true;
13106 case Intrinsic::aarch64_ldaxr:
13107 case Intrinsic::aarch64_ldxr: {
13108 Type *ValTy = I.getParamElementType(0);
13109 Info.opc = ISD::INTRINSIC_W_CHAIN;
13110 Info.memVT = MVT::getVT(ValTy);
13111 Info.ptrVal = I.getArgOperand(0);
13112 Info.offset = 0;
13113 Info.align = DL.getABITypeAlign(ValTy);
13114 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
13115 return true;
13117 case Intrinsic::aarch64_stlxr:
13118 case Intrinsic::aarch64_stxr: {
13119 Type *ValTy = I.getParamElementType(1);
13120 Info.opc = ISD::INTRINSIC_W_CHAIN;
13121 Info.memVT = MVT::getVT(ValTy);
13122 Info.ptrVal = I.getArgOperand(1);
13123 Info.offset = 0;
13124 Info.align = DL.getABITypeAlign(ValTy);
13125 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
13126 return true;
13128 case Intrinsic::aarch64_ldaxp:
13129 case Intrinsic::aarch64_ldxp:
13130 Info.opc = ISD::INTRINSIC_W_CHAIN;
13131 Info.memVT = MVT::i128;
13132 Info.ptrVal = I.getArgOperand(0);
13133 Info.offset = 0;
13134 Info.align = Align(16);
13135 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
13136 return true;
13137 case Intrinsic::aarch64_stlxp:
13138 case Intrinsic::aarch64_stxp:
13139 Info.opc = ISD::INTRINSIC_W_CHAIN;
13140 Info.memVT = MVT::i128;
13141 Info.ptrVal = I.getArgOperand(2);
13142 Info.offset = 0;
13143 Info.align = Align(16);
13144 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
13145 return true;
13146 case Intrinsic::aarch64_sve_ldnt1: {
13147 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
13148 Info.opc = ISD::INTRINSIC_W_CHAIN;
13149 Info.memVT = MVT::getVT(I.getType());
13150 Info.ptrVal = I.getArgOperand(1);
13151 Info.offset = 0;
13152 Info.align = DL.getABITypeAlign(ElTy);
13153 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
13154 return true;
13156 case Intrinsic::aarch64_sve_stnt1: {
13157 Type *ElTy =
13158 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
13159 Info.opc = ISD::INTRINSIC_W_CHAIN;
13160 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
13161 Info.ptrVal = I.getArgOperand(2);
13162 Info.offset = 0;
13163 Info.align = DL.getABITypeAlign(ElTy);
13164 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
13165 return true;
13167 case Intrinsic::aarch64_mops_memset_tag: {
13168 Value *Dst = I.getArgOperand(0);
13169 Value *Val = I.getArgOperand(1);
13170 Info.opc = ISD::INTRINSIC_W_CHAIN;
13171 Info.memVT = MVT::getVT(Val->getType());
13172 Info.ptrVal = Dst;
13173 Info.offset = 0;
13174 Info.align = I.getParamAlign(0).valueOrOne();
13175 Info.flags = MachineMemOperand::MOStore;
13176 // The size of the memory being operated on is unknown at this point
13177 Info.size = MemoryLocation::UnknownSize;
13178 return true;
13180 default:
13181 break;
13184 return false;
13187 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
13188 ISD::LoadExtType ExtTy,
13189 EVT NewVT) const {
13190 // TODO: This may be worth removing. Check regression tests for diffs.
13191 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
13192 return false;
13194 // If we're reducing the load width in order to avoid having to use an extra
13195 // instruction to do extension then it's probably a good idea.
13196 if (ExtTy != ISD::NON_EXTLOAD)
13197 return true;
13198 // Don't reduce load width if it would prevent us from combining a shift into
13199 // the offset.
13200 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
13201 assert(Mem);
13202 const SDValue &Base = Mem->getBasePtr();
13203 if (Base.getOpcode() == ISD::ADD &&
13204 Base.getOperand(1).getOpcode() == ISD::SHL &&
13205 Base.getOperand(1).hasOneUse() &&
13206 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
13207 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
13208 if (Mem->getMemoryVT().isScalableVector())
13209 return false;
13210 // The shift can be combined if it matches the size of the value being
13211 // loaded (and so reducing the width would make it not match).
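    // Illustrative example (a sketch): for an i64 load from (add x, (shl y, 3))
    // the shift folds into the scaled addressing form [x, y, lsl #3]; narrowing
    // the load to i32 would need lsl #2, and the fold would be lost.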
13212 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
13213 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
13214 if (ShiftAmount == Log2_32(LoadBytes))
13215 return false;
13217 // We have no reason to disallow reducing the load width, so allow it.
13218 return true;
13221 // Truncations from a 64-bit GPR to a 32-bit GPR are free.
13222 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
13223 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
13224 return false;
13225 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedSize();
13226 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedSize();
13227 return NumBits1 > NumBits2;
13229 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
13230 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
13231 return false;
13232 uint64_t NumBits1 = VT1.getFixedSizeInBits();
13233 uint64_t NumBits2 = VT2.getFixedSizeInBits();
13234 return NumBits1 > NumBits2;
13237 /// Check if it is profitable to hoist instruction in then/else to if.
13238 /// Not profitable if I and its user can form an FMA instruction
13239 /// because we prefer FMSUB/FMADD.
13240 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
13241 if (I->getOpcode() != Instruction::FMul)
13242 return true;
13244 if (!I->hasOneUse())
13245 return true;
13247 Instruction *User = I->user_back();
13249 if (!(User->getOpcode() == Instruction::FSub ||
13250 User->getOpcode() == Instruction::FAdd))
13251 return true;
13253 const TargetOptions &Options = getTargetMachine().Options;
13254 const Function *F = I->getFunction();
13255 const DataLayout &DL = F->getParent()->getDataLayout();
13256 Type *Ty = User->getOperand(0)->getType();
13258 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
13259 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
13260 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
13261 Options.UnsafeFPMath));
13264 // All 32-bit GPR operations implicitly zero the high-half of the corresponding
13265 // 64-bit GPR.
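// For example, 'add w8, w9, w10' also clears bits [63:32] of x8, so a
// subsequent zext from i32 to i64 needs no extra instruction.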
13266 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
13267 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
13268 return false;
13269 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
13270 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
13271 return NumBits1 == 32 && NumBits2 == 64;
13273 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
13274 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
13275 return false;
13276 unsigned NumBits1 = VT1.getSizeInBits();
13277 unsigned NumBits2 = VT2.getSizeInBits();
13278 return NumBits1 == 32 && NumBits2 == 64;
13281 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
13282 EVT VT1 = Val.getValueType();
13283 if (isZExtFree(VT1, VT2)) {
13284 return true;
13287 if (Val.getOpcode() != ISD::LOAD)
13288 return false;
13290 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
13291 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
13292 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
13293 VT1.getSizeInBits() <= 32);
13296 bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
13297 if (isa<FPExtInst>(Ext))
13298 return false;
13300 // Vector types are not free.
13301 if (Ext->getType()->isVectorTy())
13302 return false;
13304 for (const Use &U : Ext->uses()) {
13305 // The extension is free if we can fold it with a left shift in an
13306 // addressing mode or an arithmetic operation: add, sub, and cmp.
13308 // Is there a shift?
13309 const Instruction *Instr = cast<Instruction>(U.getUser());
13311 // Is this a constant shift?
13312 switch (Instr->getOpcode()) {
13313 case Instruction::Shl:
13314 if (!isa<ConstantInt>(Instr->getOperand(1)))
13315 return false;
13316 break;
13317 case Instruction::GetElementPtr: {
13318 gep_type_iterator GTI = gep_type_begin(Instr);
13319 auto &DL = Ext->getModule()->getDataLayout();
13320 std::advance(GTI, U.getOperandNo()-1);
13321 Type *IdxTy = GTI.getIndexedType();
13322 // This extension will end up with a shift because of the scaling factor.
13323 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
13324 // Get the shift amount based on the scaling factor:
13325 // log2(sizeof(IdxTy)) - log2(8).
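      // Illustrative example (a sketch): indexing over i32 elements gives a
      // store size of 32 bits, so ShiftAmt == ctz(32) - 3 == 2, which the
      // extended-register addressing form (e.g. sxtw #2) can absorb.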
13326 uint64_t ShiftAmt =
13327 countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3;
13328 // Is the constant foldable in the shift of the addressing mode?
13329 // I.e., shift amount is between 1 and 4 inclusive.
13330 if (ShiftAmt == 0 || ShiftAmt > 4)
13331 return false;
13332 break;
13334 case Instruction::Trunc:
13335 // Check if this is a noop.
13336 // trunc(sext ty1 to ty2) to ty1.
13337 if (Instr->getType() == Ext->getOperand(0)->getType())
13338 continue;
13339 [[fallthrough]];
13340 default:
13341 return false;
13344 // At this point we can use the bfm family, so this extension is free
13345 // for that use.
13347 return true;
13350 /// Check if both Op1 and Op2 are shufflevector extracts of either the lower
13351 /// or upper half of the vector elements.
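/// For example (a sketch), two operands of the form
///   shufflevector <16 x i8> %x, <16 x i8> undef, <8, 9, 10, 11, 12, 13, 14, 15>
/// qualify; per the mask check below, both must extract the same half (both
/// lower or both upper).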
13352 static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
13353 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
13354 auto *FullTy = FullV->getType();
13355 auto *HalfTy = HalfV->getType();
13356 return FullTy->getPrimitiveSizeInBits().getFixedSize() ==
13357 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize();
13360 auto extractHalf = [](Value *FullV, Value *HalfV) {
13361 auto *FullVT = cast<FixedVectorType>(FullV->getType());
13362 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
13363 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
13366 ArrayRef<int> M1, M2;
13367 Value *S1Op1, *S2Op1;
13368 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
13369 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
13370 return false;
13372 // Check that the operands are half as wide as the result and we extract
13373 // half of the elements of the input vectors.
13374 if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) ||
13375 !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
13376 return false;
13378 // Check the mask extracts either the lower or upper half of vector
13379 // elements.
13380 int M1Start = -1;
13381 int M2Start = -1;
13382 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
13383 if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
13384 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
13385 M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
13386 return false;
13388 return true;
13391 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
13392 /// of the vector elements.
13393 static bool areExtractExts(Value *Ext1, Value *Ext2) {
13394 auto areExtDoubled = [](Instruction *Ext) {
13395 return Ext->getType()->getScalarSizeInBits() ==
13396 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
13399 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
13400 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
13401 !areExtDoubled(cast<Instruction>(Ext1)) ||
13402 !areExtDoubled(cast<Instruction>(Ext2)))
13403 return false;
13405 return true;
13408 /// Check if Op could be used with vmull_high_p64 intrinsic.
13409 static bool isOperandOfVmullHighP64(Value *Op) {
13410 Value *VectorOperand = nullptr;
13411 ConstantInt *ElementIndex = nullptr;
13412 return match(Op, m_ExtractElt(m_Value(VectorOperand),
13413 m_ConstantInt(ElementIndex))) &&
13414 ElementIndex->getValue() == 1 &&
13415 isa<FixedVectorType>(VectorOperand->getType()) &&
13416 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
13419 /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
13420 static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
13421 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
13424 static bool isSplatShuffle(Value *V) {
13425 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
13426 return all_equal(Shuf->getShuffleMask());
13427 return false;
13430 /// Check if sinking \p I's operands to I's basic block is profitable, because
13431 /// the operands can be folded into a target instruction, e.g.
13432 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
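/// For example (a sketch), if both operands of a vector sub are zexts of
/// upper-half extract shuffles defined in another block, sinking the shuffles
/// and extends next to the sub lets a single usubl2 be selected.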
13433 bool AArch64TargetLowering::shouldSinkOperands(
13434 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
13435 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
13436 switch (II->getIntrinsicID()) {
13437 case Intrinsic::aarch64_neon_smull:
13438 case Intrinsic::aarch64_neon_umull:
13439 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) {
13440 Ops.push_back(&II->getOperandUse(0));
13441 Ops.push_back(&II->getOperandUse(1));
13442 return true;
13444 [[fallthrough]];
13446 case Intrinsic::fma:
13447 if (isa<VectorType>(I->getType()) &&
13448 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
13449 !Subtarget->hasFullFP16())
13450 return false;
13451 [[fallthrough]];
13452 case Intrinsic::aarch64_neon_sqdmull:
13453 case Intrinsic::aarch64_neon_sqdmulh:
13454 case Intrinsic::aarch64_neon_sqrdmulh:
13455 // Sink splats for index lane variants
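      // A sketch of the intent: for sqdmulh(%x, splat of %y's lane 0), sinking
      // the splat shuffle next to the call lets instruction selection pick the
      // by-element form, e.g. 'sqdmulh v0.4s, v1.4s, v2.s[0]'.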
13456 if (isSplatShuffle(II->getOperand(0)))
13457 Ops.push_back(&II->getOperandUse(0));
13458 if (isSplatShuffle(II->getOperand(1)))
13459 Ops.push_back(&II->getOperandUse(1));
13460 return !Ops.empty();
13461 case Intrinsic::aarch64_sve_ptest_first:
13462 case Intrinsic::aarch64_sve_ptest_last:
13463 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
13464 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
13465 Ops.push_back(&II->getOperandUse(0));
13466 return !Ops.empty();
13467 case Intrinsic::aarch64_sme_write_horiz:
13468 case Intrinsic::aarch64_sme_write_vert:
13469 case Intrinsic::aarch64_sme_writeq_horiz:
13470 case Intrinsic::aarch64_sme_writeq_vert: {
13471 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
13472 if (!Idx || Idx->getOpcode() != Instruction::Add)
13473 return false;
13474 Ops.push_back(&II->getOperandUse(1));
13475 return true;
13477 case Intrinsic::aarch64_sme_read_horiz:
13478 case Intrinsic::aarch64_sme_read_vert:
13479 case Intrinsic::aarch64_sme_readq_horiz:
13480 case Intrinsic::aarch64_sme_readq_vert:
13481 case Intrinsic::aarch64_sme_ld1b_vert:
13482 case Intrinsic::aarch64_sme_ld1h_vert:
13483 case Intrinsic::aarch64_sme_ld1w_vert:
13484 case Intrinsic::aarch64_sme_ld1d_vert:
13485 case Intrinsic::aarch64_sme_ld1q_vert:
13486 case Intrinsic::aarch64_sme_st1b_vert:
13487 case Intrinsic::aarch64_sme_st1h_vert:
13488 case Intrinsic::aarch64_sme_st1w_vert:
13489 case Intrinsic::aarch64_sme_st1d_vert:
13490 case Intrinsic::aarch64_sme_st1q_vert:
13491 case Intrinsic::aarch64_sme_ld1b_horiz:
13492 case Intrinsic::aarch64_sme_ld1h_horiz:
13493 case Intrinsic::aarch64_sme_ld1w_horiz:
13494 case Intrinsic::aarch64_sme_ld1d_horiz:
13495 case Intrinsic::aarch64_sme_ld1q_horiz:
13496 case Intrinsic::aarch64_sme_st1b_horiz:
13497 case Intrinsic::aarch64_sme_st1h_horiz:
13498 case Intrinsic::aarch64_sme_st1w_horiz:
13499 case Intrinsic::aarch64_sme_st1d_horiz:
13500 case Intrinsic::aarch64_sme_st1q_horiz: {
13501 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
13502 if (!Idx || Idx->getOpcode() != Instruction::Add)
13503 return false;
13504 Ops.push_back(&II->getOperandUse(3));
13505 return true;
13507 case Intrinsic::aarch64_neon_pmull:
13508 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
13509 return false;
13510 Ops.push_back(&II->getOperandUse(0));
13511 Ops.push_back(&II->getOperandUse(1));
13512 return true;
13513 case Intrinsic::aarch64_neon_pmull64:
13514 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
13515 II->getArgOperand(1)))
13516 return false;
13517 Ops.push_back(&II->getArgOperandUse(0));
13518 Ops.push_back(&II->getArgOperandUse(1));
13519 return true;
13520 default:
13521 return false;
13525 if (!I->getType()->isVectorTy())
13526 return false;
13528 switch (I->getOpcode()) {
13529 case Instruction::Sub:
13530 case Instruction::Add: {
13531 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
13532 return false;
13534 // If the exts' operands extract either the lower or upper elements, we
13535 // can sink them too.
13536 auto Ext1 = cast<Instruction>(I->getOperand(0));
13537 auto Ext2 = cast<Instruction>(I->getOperand(1));
13538 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
13539 Ops.push_back(&Ext1->getOperandUse(0));
13540 Ops.push_back(&Ext2->getOperandUse(0));
13543 Ops.push_back(&I->getOperandUse(0));
13544 Ops.push_back(&I->getOperandUse(1));
13546 return true;
13548 case Instruction::Mul: {
13549 bool IsProfitable = false;
13550 for (auto &Op : I->operands()) {
13551 // Make sure we are not already sinking this operand
13552 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
13553 continue;
13555 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
13557 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
13558 // operand and the s/zext can help create indexed s/umull. This is
13559 // especially useful to prevent i64 mul being scalarized.
13560 if (Shuffle && isSplatShuffle(Shuffle) &&
13561 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
13562 Ops.push_back(&Shuffle->getOperandUse(0));
13563 Ops.push_back(&Op);
13564 IsProfitable = true;
13565 continue;
13568 if (!Shuffle || !Shuffle->isZeroEltSplat())
13569 continue;
13571 Value *ShuffleOperand = Shuffle->getOperand(0);
13572 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
13573 if (!Insert)
13574 continue;
13576 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
13577 if (!OperandInstr)
13578 continue;
13580 ConstantInt *ElementConstant =
13581 dyn_cast<ConstantInt>(Insert->getOperand(2));
13582 // Check that the insertelement is inserting into element 0
13583 if (!ElementConstant || ElementConstant->getZExtValue() != 0)
13584 continue;
13586 unsigned Opcode = OperandInstr->getOpcode();
13587 if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt)
13588 continue;
13590 Ops.push_back(&Shuffle->getOperandUse(0));
13591 Ops.push_back(&Op);
13592 IsProfitable = true;
13595 return IsProfitable;
13597 default:
13598 return false;
13600 return false;
13603 static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) {
13604 Value *Op = ZExt->getOperand(0);
13605 auto *SrcTy = dyn_cast<FixedVectorType>(Op->getType());
13606 auto *DstTy = dyn_cast<FixedVectorType>(ZExt->getType());
13607 unsigned NumElts = SrcTy->getNumElements();
13608 IRBuilder<> Builder(ZExt);
13609 SmallVector<int> Mask(4 * NumElts, NumElts);
13610 // Create a mask that selects <0,0,0,Op[i]> for each lane of vector of i32 to
13611 // replace the original ZExt. This can later be lowered to a set of tbl
13612 // instructions.
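  // For example (a sketch), with NumElts == 8 on little-endian the mask is
  //   <0, 8, 8, 8, 1, 8, 8, 8, ..., 7, 8, 8, 8>
  // where index 8 selects the zero byte inserted into the second shuffle
  // operand below.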
13613 for (unsigned i = 0; i < NumElts; i++) {
13614 if (IsLittleEndian)
13615 Mask[i * 4] = i;
13616 else
13617 Mask[i * 4 + 3] = i;
13620 auto *FirstEltZero = Builder.CreateInsertElement(
13621 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
13622 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
13623 Result = Builder.CreateBitCast(Result, DstTy);
13624 ZExt->replaceAllUsesWith(Result);
13625 ZExt->eraseFromParent();
13628 static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
13629 IRBuilder<> Builder(TI);
13630 SmallVector<Value *> Parts;
13631 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
13632 Parts.push_back(Builder.CreateBitCast(
13633 Builder.CreateShuffleVector(TI->getOperand(0), {0, 1, 2, 3}), VecTy));
13634 Parts.push_back(Builder.CreateBitCast(
13635 Builder.CreateShuffleVector(TI->getOperand(0), {4, 5, 6, 7}), VecTy));
13637 Intrinsic::ID TblID = Intrinsic::aarch64_neon_tbl2;
13638 unsigned NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
13639 if (NumElements == 16) {
13640 Parts.push_back(Builder.CreateBitCast(
13641 Builder.CreateShuffleVector(TI->getOperand(0), {8, 9, 10, 11}), VecTy));
13642 Parts.push_back(Builder.CreateBitCast(
13643 Builder.CreateShuffleVector(TI->getOperand(0), {12, 13, 14, 15}),
13644 VecTy));
13645 TblID = Intrinsic::aarch64_neon_tbl4;
13647 SmallVector<Constant *, 16> MaskConst;
13648 for (unsigned Idx = 0; Idx < NumElements * 4; Idx += 4)
13649 MaskConst.push_back(
13650 ConstantInt::get(Builder.getInt8Ty(), IsLittleEndian ? Idx : Idx + 3));
13652 for (unsigned Idx = NumElements * 4; Idx < 64; Idx += 4)
13653 MaskConst.push_back(ConstantInt::get(Builder.getInt8Ty(), 255));
13655 Parts.push_back(ConstantVector::get(MaskConst));
13656 auto *F =
13657 Intrinsic::getDeclaration(TI->getModule(), TblID, Parts[0]->getType());
13658 Value *Res = Builder.CreateCall(F, Parts);
13660 if (NumElements == 8)
13661 Res = Builder.CreateShuffleVector(Res, {0, 1, 2, 3, 4, 5, 6, 7});
13662 TI->replaceAllUsesWith(Res);
13663 TI->eraseFromParent();
13666 bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
13667 Loop *L) const {
13668 // Try to optimize conversions using tbl. This requires materializing constant
13669 // index vectors, which can increase code size and add loads. Skip the
13670 // transform unless the conversion is in a loop block guaranteed to execute
13671 // and we are not optimizing for size.
13672 Function *F = I->getParent()->getParent();
13673 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
13674 F->hasOptSize())
13675 return false;
13677 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
13678 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
13679 if (!SrcTy || !DstTy)
13680 return false;
13682 // Convert 'zext <(8|16) x i8> %x to <(8|16) x i32>' to a shuffle that can be
13683 // lowered to either 2 or 4 tbl instructions to insert the original i8
13684 // elements into i32 lanes.
13685 auto *ZExt = dyn_cast<ZExtInst>(I);
13686 if (ZExt && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
13687 SrcTy->getElementType()->isIntegerTy(8) &&
13688 DstTy->getElementType()->isIntegerTy(32)) {
13689 createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
13690 return true;
13693 auto *UIToFP = dyn_cast<UIToFPInst>(I);
13694 if (UIToFP &&
13695 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
13696 SrcTy->getElementType()->isIntegerTy(8) &&
13697 DstTy->getElementType()->isFloatTy()) {
13698 IRBuilder<> Builder(I);
13699 auto *ZExt = cast<ZExtInst>(
13700 Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
13701 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
13702 I->replaceAllUsesWith(UI);
13703 I->eraseFromParent();
13704 createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
13705 return true;
13708 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
13709 // followed by a truncate lowered to using tbl.4.
13710 auto *FPToUI = dyn_cast<FPToUIInst>(I);
13711 if (FPToUI &&
13712 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
13713 SrcTy->getElementType()->isFloatTy() &&
13714 DstTy->getElementType()->isIntegerTy(8)) {
13715 IRBuilder<> Builder(I);
13716 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
13717 VectorType::getInteger(SrcTy));
13718 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
13719 I->replaceAllUsesWith(TruncI);
13720 I->eraseFromParent();
13721 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
13722 return true;
13725 // Convert 'trunc <(8|16) x i32> %x to <(8|16) x i8>' to a single tbl.4
13726 // instruction selecting the lowest 8 bits per lane of the input interpreted
13727 // as 2 or 4 <4 x i32> vectors.
13728 auto *TI = dyn_cast<TruncInst>(I);
13729 if (TI && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
13730 SrcTy->getElementType()->isIntegerTy(32) &&
13731 DstTy->getElementType()->isIntegerTy(8)) {
13732 createTblForTrunc(TI, Subtarget->isLittleEndian());
13733 return true;
13736 return false;
13739 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
13740                                           Align &RequiredAlignment) const {
13741 if (!LoadedType.isSimple() ||
13742 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
13743 return false;
13744 // Cyclone supports unaligned accesses.
13745   RequiredAlignment = Align(1);
13746 unsigned NumBits = LoadedType.getSizeInBits();
13747 return NumBits == 32 || NumBits == 64;
13750 /// A helper function for determining the number of interleaved accesses we
13751 /// will generate when lowering accesses of the given type.
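/// For example (a sketch), a fixed-length <16 x i32> is 512 bits, so with NEON
/// (128-bit accesses) this returns (512 + 127) / 128 == 4.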
13752 unsigned AArch64TargetLowering::getNumInterleavedAccesses(
13753 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
13754 unsigned VecSize = UseScalable ? Subtarget->getMinSVEVectorSizeInBits() : 128;
13755 return std::max<unsigned>(1, (DL.getTypeSizeInBits(VecTy) + 127) / VecSize);
13758 MachineMemOperand::Flags
13759 AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
13760 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
13761 I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
13762 return MOStridedAccess;
13763 return MachineMemOperand::MONone;
13766 bool AArch64TargetLowering::isLegalInterleavedAccessType(
13767 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
13769 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
13770 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
13771 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
13773 UseScalable = false;
13775 // Ensure the number of vector elements is greater than 1.
13776 if (NumElements < 2)
13777 return false;
13779 // Ensure the element type is legal.
13780 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
13781 return false;
13783 if (Subtarget->useSVEForFixedLengthVectors() &&
13784 (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
13785 (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
13786 isPowerOf2_32(NumElements) && VecSize > 128))) {
13787 UseScalable = true;
13788 return true;
13791 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
13792 // 128 will be split into multiple interleaved accesses.
13793 return VecSize == 64 || VecSize % 128 == 0;
13796 static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
13797 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
13798 return ScalableVectorType::get(VTy->getElementType(), 2);
13800 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
13801 return ScalableVectorType::get(VTy->getElementType(), 4);
13803 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
13804 return ScalableVectorType::get(VTy->getElementType(), 8);
13806 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
13807 return ScalableVectorType::get(VTy->getElementType(), 8);
13809 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
13810 return ScalableVectorType::get(VTy->getElementType(), 2);
13812 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
13813 return ScalableVectorType::get(VTy->getElementType(), 4);
13815 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
13816 return ScalableVectorType::get(VTy->getElementType(), 8);
13818 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
13819 return ScalableVectorType::get(VTy->getElementType(), 16);
13821 llvm_unreachable("Cannot handle input vector type");
13824 /// Lower an interleaved load into a ldN intrinsic.
13826 /// E.g. Lower an interleaved load (Factor = 2):
13827 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
13828 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
13829 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
13831 /// Into:
13832 /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
13833 ///        %vec0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 0
13834 ///        %vec1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 1
13835 bool AArch64TargetLowering::lowerInterleavedLoad(
13836 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
13837 ArrayRef<unsigned> Indices, unsigned Factor) const {
13838 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
13839 "Invalid interleave factor");
13840 assert(!Shuffles.empty() && "Empty shufflevector input");
13841 assert(Shuffles.size() == Indices.size() &&
13842 "Unmatched number of shufflevectors and indices");
13844 const DataLayout &DL = LI->getModule()->getDataLayout();
13846 VectorType *VTy = Shuffles[0]->getType();
13848   // Skip if we do not have NEON or if the vector type is illegal. We can
13849 // "legalize" wide vector types into multiple interleaved accesses as long as
13850 // the vector types are divisible by 128.
13851 bool UseScalable;
13852 if (!Subtarget->hasNEON() ||
13853 !isLegalInterleavedAccessType(VTy, DL, UseScalable))
13854 return false;
13856 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
13858 auto *FVTy = cast<FixedVectorType>(VTy);
13860 // A pointer vector can not be the return type of the ldN intrinsics. Need to
13861 // load integer vectors first and then convert to pointer vectors.
13862 Type *EltTy = FVTy->getElementType();
13863 if (EltTy->isPointerTy())
13864 FVTy =
13865 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
13867 // If we're going to generate more than one load, reset the sub-vector type
13868 // to something legal.
13869 FVTy = FixedVectorType::get(FVTy->getElementType(),
13870 FVTy->getNumElements() / NumLoads);
13872 auto *LDVTy =
13873 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
13875 IRBuilder<> Builder(LI);
13877 // The base address of the load.
13878 Value *BaseAddr = LI->getPointerOperand();
13880 if (NumLoads > 1) {
13881 // We will compute the pointer operand of each load from the original base
13882 // address using GEPs. Cast the base address to a pointer to the scalar
13883 // element type.
13884 BaseAddr = Builder.CreateBitCast(
13885 BaseAddr,
13886 LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
13889 Type *PtrTy =
13890 UseScalable
13891 ? LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())
13892 : LDVTy->getPointerTo(LI->getPointerAddressSpace());
13893 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
13894 LDVTy->getElementCount());
13896 static const Intrinsic::ID SVELoadIntrs[3] = {
13897 Intrinsic::aarch64_sve_ld2_sret, Intrinsic::aarch64_sve_ld3_sret,
13898 Intrinsic::aarch64_sve_ld4_sret};
13899 static const Intrinsic::ID NEONLoadIntrs[3] = {Intrinsic::aarch64_neon_ld2,
13900 Intrinsic::aarch64_neon_ld3,
13901 Intrinsic::aarch64_neon_ld4};
13902 Function *LdNFunc;
13903 if (UseScalable)
13904 LdNFunc = Intrinsic::getDeclaration(LI->getModule(),
13905 SVELoadIntrs[Factor - 2], {LDVTy});
13906 else
13907 LdNFunc = Intrinsic::getDeclaration(
13908 LI->getModule(), NEONLoadIntrs[Factor - 2], {LDVTy, PtrTy});
13910 // Holds sub-vectors extracted from the load intrinsic return values. The
13911 // sub-vectors are associated with the shufflevector instructions they will
13912 // replace.
13913 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
13915 Value *PTrue = nullptr;
13916 if (UseScalable) {
13917 Optional<unsigned> PgPattern =
13918 getSVEPredPatternFromNumElements(FVTy->getNumElements());
13919 if (Subtarget->getMinSVEVectorSizeInBits() ==
13920 Subtarget->getMaxSVEVectorSizeInBits() &&
13921 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
13922 PgPattern = AArch64SVEPredPattern::all;
13924 auto *PTruePat =
13925 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
13926 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
13927 {PTruePat});
13930 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
13932 // If we're generating more than one load, compute the base address of
13933 // subsequent loads as an offset from the previous.
13934 if (LoadCount > 0)
13935 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
13936 FVTy->getNumElements() * Factor);
13938 CallInst *LdN;
13939 if (UseScalable)
13940 LdN = Builder.CreateCall(
13941 LdNFunc, {PTrue, Builder.CreateBitCast(BaseAddr, PtrTy)}, "ldN");
13942 else
13943 LdN = Builder.CreateCall(LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy),
13944 "ldN");
13946 // Extract and store the sub-vectors returned by the load intrinsic.
13947 for (unsigned i = 0; i < Shuffles.size(); i++) {
13948 ShuffleVectorInst *SVI = Shuffles[i];
13949 unsigned Index = Indices[i];
13951 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
13953 if (UseScalable)
13954 SubVec = Builder.CreateExtractVector(
13955 FVTy, SubVec,
13956 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
13958 // Convert the integer vector to pointer vector if the element is pointer.
13959 if (EltTy->isPointerTy())
13960 SubVec = Builder.CreateIntToPtr(
13961 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
13962 FVTy->getNumElements()));
13964 SubVecs[SVI].push_back(SubVec);
13968 // Replace uses of the shufflevector instructions with the sub-vectors
13969 // returned by the load intrinsic. If a shufflevector instruction is
13970 // associated with more than one sub-vector, those sub-vectors will be
13971 // concatenated into a single wide vector.
13972 for (ShuffleVectorInst *SVI : Shuffles) {
13973 auto &SubVec = SubVecs[SVI];
13974 auto *WideVec =
13975 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
13976 SVI->replaceAllUsesWith(WideVec);
13979 return true;
13982 /// Lower an interleaved store into a stN intrinsic.
13984 /// E.g. Lower an interleaved store (Factor = 3):
13985 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
13986 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
13987 /// store <12 x i32> %i.vec, <12 x i32>* %ptr
13989 /// Into:
13990 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
13991 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
13992 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
13993 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
13995 /// Note that the new shufflevectors will be removed and we'll only generate one
13996 /// st3 instruction in CodeGen.
13998 /// Example for a more general valid mask (Factor 3). Lower:
13999 /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
14000 /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
14001 /// store <12 x i32> %i.vec, <12 x i32>* %ptr
14003 /// Into:
14004 /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
14005 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
14006 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
14007 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
14008 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
14009 ShuffleVectorInst *SVI,
14010 unsigned Factor) const {
14011 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
14012 "Invalid interleave factor");
14014 auto *VecTy = cast<FixedVectorType>(SVI->getType());
14015 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
14017 unsigned LaneLen = VecTy->getNumElements() / Factor;
14018 Type *EltTy = VecTy->getElementType();
14019 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
14021 const DataLayout &DL = SI->getModule()->getDataLayout();
14022 bool UseScalable;
14024   // Skip if we do not have NEON or if the vector type is illegal. We can
14025 // "legalize" wide vector types into multiple interleaved accesses as long as
14026 // the vector types are divisible by 128.
14027 if (!Subtarget->hasNEON() ||
14028 !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
14029 return false;
14031 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
14033 Value *Op0 = SVI->getOperand(0);
14034 Value *Op1 = SVI->getOperand(1);
14035 IRBuilder<> Builder(SI);
14037 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
14038 // vectors to integer vectors.
14039 if (EltTy->isPointerTy()) {
14040 Type *IntTy = DL.getIntPtrType(EltTy);
14041 unsigned NumOpElts =
14042 cast<FixedVectorType>(Op0->getType())->getNumElements();
14044 // Convert to the corresponding integer vector.
14045 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
14046 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
14047 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
14049 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
14052 // If we're going to generate more than one store, reset the lane length
14053 // and sub-vector type to something legal.
14054 LaneLen /= NumStores;
14055 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
14057 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
14058 : SubVecTy;
14060 // The base address of the store.
14061 Value *BaseAddr = SI->getPointerOperand();
14063 if (NumStores > 1) {
14064 // We will compute the pointer operand of each store from the original base
14065 // address using GEPs. Cast the base address to a pointer to the scalar
14066 // element type.
14067 BaseAddr = Builder.CreateBitCast(
14068 BaseAddr,
14069 SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
14072 auto Mask = SVI->getShuffleMask();
14074   // Bail out if none of the mask indices are in range.
14075   // If the mask is fully `undef` or `poison`, `Mask` is a vector of -1s, and
14076   // picking a start index from it below would read out of bounds.
14077 if (llvm::all_of(Mask, [](int Idx) { return Idx == UndefMaskElem; })) {
14078 return false;
14081 Type *PtrTy =
14082 UseScalable
14083 ? STVTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())
14084 : STVTy->getPointerTo(SI->getPointerAddressSpace());
14085 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
14086 STVTy->getElementCount());
14088 static const Intrinsic::ID SVEStoreIntrs[3] = {Intrinsic::aarch64_sve_st2,
14089 Intrinsic::aarch64_sve_st3,
14090 Intrinsic::aarch64_sve_st4};
14091 static const Intrinsic::ID NEONStoreIntrs[3] = {Intrinsic::aarch64_neon_st2,
14092 Intrinsic::aarch64_neon_st3,
14093 Intrinsic::aarch64_neon_st4};
14094 Function *StNFunc;
14095 if (UseScalable)
14096 StNFunc = Intrinsic::getDeclaration(SI->getModule(),
14097 SVEStoreIntrs[Factor - 2], {STVTy});
14098 else
14099 StNFunc = Intrinsic::getDeclaration(
14100 SI->getModule(), NEONStoreIntrs[Factor - 2], {STVTy, PtrTy});
14102 Value *PTrue = nullptr;
14103 if (UseScalable) {
14104 Optional<unsigned> PgPattern =
14105 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
14106 if (Subtarget->getMinSVEVectorSizeInBits() ==
14107 Subtarget->getMaxSVEVectorSizeInBits() &&
14108 Subtarget->getMinSVEVectorSizeInBits() ==
14109 DL.getTypeSizeInBits(SubVecTy))
14110 PgPattern = AArch64SVEPredPattern::all;
14112 auto *PTruePat =
14113 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
14114 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
14115 {PTruePat});
14118 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
14120 SmallVector<Value *, 5> Ops;
14122 // Split the shufflevector operands into sub vectors for the new stN call.
14123 for (unsigned i = 0; i < Factor; i++) {
14124 Value *Shuffle;
14125 unsigned IdxI = StoreCount * LaneLen * Factor + i;
14126 if (Mask[IdxI] >= 0) {
14127 Shuffle = Builder.CreateShuffleVector(
14128 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
14129 } else {
14130 unsigned StartMask = 0;
14131 for (unsigned j = 1; j < LaneLen; j++) {
14132 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
14133 if (Mask[IdxJ] >= 0) {
14134 StartMask = Mask[IdxJ] - j;
14135 break;
14138         // Note: Filling undef gaps with arbitrary elements is OK, since
14139         // those elements were being written anyway (as undefs).
14140         // In the case of all undefs we default to using elements from 0.
14141         // Note: StartMask cannot be negative; that is checked in
14142         // isReInterleaveMask.
14143 Shuffle = Builder.CreateShuffleVector(
14144 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
14147 if (UseScalable)
14148 Shuffle = Builder.CreateInsertVector(
14149 STVTy, UndefValue::get(STVTy), Shuffle,
14150 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
14152 Ops.push_back(Shuffle);
14155 if (UseScalable)
14156 Ops.push_back(PTrue);
14158     // If we're generating more than one store, compute the base address of
14159     // subsequent stores as an offset from the previous one.
14160 if (StoreCount > 0)
14161 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
14162 BaseAddr, LaneLen * Factor);
14164 Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
14165 Builder.CreateCall(StNFunc, Ops);
14167 return true;
14170 EVT AArch64TargetLowering::getOptimalMemOpType(
14171 const MemOp &Op, const AttributeList &FuncAttributes) const {
14172 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
14173 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
14174 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
14175   // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
14176   // it would take one instruction to materialize the v2i64 zero plus one store
14177   // (with a restrictive addressing mode), so plain i64 stores are preferred.
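  // Illustrative expectation (a sketch, not a guaranteed lowering): a 16-byte
  // zero memset becomes two 'str xzr' stores rather than 'movi v0.2d, #0'
  // followed by 'str q0'.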
14178 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
14179 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
14180 if (Op.isAligned(AlignCheck))
14181 return true;
14182 bool Fast;
14183 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
14184 MachineMemOperand::MONone, &Fast) &&
14185 Fast;
14188 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
14189 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
14190 return MVT::v16i8;
14191 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
14192 return MVT::f128;
14193 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
14194 return MVT::i64;
14195 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
14196 return MVT::i32;
14197 return MVT::Other;
14200 LLT AArch64TargetLowering::getOptimalMemOpLLT(
14201 const MemOp &Op, const AttributeList &FuncAttributes) const {
14202 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
14203 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
14204 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
14205   // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
14206   // it would take one instruction to materialize the v2i64 zero plus one store
14207   // (with a restrictive addressing mode), so plain i64 stores are preferred.
14208 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
14209 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
14210 if (Op.isAligned(AlignCheck))
14211 return true;
14212 bool Fast;
14213 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
14214 MachineMemOperand::MONone, &Fast) &&
14215 Fast;
14218 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
14219 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
14220 return LLT::fixed_vector(2, 64);
14221 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
14222 return LLT::scalar(128);
14223 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
14224 return LLT::scalar(64);
14225 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
14226 return LLT::scalar(32);
14227 return LLT();
14230 // 12-bit optionally shifted immediates are legal for adds.
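// For example, 4095 (0xfff) and 0xfff000 (4095 << 12) are both encodable,
// while 4097 is not: it needs more than 12 bits and is not a multiple of
// 0x1000 that fits the shifted form.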
14231 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
14232 if (Immed == std::numeric_limits<int64_t>::min()) {
14233 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
14234 << ": avoid UB for INT64_MIN\n");
14235 return false;
14237 // Same encoding for add/sub, just flip the sign.
14238 Immed = std::abs(Immed);
14239 bool IsLegal = ((Immed >> 12) == 0 ||
14240 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
14241 LLVM_DEBUG(dbgs() << "Is " << Immed
14242 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
14243 return IsLegal;
14246 // Return false to prevent folding
14247 // (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
14248 // if the folding leads to worse code.
14249 bool AArch64TargetLowering::isMulAddWithConstProfitable(
14250 SDValue AddNode, SDValue ConstNode) const {
14251 // Let the DAGCombiner decide for vector types and large types.
14252 const EVT VT = AddNode.getValueType();
14253 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
14254 return true;
14256   // It is worse if c1 is a legal add immediate while c1*c2 is not, and c1*c2
14257   // has to be materialized with at least two instructions.
14258 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
14259 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
14260 const int64_t C1 = C1Node->getSExtValue();
14261 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
14262 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
14263 return true;
14264 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
14265 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), VT.getSizeInBits(), Insn);
14266 if (Insn.size() > 1)
14267 return false;
14269 // Default to true and let the DAGCombiner decide.
14270 return true;
14273 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid
14274 // immediates is the same as for an add or a sub.
14275 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
14276 return isLegalAddImmediate(Immed);
14279 /// isLegalAddressingMode - Return true if the addressing mode represented
14280 /// by AM is legal for this target, for a load/store of the specified type.
14281 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
14282 const AddrMode &AM, Type *Ty,
14283 unsigned AS, Instruction *I) const {
14284 // AArch64 has five basic addressing modes:
14285 // reg
14286 // reg + 9-bit signed offset
14287 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
14288 // reg1 + reg2
14289 // reg + SIZE_IN_BYTES * reg
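  //
  // Illustrative examples (a sketch): for an i64 load, 'ldr x0, [x1, #32760]'
  // uses the scaled 12-bit unsigned-offset form (4095 * 8), and
  // 'ldr x0, [x1, x2, lsl #3]' uses the reg + SIZE_IN_BYTES * reg form.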
14291 // No global is ever allowed as a base.
14292 if (AM.BaseGV)
14293 return false;
14295 // No reg+reg+imm addressing.
14296 if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
14297 return false;
14299 // FIXME: Update this method to support scalable addressing modes.
14300 if (isa<ScalableVectorType>(Ty)) {
14301 uint64_t VecElemNumBytes =
14302 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
14303 return AM.HasBaseReg && !AM.BaseOffs &&
14304 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
14307 // check reg + imm case:
14308 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
14309 uint64_t NumBytes = 0;
14310 if (Ty->isSized()) {
14311 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
14312 NumBytes = NumBits / 8;
14313 if (!isPowerOf2_64(NumBits))
14314 NumBytes = 0;
14317 if (!AM.Scale) {
14318 int64_t Offset = AM.BaseOffs;
14320 // 9-bit signed offset
14321 if (isInt<9>(Offset))
14322 return true;
14324 // 12-bit unsigned offset
14325 unsigned shift = Log2_64(NumBytes);
14326 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
14327 // Must be a multiple of NumBytes (NumBytes is a power of 2)
14328 (Offset >> shift) << shift == Offset)
14329 return true;
14330 return false;
14333 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
14335 return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
14338 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
14339 // Consider splitting large offset of struct or array.
14340 return true;
14343 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
14344 const MachineFunction &MF, EVT VT) const {
14345 VT = VT.getScalarType();
14347 if (!VT.isSimple())
14348 return false;
14350 switch (VT.getSimpleVT().SimpleTy) {
14351 case MVT::f16:
14352 return Subtarget->hasFullFP16();
14353 case MVT::f32:
14354 case MVT::f64:
14355 return true;
14356 default:
14357 break;
14360 return false;
14363 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
14364 Type *Ty) const {
14365 switch (Ty->getScalarType()->getTypeID()) {
14366 case Type::FloatTyID:
14367 case Type::DoubleTyID:
14368 return true;
14369 default:
14370 return false;
14374 bool AArch64TargetLowering::generateFMAsInMachineCombiner(
14375 EVT VT, CodeGenOpt::Level OptLevel) const {
14376 return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector() &&
14377 !useSVEForFixedLengthVectorVT(VT);
14380 const MCPhysReg *
14381 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
14382 // LR is a callee-save register, but we must treat it as clobbered by any call
14383 // site. Hence we include LR in the scratch registers, which are in turn added
14384 // as implicit-defs for stackmaps and patchpoints.
14385 static const MCPhysReg ScratchRegs[] = {
14386 AArch64::X16, AArch64::X17, AArch64::LR, 0
14388 return ScratchRegs;
14391 bool
14392 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
14393 CombineLevel Level) const {
14394 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
14395 N->getOpcode() == ISD::SRL) &&
14396 "Expected shift op");
14398 SDValue ShiftLHS = N->getOperand(0);
14399 EVT VT = N->getValueType(0);
14401 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not combine
14402 // it with shift 'N' to let it be lowered to UBFX.
14403 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
14404 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
14405 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
14406 if (isMask_64(TruncMask) &&
14407 ShiftLHS.getOperand(0).getOpcode() == ISD::SRL &&
14408 isa<ConstantSDNode>(ShiftLHS.getOperand(0).getOperand(1)))
14409 return false;
14411 return true;
14414 bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
14415 const SDNode *N) const {
14416 assert(N->getOpcode() == ISD::XOR &&
14417 (N->getOperand(0).getOpcode() == ISD::SHL ||
14418 N->getOperand(0).getOpcode() == ISD::SRL) &&
14419 "Expected XOR(SHIFT) pattern");
14421 // Only commute if the entire NOT mask is a hidden shifted mask.
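  // Worked example (a sketch): for an i32 xor(shl(x, 8), 0xFFFFFF00), the NOT
  // mask is a shifted mask with MaskIdx == 8 and MaskLen == 24 == 32 - 8, so
  // the xor can be commuted with the shl.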
14422 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
14423 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
14424 if (XorC && ShiftC) {
14425 unsigned MaskIdx, MaskLen;
14426 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
14427 unsigned ShiftAmt = ShiftC->getZExtValue();
14428 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
14429 if (N->getOperand(0).getOpcode() == ISD::SHL)
14430 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
14431 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
14435 return false;
14438 bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
14439 const SDNode *N, CombineLevel Level) const {
14440 assert(((N->getOpcode() == ISD::SHL &&
14441 N->getOperand(0).getOpcode() == ISD::SRL) ||
14442 (N->getOpcode() == ISD::SRL &&
14443 N->getOperand(0).getOpcode() == ISD::SHL)) &&
14444 "Expected shift-shift mask");
14445 // Don't allow multiuse shift folding with the same shift amount.
14446 if (!N->getOperand(0)->hasOneUse())
14447 return false;
14449   // Only fold srl(shl(x,c1),c2) iff c1 >= c2, to prevent loss of UBFX patterns.
14450 EVT VT = N->getValueType(0);
14451 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
14452 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
14453 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
14454 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
14457 return true;
14460 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
14461 Type *Ty) const {
14462 assert(Ty->isIntegerTy());
14464 unsigned BitSize = Ty->getPrimitiveSizeInBits();
14465 if (BitSize == 0)
14466 return false;
14468 int64_t Val = Imm.getSExtValue();
14469 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
14470 return true;
14472 if ((int64_t)Val < 0)
14473 Val = ~Val;
14474 if (BitSize == 32)
14475 Val &= (1LL << 32) - 1;
14477 unsigned LZ = countLeadingZeros((uint64_t)Val);
14478 unsigned Shift = (63 - LZ) / 16;
14479 // MOVZ is free so return true for one or fewer MOVK.
14480 return Shift < 3;
14483 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
14484 unsigned Index) const {
14485 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
14486 return false;
14488 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
14491 /// Turn vector tests of the signbit in the form of:
14492 /// xor (sra X, elt_size(X)-1), -1
14493 /// into:
14494 /// cmge X, X, #0
14495 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
14496 const AArch64Subtarget *Subtarget) {
14497 EVT VT = N->getValueType(0);
14498 if (!Subtarget->hasNEON() || !VT.isVector())
14499 return SDValue();
14501   // There must be an arithmetic shift right before the xor, and the xor must be
14502   // a 'not' operation.
14503 SDValue Shift = N->getOperand(0);
14504 SDValue Ones = N->getOperand(1);
14505 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
14506 !ISD::isBuildVectorAllOnes(Ones.getNode()))
14507 return SDValue();
14509 // The shift should be smearing the sign bit across each vector element.
14510 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
14511 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
14512 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
14513 return SDValue();
14515 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
14518 // Given a vecreduce_add node, detect the pattern below and convert it to the
14519 // node sequence with UABDL, [S|U]ABD and UADDLP.
14521 // i32 vecreduce_add(
14522 // v16i32 abs(
14523 // v16i32 sub(
14524 // v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
14525 // =================>
14526 // i32 vecreduce_add(
14527 // v4i32 UADDLP(
14528 // v8i16 add(
14529 // v8i16 zext(
14530 // v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
14531 // v8i16 zext(
14532 // v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
14533 static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
14534 SelectionDAG &DAG) {
14535 // Assumed i32 vecreduce_add
14536 if (N->getValueType(0) != MVT::i32)
14537 return SDValue();
14539 SDValue VecReduceOp0 = N->getOperand(0);
14540 unsigned Opcode = VecReduceOp0.getOpcode();
14541 // Assumed v16i32 abs
14542 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
14543 return SDValue();
14545 SDValue ABS = VecReduceOp0;
14546 // Assumed v16i32 sub
14547 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
14548 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
14549 return SDValue();
14551 SDValue SUB = ABS->getOperand(0);
14552 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
14553 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
14554 // Assumed v16i32 type
14555 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
14556 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
14557 return SDValue();
14559 // Assumed zext or sext
14560 bool IsZExt = false;
14561 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
14562 IsZExt = true;
14563 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
14564 IsZExt = false;
14565 } else
14566 return SDValue();
14568 SDValue EXT0 = SUB->getOperand(0);
14569 SDValue EXT1 = SUB->getOperand(1);
14570 // Assumed zext's operand has v16i8 type
14571 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
14572 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
14573 return SDValue();
14575   // Pattern is detected. Let's convert it to a sequence of nodes.
14576 SDLoc DL(N);
14578 // First, create the node pattern of UABD/SABD.
14579 SDValue UABDHigh8Op0 =
14580 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
14581 DAG.getConstant(8, DL, MVT::i64));
14582 SDValue UABDHigh8Op1 =
14583 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
14584 DAG.getConstant(8, DL, MVT::i64));
14585 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
14586 UABDHigh8Op0, UABDHigh8Op1);
14587 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
14589 // Second, create the node pattern of UABAL.
14590 SDValue UABDLo8Op0 =
14591 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
14592 DAG.getConstant(0, DL, MVT::i64));
14593 SDValue UABDLo8Op1 =
14594 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
14595 DAG.getConstant(0, DL, MVT::i64));
14596 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
14597 UABDLo8Op0, UABDLo8Op1);
14598 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
14599 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
14601 // Third, create the node of UADDLP.
14602 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
14604 // Fourth, create the node of VECREDUCE_ADD.
14605 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
14608 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
14609 // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
14610 // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
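// For example, with a v16i8 input the DOT accumulates into a v4i32 vector (v2i32
// for v8i8 inputs), so the final i32 vecreduce.add then reduces that accumulator.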
14611 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
14612 const AArch64Subtarget *ST) {
14613 if (!ST->hasDotProd())
14614 return performVecReduceAddCombineWithUADDLP(N, DAG);
14616 SDValue Op0 = N->getOperand(0);
14617 if (N->getValueType(0) != MVT::i32 ||
14618 Op0.getValueType().getVectorElementType() != MVT::i32)
14619 return SDValue();
14621 unsigned ExtOpcode = Op0.getOpcode();
14622 SDValue A = Op0;
14623 SDValue B;
14624 if (ExtOpcode == ISD::MUL) {
14625 A = Op0.getOperand(0);
14626 B = Op0.getOperand(1);
14627 if (A.getOpcode() != B.getOpcode() ||
14628 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
14629 return SDValue();
14630 ExtOpcode = A.getOpcode();
14632 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
14633 return SDValue();
14635 EVT Op0VT = A.getOperand(0).getValueType();
14636 if (Op0VT != MVT::v8i8 && Op0VT != MVT::v16i8)
14637 return SDValue();
14639 SDLoc DL(Op0);
14640 // For non-mla reductions B can be set to 1. For MLA we take the operand of
14641 // the extend B.
14642 if (!B)
14643 B = DAG.getConstant(1, DL, Op0VT);
14644 else
14645 B = B.getOperand(0);
14647 SDValue Zeros =
14648 DAG.getConstant(0, DL, Op0VT == MVT::v8i8 ? MVT::v2i32 : MVT::v4i32);
14649 auto DotOpcode =
14650 (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
14651 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
14652 A.getOperand(0), B);
14653 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
14656 // Given an (integer) vecreduce, we know the order of the inputs does not
14657 // matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
14658 // into UADDV(UADDLP(x)). This can also happen through an extra add, where we
14659 // transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
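// For example, for x : v16i8 both add(zext(extract_lo(x)), zext(extract_hi(x)))
// and UADDLP(x) are v8i16 vectors whose elements sum to the same total (each byte
// of x is counted exactly once), so the enclosing reduction is unchanged.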
14660 static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
14661 auto DetectAddExtract = [&](SDValue A) {
14662 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
14663 // UADDLP(x) if found.
14664 if (A.getOpcode() != ISD::ADD)
14665 return SDValue();
14666 EVT VT = A.getValueType();
14667 SDValue Op0 = A.getOperand(0);
14668 SDValue Op1 = A.getOperand(1);
14669 if (Op0.getOpcode() != Op1.getOpcode() ||
14670 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
14671 Op0.getOpcode() != ISD::SIGN_EXTEND))
14672 return SDValue();
14673 SDValue Ext0 = Op0.getOperand(0);
14674 SDValue Ext1 = Op1.getOperand(0);
14675 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14676 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14677 Ext0.getOperand(0) != Ext1.getOperand(0))
14678 return SDValue();
14679 // Check that the source vector has twice as many elements as the add type, and
14680 // that the extracts are from the upper/lower halves of the same source.
14681 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
14682 VT.getVectorNumElements() * 2)
14683 return SDValue();
14684 if ((Ext0.getConstantOperandVal(1) != 0 &&
14685 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
14686 (Ext1.getConstantOperandVal(1) != 0 &&
14687 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
14688 return SDValue();
14689 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
14690 : AArch64ISD::SADDLP;
14691 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
14694 SDValue A = N->getOperand(0);
14695 if (SDValue R = DetectAddExtract(A))
14696 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
14697 if (A.getOpcode() == ISD::ADD) {
14698 if (SDValue R = DetectAddExtract(A.getOperand(0)))
14699 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
14700 DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
14701 A.getOperand(1)));
14702 if (SDValue R = DetectAddExtract(A.getOperand(1)))
14703 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
14704 DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
14705 A.getOperand(0)));
14707 return SDValue();
14711 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
14712 TargetLowering::DAGCombinerInfo &DCI,
14713 const AArch64Subtarget *Subtarget) {
14714 if (DCI.isBeforeLegalizeOps())
14715 return SDValue();
14717 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
14720 SDValue
14721 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
14722 SelectionDAG &DAG,
14723 SmallVectorImpl<SDNode *> &Created) const {
14724 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
14725 if (isIntDivCheap(N->getValueType(0), Attr))
14726 return SDValue(N, 0); // Lower SDIV as SDIV
14728 EVT VT = N->getValueType(0);
14730 // For scalable and fixed types, mark them as cheap so we can handle them much
14731 // later. This allows us to handle larger-than-legal types.
14732 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
14733 return SDValue(N, 0);
14735 // fold (sdiv X, pow2)
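// For example, (sdiv x, 8) becomes (sra (csel (add x, 7), x, lt, (cmp x, 0)), 3),
// i.e. asr(x < 0 ? x + 7 : x, 3); for a negative divisor the result is then
// negated with an extra sub at the end.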
14736 if ((VT != MVT::i32 && VT != MVT::i64) ||
14737 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
14738 return SDValue();
14740 SDLoc DL(N);
14741 SDValue N0 = N->getOperand(0);
14742 unsigned Lg2 = Divisor.countTrailingZeros();
14743 SDValue Zero = DAG.getConstant(0, DL, VT);
14744 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
14746 // Add (N0 < 0) ? Pow2 - 1 : 0;
14747 SDValue CCVal;
14748 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
14749 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
14750 SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
14752 Created.push_back(Cmp.getNode());
14753 Created.push_back(Add.getNode());
14754 Created.push_back(CSel.getNode());
14756 // Divide by pow2.
14757 SDValue SRA =
14758 DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
14760 // If we're dividing by a positive value, we're done. Otherwise, we must
14761 // negate the result.
14762 if (Divisor.isNonNegative())
14763 return SRA;
14765 Created.push_back(SRA.getNode());
14766 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
14769 SDValue
14770 AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
14771 SelectionDAG &DAG,
14772 SmallVectorImpl<SDNode *> &Created) const {
14773 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
14774 if (isIntDivCheap(N->getValueType(0), Attr))
14775 return SDValue(N, 0); // Lower SREM as SREM
14777 EVT VT = N->getValueType(0);
14779 // For scalable and fixed types, mark them as cheap so we can handle them much
14780 // later. This allows us to handle larger-than-legal types.
14781 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
14782 return SDValue(N, 0);
14784 // fold (srem X, pow2)
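// For example, (srem x, 8) becomes (csneg (and x, 7), (and (subs 0, x), 7), mi),
// i.e. x & 7 when x is positive and -((-x) & 7) otherwise, using the flags of
// the subs for the condition.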
14785 if ((VT != MVT::i32 && VT != MVT::i64) ||
14786 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
14787 return SDValue();
14789 unsigned Lg2 = Divisor.countTrailingZeros();
14790 if (Lg2 == 0)
14791 return SDValue();
14793 SDLoc DL(N);
14794 SDValue N0 = N->getOperand(0);
14795 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
14796 SDValue Zero = DAG.getConstant(0, DL, VT);
14797 SDValue CCVal, CSNeg;
14798 if (Lg2 == 1) {
14799 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
14800 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
14801 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
14803 Created.push_back(Cmp.getNode());
14804 Created.push_back(And.getNode());
14805 } else {
14806 CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
14807 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
14809 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
14810 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
14811 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
14812 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
14813 Negs.getValue(1));
14815 Created.push_back(Negs.getNode());
14816 Created.push_back(AndPos.getNode());
14817 Created.push_back(AndNeg.getNode());
14820 return CSNeg;
14823 static bool IsSVECntIntrinsic(SDValue S) {
14824 switch(getIntrinsicID(S.getNode())) {
14825 default:
14826 break;
14827 case Intrinsic::aarch64_sve_cntb:
14828 case Intrinsic::aarch64_sve_cnth:
14829 case Intrinsic::aarch64_sve_cntw:
14830 case Intrinsic::aarch64_sve_cntd:
14831 return true;
14833 return false;
14836 /// Calculates what the pre-extend type is, based on the extension
14837 /// operation node provided by \p Extend.
14839 /// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
14840 /// pre-extend type is pulled directly from the operand, while other extend
14841 /// operations need a bit more inspection to get this information.
14843 /// \param Extend The SDNode from the DAG that represents the extend operation
14845 /// \returns The type representing the \p Extend source type, or \p MVT::Other
14846 /// if no valid type can be determined
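/// For example, (and x, 0xff) yields MVT::i8, while (sign_extend_inreg x, i16)
/// and (AssertZext x, i16) both yield MVT::i16.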
14847 static EVT calculatePreExtendType(SDValue Extend) {
14848 switch (Extend.getOpcode()) {
14849 case ISD::SIGN_EXTEND:
14850 case ISD::ZERO_EXTEND:
14851 return Extend.getOperand(0).getValueType();
14852 case ISD::AssertSext:
14853 case ISD::AssertZext:
14854 case ISD::SIGN_EXTEND_INREG: {
14855 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
14856 if (!TypeNode)
14857 return MVT::Other;
14858 return TypeNode->getVT();
14860 case ISD::AND: {
14861 ConstantSDNode *Constant =
14862 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
14863 if (!Constant)
14864 return MVT::Other;
14866 uint32_t Mask = Constant->getZExtValue();
14868 if (Mask == UCHAR_MAX)
14869 return MVT::i8;
14870 else if (Mask == USHRT_MAX)
14871 return MVT::i16;
14872 else if (Mask == UINT_MAX)
14873 return MVT::i32;
14875 return MVT::Other;
14877 default:
14878 return MVT::Other;
14882 /// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
14883 /// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
14884 /// SExt/ZExt rather than the scalar SExt/ZExt
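/// For example, (v8i16 (build_vector (i16 (sext (i8 a))), ...)) can become
/// (v8i16 (sign_extend (v8i8 (build_vector a, ...)))), assuming every element is
/// sign-extended from the same pre-extend type.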
14885 static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
14886 EVT VT = BV.getValueType();
14887 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
14888 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
14889 return SDValue();
14891 // Use the first item in the buildvector/shuffle to get the size of the
14892 // extend, and make sure it looks valid.
14893 SDValue Extend = BV->getOperand(0);
14894 unsigned ExtendOpcode = Extend.getOpcode();
14895 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
14896 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
14897 ExtendOpcode == ISD::AssertSext;
14898 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
14899 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
14900 return SDValue();
14901 // Shuffle inputs are vectors, so limit to SIGN_EXTEND and ZERO_EXTEND to ensure
14902 // calculatePreExtendType will work without issue.
14903 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
14904 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
14905 return SDValue();
14907 // Restrict valid pre-extend data type
14908 EVT PreExtendType = calculatePreExtendType(Extend);
14909 if (PreExtendType == MVT::Other ||
14910 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
14911 return SDValue();
14913 // Make sure all other operands are equally extended
14914 for (SDValue Op : drop_begin(BV->ops())) {
14915 if (Op.isUndef())
14916 continue;
14917 unsigned Opc = Op.getOpcode();
14918 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
14919 Opc == ISD::AssertSext;
14920 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
14921 return SDValue();
14924 SDValue NBV;
14925 SDLoc DL(BV);
14926 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
14927 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
14928 EVT PreExtendLegalType =
14929 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
14930 SmallVector<SDValue, 8> NewOps;
14931 for (SDValue Op : BV->ops())
14932 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
14933 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
14934 PreExtendLegalType));
14935 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
14936 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
14937 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
14938 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
14939 BV.getOperand(1).isUndef()
14940 ? DAG.getUNDEF(PreExtendVT)
14941 : BV.getOperand(1).getOperand(0),
14942 cast<ShuffleVectorSDNode>(BV)->getMask());
14944 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
14947 /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
14948 /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
14949 static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
14950 // If the value type isn't a vector, none of the operands are going to be dups
14951 EVT VT = Mul->getValueType(0);
14952 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
14953 return SDValue();
14955 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
14956 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
14958 // Neither operand has been changed; don't make any further changes
14959 if (!Op0 && !Op1)
14960 return SDValue();
14962 SDLoc DL(Mul);
14963 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
14964 Op1 ? Op1 : Mul->getOperand(1));
14967 // Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
14968 // Same for other types with equivalent constants.
14969 static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
14970 EVT VT = N->getValueType(0);
14971 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
14972 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
14973 return SDValue();
14974 if (N->getOperand(0).getOpcode() != ISD::AND ||
14975 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
14976 return SDValue();
14978 SDValue And = N->getOperand(0);
14979 SDValue Srl = And.getOperand(0);
14981 APInt V1, V2, V3;
14982 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
14983 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
14984 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
14985 return SDValue();
14987 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
14988 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
14989 V3 != (HalfSize - 1))
14990 return SDValue();
14992 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
14993 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
14994 VT.getVectorElementCount() * 2);
14996 SDLoc DL(N);
14997 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
14998 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
14999 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
15002 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
15003 TargetLowering::DAGCombinerInfo &DCI,
15004 const AArch64Subtarget *Subtarget) {
15006 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
15007 return Ext;
15008 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
15009 return Ext;
15011 if (DCI.isBeforeLegalizeOps())
15012 return SDValue();
15014 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
15015 // and in MachineCombiner pass, add+mul will be combined into madd.
15016 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
15017 SDLoc DL(N);
15018 EVT VT = N->getValueType(0);
15019 SDValue N0 = N->getOperand(0);
15020 SDValue N1 = N->getOperand(1);
15021 SDValue MulOper;
15022 unsigned AddSubOpc;
15024 auto IsAddSubWith1 = [&](SDValue V) -> bool {
15025 AddSubOpc = V->getOpcode();
15026 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
15027 SDValue Opnd = V->getOperand(1);
15028 MulOper = V->getOperand(0);
15029 if (AddSubOpc == ISD::SUB)
15030 std::swap(Opnd, MulOper);
15031 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
15032 return C->isOne();
15034 return false;
15037 if (IsAddSubWith1(N0)) {
15038 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
15039 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
15042 if (IsAddSubWith1(N1)) {
15043 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
15044 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
15047 // The below optimizations require a constant RHS.
15048 if (!isa<ConstantSDNode>(N1))
15049 return SDValue();
15051 ConstantSDNode *C = cast<ConstantSDNode>(N1);
15052 const APInt &ConstValue = C->getAPIntValue();
15054 // Allow the scaling to be folded into the `cnt` instruction by preventing
15055 // the scaling from being obscured here. This makes it easier to pattern match.
15056 if (IsSVECntIntrinsic(N0) ||
15057 (N0->getOpcode() == ISD::TRUNCATE &&
15058 (IsSVECntIntrinsic(N0->getOperand(0)))))
15059 if (ConstValue.sge(1) && ConstValue.sle(16))
15060 return SDValue();
15062 // Multiplication of a power of two plus/minus one can be done more
15063 // cheaply as a shift+add/sub. For now, this is true unilaterally. If
15064 // future CPUs have a cheaper MADD instruction, this may need to be
15065 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
15066 // 64-bit is 5 cycles, so this is always a win.
15067 // More aggressively, some multiplications N0 * C can be lowered to
15068 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
15069 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
15070 // TODO: lower more cases.
15072 // TrailingZeroes is used to test if the mul can be lowered to
15073 // shift+add+shift.
15074 unsigned TrailingZeroes = ConstValue.countTrailingZeros();
15075 if (TrailingZeroes) {
15076 // Conservatively do not lower to shift+add+shift if the mul might be
15077 // folded into smul or umul.
15078 if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
15079 isZeroExtended(N0.getNode(), DAG)))
15080 return SDValue();
15081 // Conservatively do not lower to shift+add+shift if the mul might be
15082 // folded into madd or msub.
15083 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
15084 N->use_begin()->getOpcode() == ISD::SUB))
15085 return SDValue();
15087 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
15088 // and shift+add+shift.
15089 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
15090 unsigned ShiftAmt;
15092 auto Shl = [&](SDValue N0, unsigned N1) {
15093 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
15094 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
15096 auto Add = [&](SDValue N0, SDValue N1) {
15097 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
15099 auto Sub = [&](SDValue N0, SDValue N1) {
15100 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
15102 auto Negate = [&](SDValue N) {
15103 SDValue Zero = DAG.getConstant(0, DL, VT);
15104 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
15107 // Can the constant C be decomposed into (1+2^M1)*(1+2^N1)? E.g.,
15108 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
15109 // the (2^N - 1) part can't be executed via a single instruction.
15110 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
15111 unsigned BitWidth = C.getBitWidth();
15112 for (unsigned i = 1; i < BitWidth / 2; i++) {
15113 APInt Rem;
15114 APInt X(BitWidth, (1ULL << i) + 1);
15115 APInt::sdivrem(C, X, N, Rem);
15116 APInt NVMinus1 = N - 1;
15117 if (Rem == 0 && NVMinus1.isPowerOf2()) {
15118 M = X;
15119 return true;
15122 return false;
15125 if (ConstValue.isNonNegative()) {
15126 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
15127 // (mul x, 2^N - 1) => (sub (shl x, N), x)
15128 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
15129 // (mul x, (2^M + 1) * (2^N + 1))
15130 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
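// For example: (mul x, 6) => (shl (add (shl x, 1), x), 1), (mul x, 7) =>
// (sub (shl x, 3), x), (mul x, 14) => (sub (shl x, 4), (shl x, 1)), and, with
// LSLFast, (mul x, 45) => MV = (add (shl x, 2), x); (add (shl MV, 3), MV).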
15131 APInt SCVMinus1 = ShiftedConstValue - 1;
15132 APInt SCVPlus1 = ShiftedConstValue + 1;
15133 APInt CVPlus1 = ConstValue + 1;
15134 APInt CVM, CVN;
15135 if (SCVMinus1.isPowerOf2()) {
15136 ShiftAmt = SCVMinus1.logBase2();
15137 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
15138 } else if (CVPlus1.isPowerOf2()) {
15139 ShiftAmt = CVPlus1.logBase2();
15140 return Sub(Shl(N0, ShiftAmt), N0);
15141 } else if (SCVPlus1.isPowerOf2()) {
15142 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
15143 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
15144 } else if (Subtarget->hasLSLFast() &&
15145 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
15146 APInt CVMMinus1 = CVM - 1;
15147 APInt CVNMinus1 = CVN - 1;
15148 unsigned ShiftM1 = CVMMinus1.logBase2();
15149 unsigned ShiftN1 = CVNMinus1.logBase2();
15150 // LSLFast implies that shifts of up to 3 places are fast
15151 if (ShiftM1 <= 3 && ShiftN1 <= 3) {
15152 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
15153 return Add(Shl(MVal, ShiftN1), MVal);
15156 } else {
15157 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
15158 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
15159 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
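// For example: (mul x, -7) => (sub x, (shl x, 3)), (mul x, -9) =>
// - (add (shl x, 3), x), and (mul x, -14) => (sub (shl x, 1), (shl x, 4)).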
15160 APInt SCVPlus1 = -ShiftedConstValue + 1;
15161 APInt CVNegPlus1 = -ConstValue + 1;
15162 APInt CVNegMinus1 = -ConstValue - 1;
15163 if (CVNegPlus1.isPowerOf2()) {
15164 ShiftAmt = CVNegPlus1.logBase2();
15165 return Sub(N0, Shl(N0, ShiftAmt));
15166 } else if (CVNegMinus1.isPowerOf2()) {
15167 ShiftAmt = CVNegMinus1.logBase2();
15168 return Negate(Add(Shl(N0, ShiftAmt), N0));
15169 } else if (SCVPlus1.isPowerOf2()) {
15170 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
15171 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
15175 return SDValue();
15178 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
15179 SelectionDAG &DAG) {
15180 // Take advantage of vector comparisons producing 0 or -1 in each lane to
15181 // optimize away operation when it's from a constant.
15183 // The general transformation is:
15184 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
15185 // AND(VECTOR_CMP(x,y), constant2)
15186 // constant2 = UNARYOP(constant)
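// For example, (sint_to_fp (and (setcc x, y), splat(i32 1))) becomes
// (bitcast (and (setcc x, y), (bitcast splat(f32 1.0)))): each lane of the
// comparison is 0 or -1, so the AND directly produces 0.0 or 1.0.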
15188 // Early exit if this isn't a vector operation, the operand of the
15189 // unary operation isn't a bitwise AND, or if the sizes of the operations
15190 // aren't the same.
15191 EVT VT = N->getValueType(0);
15192 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
15193 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
15194 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
15195 return SDValue();
15197 // Now check that the other operand of the AND is a constant. We could
15198 // make the transformation for non-constant splats as well, but it's unclear
15199 // that would be a benefit as it would not eliminate any operations, just
15200 // perform one more step in scalar code before moving to the vector unit.
15201 if (BuildVectorSDNode *BV =
15202 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
15203 // Bail out if the vector isn't a constant.
15204 if (!BV->isConstant())
15205 return SDValue();
15207 // Everything checks out. Build up the new and improved node.
15208 SDLoc DL(N);
15209 EVT IntVT = BV->getValueType(0);
15210 // Create a new constant of the appropriate type for the transformed
15211 // DAG.
15212 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
15213 // The AND node needs bitcasts to/from an integer vector type around it.
15214 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
15215 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
15216 N->getOperand(0)->getOperand(0), MaskConst);
15217 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
15218 return Res;
15221 return SDValue();
15224 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
15225 const AArch64Subtarget *Subtarget) {
15226 // First try to optimize away the conversion when it's conditionally from
15227 // a constant. Vectors only.
15228 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
15229 return Res;
15231 EVT VT = N->getValueType(0);
15232 if (VT != MVT::f32 && VT != MVT::f64)
15233 return SDValue();
15235 // Only optimize when the source and destination types have the same width.
15236 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
15237 return SDValue();
15239 // If the result of an integer load is only used by an integer-to-float
15240 // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
15241 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
15242 SDValue N0 = N->getOperand(0);
15243 if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
15244 // Do not change the width of a volatile load.
15245 !cast<LoadSDNode>(N0)->isVolatile()) {
15246 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15247 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
15248 LN0->getPointerInfo(), LN0->getAlign(),
15249 LN0->getMemOperand()->getFlags());
15251 // Make sure successors of the original load stay after it by updating them
15252 // to use the new Chain.
15253 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
15255 unsigned Opcode =
15256 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
15257 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
15260 return SDValue();
15263 /// Fold a floating-point multiply by power of two into floating-point to
15264 /// fixed-point conversion.
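/// For example, (v4i32 (fp_to_sint (fmul (v4f32 x), splat(8.0)))) becomes a
/// vcvtfp2fxs conversion of x with 3 fractional bits.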
15265 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
15266 TargetLowering::DAGCombinerInfo &DCI,
15267 const AArch64Subtarget *Subtarget) {
15268 if (!Subtarget->hasNEON())
15269 return SDValue();
15271 if (!N->getValueType(0).isSimple())
15272 return SDValue();
15274 SDValue Op = N->getOperand(0);
15275 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
15276 return SDValue();
15278 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
15279 return SDValue();
15281 SDValue ConstVec = Op->getOperand(1);
15282 if (!isa<BuildVectorSDNode>(ConstVec))
15283 return SDValue();
15285 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
15286 uint32_t FloatBits = FloatTy.getSizeInBits();
15287 if (FloatBits != 32 && FloatBits != 64 &&
15288 (FloatBits != 16 || !Subtarget->hasFullFP16()))
15289 return SDValue();
15291 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
15292 uint32_t IntBits = IntTy.getSizeInBits();
15293 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
15294 return SDValue();
15296 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
15297 if (IntBits > FloatBits)
15298 return SDValue();
15300 BitVector UndefElements;
15301 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
15302 int32_t Bits = IntBits == 64 ? 64 : 32;
15303 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
15304 if (C == -1 || C == 0 || C > Bits)
15305 return SDValue();
15307 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
15308 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
15309 return SDValue();
15311 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
15312 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
15313 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
15314 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
15315 return SDValue();
15318 SDLoc DL(N);
15319 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
15320 N->getOpcode() == ISD::FP_TO_SINT_SAT);
15321 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
15322 : Intrinsic::aarch64_neon_vcvtfp2fxu;
15323 SDValue FixConv =
15324 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
15325 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
15326 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
15327 // We can handle smaller integers by generating an extra trunc.
15328 if (IntBits < FloatBits)
15329 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
15331 return FixConv;
15334 /// Fold a floating-point divide by power of two into fixed-point to
15335 /// floating-point conversion.
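/// For example, (v4f32 (fdiv (sint_to_fp (v4i32 x)), splat(8.0))) becomes a
/// vcvtfxs2fp conversion of x with 3 fractional bits.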
15336 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
15337 TargetLowering::DAGCombinerInfo &DCI,
15338 const AArch64Subtarget *Subtarget) {
15339 if (!Subtarget->hasNEON())
15340 return SDValue();
15342 SDValue Op = N->getOperand(0);
15343 unsigned Opc = Op->getOpcode();
15344 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
15345 !Op.getOperand(0).getValueType().isSimple() ||
15346 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
15347 return SDValue();
15349 SDValue ConstVec = N->getOperand(1);
15350 if (!isa<BuildVectorSDNode>(ConstVec))
15351 return SDValue();
15353 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
15354 int32_t IntBits = IntTy.getSizeInBits();
15355 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
15356 return SDValue();
15358 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
15359 int32_t FloatBits = FloatTy.getSizeInBits();
15360 if (FloatBits != 32 && FloatBits != 64)
15361 return SDValue();
15363 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
15364 if (IntBits > FloatBits)
15365 return SDValue();
15367 BitVector UndefElements;
15368 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
15369 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
15370 if (C == -1 || C == 0 || C > FloatBits)
15371 return SDValue();
15373 MVT ResTy;
15374 unsigned NumLanes = Op.getValueType().getVectorNumElements();
15375 switch (NumLanes) {
15376 default:
15377 return SDValue();
15378 case 2:
15379 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
15380 break;
15381 case 4:
15382 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
15383 break;
15386 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
15387 return SDValue();
15389 SDLoc DL(N);
15390 SDValue ConvInput = Op.getOperand(0);
15391 bool IsSigned = Opc == ISD::SINT_TO_FP;
15392 if (IntBits < FloatBits)
15393 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
15394 ResTy, ConvInput);
15396 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
15397 : Intrinsic::aarch64_neon_vcvtfxu2fp;
15398 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
15399 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
15400 DAG.getConstant(C, DL, MVT::i32));
15403 /// An EXTR instruction is made up of two shifts, ORed together. This helper
15404 /// searches for and classifies those shifts.
15405 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
15406 bool &FromHi) {
15407 if (N.getOpcode() == ISD::SHL)
15408 FromHi = false;
15409 else if (N.getOpcode() == ISD::SRL)
15410 FromHi = true;
15411 else
15412 return false;
15414 if (!isa<ConstantSDNode>(N.getOperand(1)))
15415 return false;
15417 ShiftAmount = N->getConstantOperandVal(1);
15418 Src = N->getOperand(0);
15419 return true;
15422 /// EXTR instruction extracts a contiguous chunk of bits from two existing
15423 /// registers viewed as a high/low pair. This function looks for the pattern:
15424 /// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
15425 /// with an EXTR. Can't quite be done in TableGen because the two immediates
15426 /// aren't independent.
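/// For example, for i32: <tt>(or (shl x, \#24), (srl y, \#8))</tt> becomes
/// <tt>(EXTR x, y, \#8)</tt>.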
15427 static SDValue tryCombineToEXTR(SDNode *N,
15428 TargetLowering::DAGCombinerInfo &DCI) {
15429 SelectionDAG &DAG = DCI.DAG;
15430 SDLoc DL(N);
15431 EVT VT = N->getValueType(0);
15433 assert(N->getOpcode() == ISD::OR && "Unexpected root");
15435 if (VT != MVT::i32 && VT != MVT::i64)
15436 return SDValue();
15438 SDValue LHS;
15439 uint32_t ShiftLHS = 0;
15440 bool LHSFromHi = false;
15441 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
15442 return SDValue();
15444 SDValue RHS;
15445 uint32_t ShiftRHS = 0;
15446 bool RHSFromHi = false;
15447 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
15448 return SDValue();
15450 // If they're both trying to come from the high part of the register, they're
15451 // not really an EXTR.
15452 if (LHSFromHi == RHSFromHi)
15453 return SDValue();
15455 if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
15456 return SDValue();
15458 if (LHSFromHi) {
15459 std::swap(LHS, RHS);
15460 std::swap(ShiftLHS, ShiftRHS);
15463 return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
15464 DAG.getConstant(ShiftRHS, DL, MVT::i64));
15467 static SDValue tryCombineToBSL(SDNode *N,
15468 TargetLowering::DAGCombinerInfo &DCI) {
15469 EVT VT = N->getValueType(0);
15470 SelectionDAG &DAG = DCI.DAG;
15471 SDLoc DL(N);
15473 if (!VT.isVector())
15474 return SDValue();
15476 // The combining code currently only works for NEON vectors. In particular,
15477 // it does not work for SVE when dealing with vectors wider than 128 bits.
15478 if (!VT.is64BitVector() && !VT.is128BitVector())
15479 return SDValue();
15481 SDValue N0 = N->getOperand(0);
15482 if (N0.getOpcode() != ISD::AND)
15483 return SDValue();
15485 SDValue N1 = N->getOperand(1);
15486 if (N1.getOpcode() != ISD::AND)
15487 return SDValue();
15489 // InstCombine does (not (neg a)) => (add a -1).
15490 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
15491 // Loop over all combinations of AND operands.
15492 for (int i = 1; i >= 0; --i) {
15493 for (int j = 1; j >= 0; --j) {
15494 SDValue O0 = N0->getOperand(i);
15495 SDValue O1 = N1->getOperand(j);
15496 SDValue Sub, Add, SubSibling, AddSibling;
15498 // Find a SUB and an ADD operand, one from each AND.
15499 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
15500 Sub = O0;
15501 Add = O1;
15502 SubSibling = N0->getOperand(1 - i);
15503 AddSibling = N1->getOperand(1 - j);
15504 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
15505 Add = O0;
15506 Sub = O1;
15507 AddSibling = N0->getOperand(1 - i);
15508 SubSibling = N1->getOperand(1 - j);
15509 } else
15510 continue;
15512 if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
15513 continue;
15515 // The all-ones constant is always the right-hand operand of the Add.
15516 if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
15517 continue;
15519 if (Sub.getOperand(1) != Add.getOperand(0))
15520 continue;
15522 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
15526 // (or (and a b) (and (not a) c)) => (bsl a b c)
15527 // We only have to look for constant vectors here since the general, variable
15528 // case can be handled in TableGen.
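// For example, for v8i16: (or (and b, splat(0x00ff)), (and c, splat(0xff00)))
// becomes (bsl splat(0x00ff), b, c), where bits set in the mask select from b
// and clear bits select from c.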
15529 unsigned Bits = VT.getScalarSizeInBits();
15530 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
15531 for (int i = 1; i >= 0; --i)
15532 for (int j = 1; j >= 0; --j) {
15533 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
15534 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
15535 if (!BVN0 || !BVN1)
15536 continue;
15538 bool FoundMatch = true;
15539 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
15540 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
15541 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
15542 if (!CN0 || !CN1 ||
15543 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
15544 FoundMatch = false;
15545 break;
15549 if (FoundMatch)
15550 return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
15551 N0->getOperand(1 - i), N1->getOperand(1 - j));
15554 return SDValue();
15557 // Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
15558 // convert to csel(ccmp(.., cc0)), depending on cc1:
15560 // (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
15561 // =>
15562 // (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
15564 // (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
15565 // =>
15566 // (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
15567 static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
15568 EVT VT = N->getValueType(0);
15569 SDValue CSel0 = N->getOperand(0);
15570 SDValue CSel1 = N->getOperand(1);
15572 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
15573 CSel1.getOpcode() != AArch64ISD::CSEL)
15574 return SDValue();
15576 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
15577 return SDValue();
15579 if (!isNullConstant(CSel0.getOperand(0)) ||
15580 !isOneConstant(CSel0.getOperand(1)) ||
15581 !isNullConstant(CSel1.getOperand(0)) ||
15582 !isOneConstant(CSel1.getOperand(1)))
15583 return SDValue();
15585 SDValue Cmp0 = CSel0.getOperand(3);
15586 SDValue Cmp1 = CSel1.getOperand(3);
15587 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
15588 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
15589 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
15590 return SDValue();
15591 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
15592 Cmp0.getOpcode() == AArch64ISD::SUBS) {
15593 std::swap(Cmp0, Cmp1);
15594 std::swap(CC0, CC1);
15597 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
15598 return SDValue();
15600 SDLoc DL(N);
15601 SDValue CCmp, Condition;
15602 unsigned NZCV;
15604 if (N->getOpcode() == ISD::AND) {
15605 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
15606 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
15607 NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
15608 } else {
15609 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
15610 Condition = DAG.getConstant(CC0, DL, MVT_CC);
15611 NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
15614 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
15616 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
15617 if (Op1 && Op1->getAPIntValue().isNegative() &&
15618 Op1->getAPIntValue().sgt(-32)) {
15619 // CCMP accepts a constant in the range [0, 31].
15620 // If Op1 is a constant in the range [-31, -1], we
15621 // can select CCMN instead to avoid the extra mov.
15622 SDValue AbsOp1 =
15623 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
15624 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
15625 NZCVOp, Condition, Cmp0);
15626 } else {
15627 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
15628 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
15630 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
15631 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
15632 CCmp);
15635 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
15636 const AArch64Subtarget *Subtarget) {
15637 SelectionDAG &DAG = DCI.DAG;
15638 EVT VT = N->getValueType(0);
15640 if (SDValue R = performANDORCSELCombine(N, DAG))
15641 return R;
15643 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
15644 return SDValue();
15646 // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
15647 if (SDValue Res = tryCombineToEXTR(N, DCI))
15648 return Res;
15650 if (SDValue Res = tryCombineToBSL(N, DCI))
15651 return Res;
15653 return SDValue();
15656 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
15657 if (!MemVT.getVectorElementType().isSimple())
15658 return false;
15660 uint64_t MaskForTy = 0ull;
15661 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
15662 case MVT::i8:
15663 MaskForTy = 0xffull;
15664 break;
15665 case MVT::i16:
15666 MaskForTy = 0xffffull;
15667 break;
15668 case MVT::i32:
15669 MaskForTy = 0xffffffffull;
15670 break;
15671 default:
15672 return false;
15673 break;
15676 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
15677 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
15678 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
15680 return false;
15683 static SDValue performSVEAndCombine(SDNode *N,
15684 TargetLowering::DAGCombinerInfo &DCI) {
15685 if (DCI.isBeforeLegalizeOps())
15686 return SDValue();
15688 SelectionDAG &DAG = DCI.DAG;
15689 SDValue Src = N->getOperand(0);
15690 unsigned Opc = Src->getOpcode();
15692 // Zero/any extend of an unsigned unpack
15693 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
15694 SDValue UnpkOp = Src->getOperand(0);
15695 SDValue Dup = N->getOperand(1);
15697 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
15698 return SDValue();
15700 SDLoc DL(N);
15701 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
15702 if (!C)
15703 return SDValue();
15705 uint64_t ExtVal = C->getZExtValue();
15707 // If the mask is fully covered by the unpack, we don't need to push
15708 // a new AND onto the operand
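// e.g. (and (uunpklo (nxv16i8 x)), splat(0xff)) already zero-extends each byte,
// so it folds to the unpack itself.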
15709 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
15710 if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
15711 (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
15712 (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
15713 return Src;
15715 // Truncate to prevent a DUP with an over-wide constant
15716 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
15718 // Otherwise, make sure we propagate the AND to the operand
15719 // of the unpack
15720 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
15721 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
15723 SDValue And = DAG.getNode(ISD::AND, DL,
15724 UnpkOp->getValueType(0), UnpkOp, Dup);
15726 return DAG.getNode(Opc, DL, N->getValueType(0), And);
15729 if (!EnableCombineMGatherIntrinsics)
15730 return SDValue();
15732 SDValue Mask = N->getOperand(1);
15734 if (!Src.hasOneUse())
15735 return SDValue();
15737 EVT MemVT;
15739 // SVE load instructions perform an implicit zero-extend, which makes them
15740 // perfect candidates for combining.
15741 switch (Opc) {
15742 case AArch64ISD::LD1_MERGE_ZERO:
15743 case AArch64ISD::LDNF1_MERGE_ZERO:
15744 case AArch64ISD::LDFF1_MERGE_ZERO:
15745 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
15746 break;
15747 case AArch64ISD::GLD1_MERGE_ZERO:
15748 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
15749 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
15750 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
15751 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
15752 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
15753 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
15754 case AArch64ISD::GLDFF1_MERGE_ZERO:
15755 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
15756 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
15757 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
15758 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
15759 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
15760 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
15761 case AArch64ISD::GLDNT1_MERGE_ZERO:
15762 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
15763 break;
15764 default:
15765 return SDValue();
15768 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
15769 return Src;
15771 return SDValue();
15774 static SDValue performANDCombine(SDNode *N,
15775 TargetLowering::DAGCombinerInfo &DCI) {
15776 SelectionDAG &DAG = DCI.DAG;
15777 SDValue LHS = N->getOperand(0);
15778 SDValue RHS = N->getOperand(1);
15779 EVT VT = N->getValueType(0);
15781 if (SDValue R = performANDORCSELCombine(N, DAG))
15782 return R;
15784 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
15785 return SDValue();
15787 if (VT.isScalableVector())
15788 return performSVEAndCombine(N, DCI);
15790 // The combining code below works only for NEON vectors. In particular, it
15791 // does not work for SVE when dealing with vectors wider than 128 bits.
15792 if (!VT.is64BitVector() && !VT.is128BitVector())
15793 return SDValue();
15795 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
15796 if (!BVN)
15797 return SDValue();
15799 // AND does not accept an immediate, so check if we can use a BIC immediate
15800 // instruction instead. We do this here instead of using a (and x, (mvni imm))
15801 // pattern in isel, because some immediates may be lowered to the preferred
15802 // (and x, (movi imm)) form, even though an mvni representation also exists.
15803 APInt DefBits(VT.getSizeInBits(), 0);
15804 APInt UndefBits(VT.getSizeInBits(), 0);
15805 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15806 SDValue NewOp;
15808 DefBits = ~DefBits;
15809 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
15810 DefBits, &LHS)) ||
15811 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
15812 DefBits, &LHS)))
15813 return NewOp;
15815 UndefBits = ~UndefBits;
15816 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
15817 UndefBits, &LHS)) ||
15818 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
15819 UndefBits, &LHS)))
15820 return NewOp;
15823 return SDValue();
15826 static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
15827 switch (Opcode) {
15828 case ISD::STRICT_FADD:
15829 case ISD::FADD:
15830 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
15831 case ISD::ADD:
15832 return VT == MVT::i64;
15833 default:
15834 return false;
15838 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
15839 AArch64CC::CondCode Cond);
15841 static bool isPredicateCCSettingOp(SDValue N) {
15842 if ((N.getOpcode() == ISD::SETCC) ||
15843 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
15844 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
15845 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
15846 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
15847 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
15848 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
15849 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
15850 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
15851 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
15852 // get_active_lane_mask is lowered to a whilelo instruction.
15853 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
15854 return true;
15856 return false;
15859 // Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
15860 // ... into: "ptrue p, all" + PTEST
15861 static SDValue
15862 performFirstTrueTestVectorCombine(SDNode *N,
15863 TargetLowering::DAGCombinerInfo &DCI,
15864 const AArch64Subtarget *Subtarget) {
15865 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
15866 // Make sure PTEST can be legalised with illegal types.
15867 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
15868 return SDValue();
15870 SDValue N0 = N->getOperand(0);
15871 EVT VT = N0.getValueType();
15873 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
15874 !isNullConstant(N->getOperand(1)))
15875 return SDValue();
15877 // Restrict the DAG combine to only cases where we're extracting from a
15878 // flag-setting operation.
15879 if (!isPredicateCCSettingOp(N0))
15880 return SDValue();
15882 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
15883 SelectionDAG &DAG = DCI.DAG;
15884 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
15885 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
15888 // Materialize : Idx = (add (mul vscale, NumEls), -1)
15889 // i1 = extract_vector_elt t37, Constant:i64<Idx>
15890 // ... into: "ptrue p, all" + PTEST
15891 static SDValue
15892 performLastTrueTestVectorCombine(SDNode *N,
15893 TargetLowering::DAGCombinerInfo &DCI,
15894 const AArch64Subtarget *Subtarget) {
15895 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
15896 // Make sure PTEST can be legalised with illegal types.
15897 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
15898 return SDValue();
15900 SDValue N0 = N->getOperand(0);
15901 EVT OpVT = N0.getValueType();
15903 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
15904 return SDValue();
15906 // Idx == (add (mul vscale, NumEls), -1)
15907 SDValue Idx = N->getOperand(1);
15908 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
15909 return SDValue();
15911 SDValue VS = Idx.getOperand(0);
15912 if (VS.getOpcode() != ISD::VSCALE)
15913 return SDValue();
15915 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
15916 if (VS.getConstantOperandVal(0) != NumEls)
15917 return SDValue();
15919 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
15920 SelectionDAG &DAG = DCI.DAG;
15921 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
15922 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
15925 static SDValue
15926 performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
15927 const AArch64Subtarget *Subtarget) {
15928 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
15929 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
15930 return Res;
15931 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
15932 return Res;
15934 SelectionDAG &DAG = DCI.DAG;
15935 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
15936 ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1);
15938 EVT VT = N->getValueType(0);
15939 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
15940 bool IsStrict = N0->isStrictFPOpcode();
15942 // extract(dup x) -> x
15943 if (N0.getOpcode() == AArch64ISD::DUP)
15944 return DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
15946 // Rewrite for pairwise fadd pattern
15947 // (f32 (extract_vector_elt
15948 // (fadd (vXf32 Other)
15949 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
15950 // ->
15951 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
15952 // (extract_vector_elt (vXf32 Other) 1))
15953 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
15954 // we can only do this when it's used only by the extract_vector_elt.
15955 if (ConstantN1 && ConstantN1->getZExtValue() == 0 &&
15956 hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
15957 (!IsStrict || N0.hasOneUse())) {
15958 SDLoc DL(N0);
15959 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
15960 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
15962 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
15963 SDValue Other = N00;
15965 // And handle the commutative case.
15966 if (!Shuffle) {
15967 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
15968 Other = N01;
15971 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
15972 Other == Shuffle->getOperand(0)) {
15973 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
15974 DAG.getConstant(0, DL, MVT::i64));
15975 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
15976 DAG.getConstant(1, DL, MVT::i64));
15977 if (!IsStrict)
15978 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
15980 // For strict_fadd we need uses of the final extract_vector to be replaced
15981 // with the strict_fadd, but we also need uses of the chain output of the
15982 // original strict_fadd to use the chain output of the new strict_fadd as
15983 // otherwise it may not be deleted.
15984 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
15985 {VT, MVT::Other},
15986 {N0->getOperand(0), Extract1, Extract2});
15987 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
15988 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
15989 return SDValue(N, 0);
15993 return SDValue();
15996 static SDValue performConcatVectorsCombine(SDNode *N,
15997 TargetLowering::DAGCombinerInfo &DCI,
15998 SelectionDAG &DAG) {
15999 SDLoc dl(N);
16000 EVT VT = N->getValueType(0);
16001 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
16002 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
16004 if (VT.isScalableVector())
16005 return SDValue();
16007 // Optimize concat_vectors of truncated vectors, where the intermediate
16008 // type is illegal, to avoid said illegality, e.g.,
16009 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
16010 // (v2i16 (truncate (v2i64)))))
16011 // ->
16012 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
16013 // (v4i32 (bitcast (v2i64))),
16014 // <0, 2, 4, 6>)))
16015 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
16016 // on both input and result type, so we might generate worse code.
16017 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
16018 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
16019 N1Opc == ISD::TRUNCATE) {
16020 SDValue N00 = N0->getOperand(0);
16021 SDValue N10 = N1->getOperand(0);
16022 EVT N00VT = N00.getValueType();
16024 if (N00VT == N10.getValueType() &&
16025 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
16026 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
16027 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
16028 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
16029 for (size_t i = 0; i < Mask.size(); ++i)
16030 Mask[i] = i * 2;
16031 return DAG.getNode(ISD::TRUNCATE, dl, VT,
16032 DAG.getVectorShuffle(
16033 MidVT, dl,
16034 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
16035 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
16039 if (N->getOperand(0).getValueType() == MVT::v4i8) {
16040 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
16041 // loads to prevent having to go through the v4i8 load legalization that
16042 // needs to extend each element into a larger type.
16043 if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
16044 if (V.getValueType() != MVT::v4i8)
16045 return false;
16046 if (V.isUndef())
16047 return true;
16048 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
16049 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
16050 LD->getExtensionType() == ISD::NON_EXTLOAD;
16051 })) {
16052 EVT NVT =
16053 EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
16054 SmallVector<SDValue> Ops;
16056 for (unsigned i = 0; i < N->getNumOperands(); i++) {
16057 SDValue V = N->getOperand(i);
16058 if (V.isUndef())
16059 Ops.push_back(DAG.getUNDEF(MVT::f32));
16060 else {
16061 LoadSDNode *LD = cast<LoadSDNode>(V);
16062 SDValue NewLoad =
16063 DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
16064 LD->getMemOperand());
16065 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
16066 Ops.push_back(NewLoad);
16069 return DAG.getBitcast(N->getValueType(0),
16070 DAG.getBuildVector(NVT, dl, Ops));
16075 // Wait 'til after everything is legalized to try this. That way we have
16076 // legal vector types and such.
16077 if (DCI.isBeforeLegalizeOps())
16078 return SDValue();
16080 // Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use
16081 // extracted subvectors from the same original vectors. Combine these into a
16082 // single avg that operates on the two original vectors.
16083 // avgceil is the target-independent name for rhadd, avgfloor is a hadd.
16084 // Example:
16085 // (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>),
16086 // extract_subvector (v16i8 OpB, <0>))),
16087 // (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>),
16088 // extract_subvector (v16i8 OpB, <8>)))))
16089 // ->
16090 // (v16i8(avgceils(v16i8 OpA, v16i8 OpB)))
16091 if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
16092 (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
16093 N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) {
16094 SDValue N00 = N0->getOperand(0);
16095 SDValue N01 = N0->getOperand(1);
16096 SDValue N10 = N1->getOperand(0);
16097 SDValue N11 = N1->getOperand(1);
16099 EVT N00VT = N00.getValueType();
16100 EVT N10VT = N10.getValueType();
16102 if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16103 N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16104 N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16105 N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
16106 SDValue N00Source = N00->getOperand(0);
16107 SDValue N01Source = N01->getOperand(0);
16108 SDValue N10Source = N10->getOperand(0);
16109 SDValue N11Source = N11->getOperand(0);
16111 if (N00Source == N10Source && N01Source == N11Source &&
16112 N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
16113 assert(N0.getValueType() == N1.getValueType());
16115 uint64_t N00Index = N00.getConstantOperandVal(1);
16116 uint64_t N01Index = N01.getConstantOperandVal(1);
16117 uint64_t N10Index = N10.getConstantOperandVal(1);
16118 uint64_t N11Index = N11.getConstantOperandVal(1);
16120 if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
16121 N10Index == N00VT.getVectorNumElements())
16122 return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
16127 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
16128 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
16129 // canonicalise to that.
16130 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
16131 assert(VT.getScalarSizeInBits() == 64);
16132 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
16133 DAG.getConstant(0, dl, MVT::i64));
16136 // Canonicalise concat_vectors so that the right-hand vector has as few
16137 // bit-casts as possible before its real operation. The primary matching
16138 // destination for these operations will be the narrowing "2" instructions,
16139 // which depend on the operation being performed on this right-hand vector.
16140 // For example,
16141 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
16142 // becomes
16143 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
16145 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
16146 return SDValue();
16147 SDValue RHS = N1->getOperand(0);
16148 MVT RHSTy = RHS.getValueType().getSimpleVT();
16149 // If the RHS is not a vector, this is not the pattern we're looking for.
16150 if (!RHSTy.isVector())
16151 return SDValue();
16153 LLVM_DEBUG(
16154 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
16156 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
16157 RHSTy.getVectorNumElements() * 2);
16158 return DAG.getNode(ISD::BITCAST, dl, VT,
16159 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
16160 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
16161 RHS));
16164 static SDValue
16165 performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
16166 SelectionDAG &DAG) {
16167 if (DCI.isBeforeLegalizeOps())
16168 return SDValue();
16170 EVT VT = N->getValueType(0);
16171 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
16172 return SDValue();
16174 SDValue V = N->getOperand(0);
16176 // NOTE: This combine exists in DAGCombiner, but that version's legality check
16177 // blocks this combine because the non-const case requires custom lowering.
16179 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
16180 if (V.getOpcode() == ISD::SPLAT_VECTOR)
16181 if (isa<ConstantSDNode>(V.getOperand(0)))
16182 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
16184 return SDValue();
16187 static SDValue
16188 performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
16189 SelectionDAG &DAG) {
16190 SDLoc DL(N);
16191 SDValue Vec = N->getOperand(0);
16192 SDValue SubVec = N->getOperand(1);
16193 uint64_t IdxVal = N->getConstantOperandVal(2);
16194 EVT VecVT = Vec.getValueType();
16195 EVT SubVT = SubVec.getValueType();
16197 // Only do this for legal fixed vector types.
16198 if (!VecVT.isFixedLengthVector() ||
16199 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
16200 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
16201 return SDValue();
16203 // Ignore widening patterns.
16204 if (IdxVal == 0 && Vec.isUndef())
16205 return SDValue();
16207 // Subvector must be half the width and an "aligned" insertion.
16208 unsigned NumSubElts = SubVT.getVectorNumElements();
16209 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
16210 (IdxVal != 0 && IdxVal != NumSubElts))
16211 return SDValue();
16213 // Fold insert_subvector -> concat_vectors
16214 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
16215 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
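// As a sketch with assumed example types:
//   (insert_subvector (v8i16 Vec), (v4i16 Sub), 4)
// becomes
//   (concat_vectors (extract_subvector (v8i16 Vec), 0), (v4i16 Sub))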
16216 SDValue Lo, Hi;
16217 if (IdxVal == 0) {
16218 Lo = SubVec;
16219 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
16220 DAG.getVectorIdxConstant(NumSubElts, DL));
16221 } else {
16222 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
16223 DAG.getVectorIdxConstant(0, DL));
16224 Hi = SubVec;
16226 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
16229 static SDValue tryCombineFixedPointConvert(SDNode *N,
16230 TargetLowering::DAGCombinerInfo &DCI,
16231 SelectionDAG &DAG) {
16232 // Wait until after everything is legalized to try this. That way we have
16233 // legal vector types and such.
16234 if (DCI.isBeforeLegalizeOps())
16235 return SDValue();
16236 // Transform a scalar conversion of a value from a lane extract into a
16237 // lane extract of a vector conversion. E.g., from foo1 to foo2:
16238 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
16239 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
16241 // The second form interacts better with instruction selection and the
16242 // register allocator to avoid cross-class register copies that aren't
16243 // coalescable due to a lane reference.
16245 // Check the operand and see if it originates from a lane extract.
16246 SDValue Op1 = N->getOperand(1);
16247 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16248 return SDValue();
16250 // Yep, no additional predication needed. Perform the transform.
16251 SDValue IID = N->getOperand(0);
16252 SDValue Shift = N->getOperand(2);
16253 SDValue Vec = Op1.getOperand(0);
16254 SDValue Lane = Op1.getOperand(1);
16255 EVT ResTy = N->getValueType(0);
16256 EVT VecResTy;
16257 SDLoc DL(N);
16259 // The vector width should be 128 bits by the time we get here, even
16260 // if it started as 64 bits (the extract_vector handling will have
16261 // done so). Bail if it is not.
16262 if (Vec.getValueSizeInBits() != 128)
16263 return SDValue();
16265 if (Vec.getValueType() == MVT::v4i32)
16266 VecResTy = MVT::v4f32;
16267 else if (Vec.getValueType() == MVT::v2i64)
16268 VecResTy = MVT::v2f64;
16269 else
16270 return SDValue();
16272 SDValue Convert =
16273 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
16274 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
16277 // AArch64 high-vector "long" operations are formed by performing the non-high
16278 // version on an extract_subvector of each operand which gets the high half:
16280 // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
16282 // However, there are cases which don't have an extract_high explicitly, but
16283 // have another operation that can be made compatible with one for free. For
16284 // example:
16286 // (dupv64 scalar) --> (extract_high (dup128 scalar))
16288 // This routine does the actual conversion of such DUPs, once outer routines
16289 // have determined that everything else is in order.
16290 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
16291 // similarly here.
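// Illustrative sketch (operand shapes assumed): a long operation such as
//   (umull (extract_high (v16i8 OpA)), (v8i8 (dup scalar)))
// can have its DUP widened to 128 bits and the high half extracted,
//   (umull (extract_high (v16i8 OpA)), (extract_high (v16i8 (dup scalar))))
// which then matches the UMULL2-style patterns.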
16292 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
16293 MVT VT = N.getSimpleValueType();
16294 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16295 N.getConstantOperandVal(1) == 0)
16296 N = N.getOperand(0);
16298 switch (N.getOpcode()) {
16299 case AArch64ISD::DUP:
16300 case AArch64ISD::DUPLANE8:
16301 case AArch64ISD::DUPLANE16:
16302 case AArch64ISD::DUPLANE32:
16303 case AArch64ISD::DUPLANE64:
16304 case AArch64ISD::MOVI:
16305 case AArch64ISD::MOVIshift:
16306 case AArch64ISD::MOVIedit:
16307 case AArch64ISD::MOVImsl:
16308 case AArch64ISD::MVNIshift:
16309 case AArch64ISD::MVNImsl:
16310 break;
16311 default:
16312 // FMOV could be supported, but isn't very useful, as it would only occur
16313 // if you passed a bitcast'd floating point immediate to an eligible long
16314 // integer op (addl, smull, ...).
16315 return SDValue();
16318 if (!VT.is64BitVector())
16319 return SDValue();
16321 SDLoc DL(N);
16322 unsigned NumElems = VT.getVectorNumElements();
16323 if (N.getValueType().is64BitVector()) {
16324 MVT ElementTy = VT.getVectorElementType();
16325 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
16326 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
16329 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
16330 DAG.getConstant(NumElems, DL, MVT::i64));
16333 static bool isEssentiallyExtractHighSubvector(SDValue N) {
16334 if (N.getOpcode() == ISD::BITCAST)
16335 N = N.getOperand(0);
16336 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
16337 return false;
16338 if (N.getOperand(0).getValueType().isScalableVector())
16339 return false;
16340 return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
16341 N.getOperand(0).getValueType().getVectorNumElements() / 2;
16344 /// Helper structure to keep track of ISD::SET_CC operands.
16345 struct GenericSetCCInfo {
16346 const SDValue *Opnd0;
16347 const SDValue *Opnd1;
16348 ISD::CondCode CC;
16351 /// Helper structure to keep track of a SET_CC lowered into AArch64 code.
16352 struct AArch64SetCCInfo {
16353 const SDValue *Cmp;
16354 AArch64CC::CondCode CC;
16357 /// Helper structure to keep track of SetCC information.
16358 union SetCCInfo {
16359 GenericSetCCInfo Generic;
16360 AArch64SetCCInfo AArch64;
16363 /// Helper structure for reading SetCC information. If the IsAArch64 field is
16364 /// set to true, Info is an AArch64SetCCInfo; otherwise Info is a
16365 /// GenericSetCCInfo.
16366 struct SetCCInfoAndKind {
16367 SetCCInfo Info;
16368 bool IsAArch64;
16371 /// Check whether or not \p Op is a SET_CC operation, either a generic or an
16372 /// AArch64-lowered one.
16374 /// \p SetCCInfo is filled accordingly.
16375 /// \post SetCCInfo is meaningful only when this function returns true.
16376 /// \return True when Op is a kind of SET_CC operation.
16377 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
16378 // If this is a setcc, this is straightforward.
16379 if (Op.getOpcode() == ISD::SETCC) {
16380 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
16381 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
16382 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16383 SetCCInfo.IsAArch64 = false;
16384 return true;
16386 // Otherwise, check if this is a matching csel instruction.
16387 // In other words:
16388 // - csel 1, 0, cc
16389 // - csel 0, 1, !cc
16390 if (Op.getOpcode() != AArch64ISD::CSEL)
16391 return false;
16392 // Set the information about the operands.
16393 // TODO: we want the operands of the Cmp, not the csel.
16394 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
16395 SetCCInfo.IsAArch64 = true;
16396 SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
16397 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
16399 // Check that the operands match the constraints:
16400 // (1) Both operands must be constants.
16401 // (2) One must be 1 and the other must be 0.
16402 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
16403 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16405 // Check (1).
16406 if (!TValue || !FValue)
16407 return false;
16409 // Check (2).
16410 if (!TValue->isOne()) {
16411 // Update the comparison when we are interested in !cc.
16412 std::swap(TValue, FValue);
16413 SetCCInfo.Info.AArch64.CC =
16414 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
16416 return TValue->isOne() && FValue->isZero();
16419 // Returns true if Op is setcc or zext of setcc.
16420 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
16421 if (isSetCC(Op, Info))
16422 return true;
16423 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
16424 isSetCC(Op->getOperand(0), Info));
16427 // The folding we want to perform is:
16428 // (add x, [zext] (setcc cc ...) )
16429 // -->
16430 // (csel x, (add x, 1), !cc ...)
16432 // The latter will get matched to a CSINC instruction.
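// A rough source-level sketch (hypothetical example): for
//   int f(int x, int a, int b) { return x + (a < b); }
// the DAG contains (add x, (zext (setcc lt a, b))), which this fold turns
// into the CSEL form above so it can be selected as something like
//   cmp   w1, w2
//   csinc w0, w0, w0, ge     // i.e. "cinc w0, w0, lt"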
16433 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
16434 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
16435 SDValue LHS = Op->getOperand(0);
16436 SDValue RHS = Op->getOperand(1);
16437 SetCCInfoAndKind InfoAndKind;
16439 // If both operands are a SET_CC, then we don't want to perform this
16440 // folding and create another csel as this results in more instructions
16441 // (and higher register usage).
16442 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
16443 isSetCCOrZExtSetCC(RHS, InfoAndKind))
16444 return SDValue();
16446 // If neither operand is a SET_CC, give up.
16447 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
16448 std::swap(LHS, RHS);
16449 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
16450 return SDValue();
16453 // FIXME: This could be generalized to work for FP comparisons.
16454 EVT CmpVT = InfoAndKind.IsAArch64
16455 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
16456 : InfoAndKind.Info.Generic.Opnd0->getValueType();
16457 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
16458 return SDValue();
16460 SDValue CCVal;
16461 SDValue Cmp;
16462 SDLoc dl(Op);
16463 if (InfoAndKind.IsAArch64) {
16464 CCVal = DAG.getConstant(
16465 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
16466 MVT::i32);
16467 Cmp = *InfoAndKind.Info.AArch64.Cmp;
16468 } else
16469 Cmp = getAArch64Cmp(
16470 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
16471 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
16472 dl);
16474 EVT VT = Op->getValueType(0);
16475 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
16476 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
16479 // ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
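// Illustrative sketch (types assumed): the sum of two across-vector adds,
//   (add (extract_elt (UADDV (v4i32 A)), 0),
//        (extract_elt (UADDV (v4i32 B)), 0))
// becomes a single reduction of the element-wise sum,
//   (extract_elt (UADDV (add A, B)), 0)
// saving one ADDV.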
16480 static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
16481 EVT VT = N->getValueType(0);
16482 // Only scalar integer and vector types.
16483 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
16484 return SDValue();
16486 SDValue LHS = N->getOperand(0);
16487 SDValue RHS = N->getOperand(1);
16488 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16489 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
16490 return SDValue();
16492 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
16493 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
16494 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
16495 return SDValue();
16497 SDValue Op1 = LHS->getOperand(0);
16498 SDValue Op2 = RHS->getOperand(0);
16499 EVT OpVT1 = Op1.getValueType();
16500 EVT OpVT2 = Op2.getValueType();
16501 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
16502 Op2.getOpcode() != AArch64ISD::UADDV ||
16503 OpVT1.getVectorElementType() != VT)
16504 return SDValue();
16506 SDValue Val1 = Op1.getOperand(0);
16507 SDValue Val2 = Op2.getOperand(0);
16508 EVT ValVT = Val1->getValueType(0);
16509 SDLoc DL(N);
16510 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
16511 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
16512 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
16513 DAG.getConstant(0, DL, MVT::i64));
16516 /// Perform the scalar expression combine in the form of:
16517 /// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
16518 /// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
16519 static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
16520 EVT VT = N->getValueType(0);
16521 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
16522 return SDValue();
16524 SDValue LHS = N->getOperand(0);
16525 SDValue RHS = N->getOperand(1);
16527 // Handle commutativity.
16528 if (LHS.getOpcode() != AArch64ISD::CSEL &&
16529 LHS.getOpcode() != AArch64ISD::CSNEG) {
16530 std::swap(LHS, RHS);
16531 if (LHS.getOpcode() != AArch64ISD::CSEL &&
16532 LHS.getOpcode() != AArch64ISD::CSNEG) {
16533 return SDValue();
16537 if (!LHS.hasOneUse())
16538 return SDValue();
16540 AArch64CC::CondCode AArch64CC =
16541 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
16543 // The CSEL should include a constant one operand, and the CSNEG should
16544 // include a one or negative-one operand.
16545 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
16546 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
16547 if (!CTVal || !CFVal)
16548 return SDValue();
16550 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
16551 (CTVal->isOne() || CFVal->isOne())) &&
16552 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
16553 (CTVal->isOne() || CFVal->isAllOnes())))
16554 return SDValue();
16556 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
16557 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
16558 !CFVal->isOne()) {
16559 std::swap(CTVal, CFVal);
16560 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
16563 SDLoc DL(N);
16564 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
16565 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
16566 !CFVal->isAllOnes()) {
16567 APInt C = -1 * CFVal->getAPIntValue();
16568 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
16569 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
16570 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
16573 // It might be neutral for larger constants, as the immediate needs to be
16574 // materialized in a register.
16575 APInt ADDC = CTVal->getAPIntValue();
16576 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16577 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
16578 return SDValue();
16580 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
16581 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
16582 "Unexpected constant value");
16584 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
16585 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
16586 SDValue Cmp = LHS.getOperand(3);
16588 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
16591 // ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
16592 static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
16593 EVT VT = N->getValueType(0);
16594 if (N->getOpcode() != ISD::ADD)
16595 return SDValue();
16597 SDValue Dot = N->getOperand(0);
16598 SDValue A = N->getOperand(1);
16599 // Handle commutativity
16600 auto isZeroDot = [](SDValue Dot) {
16601 return (Dot.getOpcode() == AArch64ISD::UDOT ||
16602 Dot.getOpcode() == AArch64ISD::SDOT) &&
16603 isZerosVector(Dot.getOperand(0).getNode());
16605 if (!isZeroDot(Dot))
16606 std::swap(Dot, A);
16607 if (!isZeroDot(Dot))
16608 return SDValue();
16610 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
16611 Dot.getOperand(2));
16614 static bool isNegatedInteger(SDValue Op) {
16615 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
16618 static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
16619 SDLoc DL(Op);
16620 EVT VT = Op.getValueType();
16621 SDValue Zero = DAG.getConstant(0, DL, VT);
16622 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
16625 // Try to fold
16627 // (neg (csel X, Y)) -> (csel (neg X), (neg Y))
16629 // The folding helps csel to be matched with csneg without generating a
16630 // redundant neg instruction, which includes the negation of the csel
16631 // expansion of an abs node lowered by lowerABS.
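// Sketch of the motivating case (condition code left generic): lowerABS
// expands abs(X) as roughly (csel X, (sub 0, X), cc), so a later negation
//   (sub 0, (csel X, (sub 0, X), cc))
// is rewritten to (csel (sub 0, X), X, cc), which isel can then match as a
// single CSNEG instead of a CSEL followed by a separate NEG.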
16632 static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
16633 if (!isNegatedInteger(SDValue(N, 0)))
16634 return SDValue();
16636 SDValue CSel = N->getOperand(1);
16637 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
16638 return SDValue();
16640 SDValue N0 = CSel.getOperand(0);
16641 SDValue N1 = CSel.getOperand(1);
16643 // If neither of them is a negation, the fold is not worthwhile, as it would
16644 // introduce two additional negations while removing only one.
16645 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
16646 return SDValue();
16648 SDValue N0N = getNegatedInteger(N0, DAG);
16649 SDValue N1N = getNegatedInteger(N1, DAG);
16651 SDLoc DL(N);
16652 EVT VT = CSel.getValueType();
16653 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
16654 CSel.getOperand(3));
16657 // The basic add/sub long vector instructions have variants with "2" on the end
16658 // which act on the high-half of their inputs. They are normally matched by
16659 // patterns like:
16661 // (add (zeroext (extract_high LHS)),
16662 // (zeroext (extract_high RHS)))
16663 // -> uaddl2 vD, vN, vM
16665 // However, if one of the extracts is something like a duplicate, this
16666 // instruction can still be used profitably. This function puts the DAG into a
16667 // more appropriate form for those patterns to trigger.
16668 static SDValue performAddSubLongCombine(SDNode *N,
16669 TargetLowering::DAGCombinerInfo &DCI,
16670 SelectionDAG &DAG) {
16671 if (DCI.isBeforeLegalizeOps())
16672 return SDValue();
16674 MVT VT = N->getSimpleValueType(0);
16675 if (!VT.is128BitVector()) {
16676 if (N->getOpcode() == ISD::ADD)
16677 return performSetccAddFolding(N, DAG);
16678 return SDValue();
16681 // Make sure both branches are extended in the same way.
16682 SDValue LHS = N->getOperand(0);
16683 SDValue RHS = N->getOperand(1);
16684 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
16685 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
16686 LHS.getOpcode() != RHS.getOpcode())
16687 return SDValue();
16689 unsigned ExtType = LHS.getOpcode();
16691 // It's only worth doing if at least one of the inputs is already an
16692 // extract, but we don't know which it'll be, so we have to try both.
16693 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
16694 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
16695 if (!RHS.getNode())
16696 return SDValue();
16698 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
16699 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
16700 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
16701 if (!LHS.getNode())
16702 return SDValue();
16704 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
16707 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
16710 static bool isCMP(SDValue Op) {
16711 return Op.getOpcode() == AArch64ISD::SUBS &&
16712 !Op.getNode()->hasAnyUseOfValue(0);
16715 // (CSEL 1 0 CC Cond) => CC
16716 // (CSEL 0 1 CC Cond) => !CC
16717 static Optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
16718 if (Op.getOpcode() != AArch64ISD::CSEL)
16719 return None;
16720 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
16721 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
16722 return None;
16723 SDValue OpLHS = Op.getOperand(0);
16724 SDValue OpRHS = Op.getOperand(1);
16725 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
16726 return CC;
16727 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
16728 return getInvertedCondCode(CC);
16730 return None;
16733 // (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
16734 // (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
16735 static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
16736 SDValue CmpOp = Op->getOperand(2);
16737 if (!isCMP(CmpOp))
16738 return SDValue();
16740 if (IsAdd) {
16741 if (!isOneConstant(CmpOp.getOperand(1)))
16742 return SDValue();
16743 } else {
16744 if (!isNullConstant(CmpOp.getOperand(0)))
16745 return SDValue();
16748 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
16749 auto CC = getCSETCondCode(CsetOp);
16750 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
16751 return SDValue();
16753 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
16754 Op->getOperand(0), Op->getOperand(1),
16755 CsetOp.getOperand(3));
16758 // (ADC x 0 cond) => (CINC x HS cond)
16759 static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
16760 SDValue LHS = N->getOperand(0);
16761 SDValue RHS = N->getOperand(1);
16762 SDValue Cond = N->getOperand(2);
16764 if (!isNullConstant(RHS))
16765 return SDValue();
16767 EVT VT = N->getValueType(0);
16768 SDLoc DL(N);
16770 // (CINC x cc cond) <=> (CSINC x x !cc cond)
16771 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
16772 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
16775 // Transform vector add(zext i8 to i32, zext i8 to i32)
16776 // into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
16777 // This allows extra uses of saddl/uaddl at the lower vector widths, and
16778 // fewer extends.
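// A sketch with assumed example types:
//   (v8i32 add (zero_extend (v8i8 A)), (zero_extend (v8i8 B)))
// becomes
//   (v8i32 sign_extend (v8i16 add (zero_extend A), (zero_extend B)))
// so the inner add can be selected as uaddl and only one widening of the
// narrower result remains.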
16779 static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
16780 EVT VT = N->getValueType(0);
16781 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
16782 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
16783 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
16784 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
16785 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
16786 N->getOperand(0).getOperand(0).getValueType() !=
16787 N->getOperand(1).getOperand(0).getValueType())
16788 return SDValue();
16790 SDValue N0 = N->getOperand(0).getOperand(0);
16791 SDValue N1 = N->getOperand(1).getOperand(0);
16792 EVT InVT = N0.getValueType();
16794 EVT S1 = InVT.getScalarType();
16795 EVT S2 = VT.getScalarType();
16796 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
16797 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
16798 SDLoc DL(N);
16799 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
16800 S2.getHalfSizedIntegerVT(*DAG.getContext()),
16801 VT.getVectorElementCount());
16802 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
16803 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
16804 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
16805 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
16807 return SDValue();
16810 static SDValue performBuildVectorCombine(SDNode *N,
16811 TargetLowering::DAGCombinerInfo &DCI,
16812 SelectionDAG &DAG) {
16813 SDLoc DL(N);
16814 EVT VT = N->getValueType(0);
16816 // A build vector of two extracted elements is equivalent to an
16817 // extract subvector where the inner vector is any-extended to the
16818 // extract_vector_elt VT.
16819 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
16820 // (extract_elt_iXX_to_i32 vec Idx+1))
16821 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
16823 // For now, only consider the v2i32 case, which arises as a result of
16824 // legalization.
16825 if (VT != MVT::v2i32)
16826 return SDValue();
16828 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
16829 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
16830 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
16831 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
16832 // Constant index.
16833 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
16834 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
16835 // Both EXTRACT_VECTOR_ELT from same vector...
16836 Elt0->getOperand(0) == Elt1->getOperand(0) &&
16837 // ... and contiguous. First element's index +1 == second element's index.
16838 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
16839 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
16840 // ResultType's known minimum vector length.
16841 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
16842 SDValue VecToExtend = Elt0->getOperand(0);
16843 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
16844 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
16845 return SDValue();
16847 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
16849 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
16850 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
16851 SubvectorIdx);
16854 return SDValue();
16857 static SDValue performAddCombineForShiftedOperands(SDNode *N,
16858 SelectionDAG &DAG) {
16859 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
16860 // commutative.
16861 if (N->getOpcode() != ISD::ADD)
16862 return SDValue();
16864 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
16865 // shifted register is only available for i32 and i64.
16866 EVT VT = N->getValueType(0);
16867 if (VT != MVT::i32 && VT != MVT::i64)
16868 return SDValue();
16870 SDLoc DL(N);
16871 SDValue LHS = N->getOperand(0);
16872 SDValue RHS = N->getOperand(1);
16874 uint64_t LHSImm = 0, RHSImm = 0;
16875 // If both operands are shifted by an immediate and the shift amount is not
16876 // greater than 4 for one operand, swap LHS and RHS to put the operand with
16877 // the smaller shift amount on the RHS.
16879 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
16880 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
16881 // with LSL (shift > 4). For other processors, this is a no-op for both
16882 // performance and correctness.
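// Illustrative sketch (registers hypothetical): given
//   (add (shl x, 2), (shl y, 8))
// the operands are swapped so the cheap LSL #2 ends up as the shifted (RHS)
// operand, e.g.
//   lsl w8, w_y, #8
//   add w0, w8, w_x, lsl #2
// rather than folding the more expensive LSL #8 into the ADD.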
16883 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
16884 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
16885 RHSImm > 4 && LHS.hasOneUse())
16886 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
16888 return SDValue();
16891 static SDValue performAddSubCombine(SDNode *N,
16892 TargetLowering::DAGCombinerInfo &DCI,
16893 SelectionDAG &DAG) {
16894 // Try to change sum of two reductions.
16895 if (SDValue Val = performAddUADDVCombine(N, DAG))
16896 return Val;
16897 if (SDValue Val = performAddDotCombine(N, DAG))
16898 return Val;
16899 if (SDValue Val = performAddCSelIntoCSinc(N, DAG))
16900 return Val;
16901 if (SDValue Val = performNegCSelCombine(N, DAG))
16902 return Val;
16903 if (SDValue Val = performVectorAddSubExtCombine(N, DAG))
16904 return Val;
16905 if (SDValue Val = performAddCombineForShiftedOperands(N, DAG))
16906 return Val;
16908 return performAddSubLongCombine(N, DCI, DAG);
16911 // Massage DAGs which we can use the high-half "long" operations on into
16912 // something isel will recognize better. E.g.
16914 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
16915 // (aarch64_neon_umull (extract_high (v2i64 vec)))
16916 // (extract_high (v2i64 (dup128 scalar)))))
16918 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
16919 TargetLowering::DAGCombinerInfo &DCI,
16920 SelectionDAG &DAG) {
16921 if (DCI.isBeforeLegalizeOps())
16922 return SDValue();
16924 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
16925 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
16926 assert(LHS.getValueType().is64BitVector() &&
16927 RHS.getValueType().is64BitVector() &&
16928 "unexpected shape for long operation");
16930 // Either node could be a DUP, but it's not worth doing both of them (you
16931 // might as well use the non-high version), so look for a corresponding extract
16932 // operation on the other "wing".
16933 if (isEssentiallyExtractHighSubvector(LHS)) {
16934 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
16935 if (!RHS.getNode())
16936 return SDValue();
16937 } else if (isEssentiallyExtractHighSubvector(RHS)) {
16938 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
16939 if (!LHS.getNode())
16940 return SDValue();
16943 if (IID == Intrinsic::not_intrinsic)
16944 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
16946 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
16947 N->getOperand(0), LHS, RHS);
16950 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
16951 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
16952 unsigned ElemBits = ElemTy.getSizeInBits();
16954 int64_t ShiftAmount;
16955 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
16956 APInt SplatValue, SplatUndef;
16957 unsigned SplatBitSize;
16958 bool HasAnyUndefs;
16959 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
16960 HasAnyUndefs, ElemBits) ||
16961 SplatBitSize != ElemBits)
16962 return SDValue();
16964 ShiftAmount = SplatValue.getSExtValue();
16965 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
16966 ShiftAmount = CVN->getSExtValue();
16967 } else
16968 return SDValue();
16970 unsigned Opcode;
16971 bool IsRightShift;
16972 switch (IID) {
16973 default:
16974 llvm_unreachable("Unknown shift intrinsic");
16975 case Intrinsic::aarch64_neon_sqshl:
16976 Opcode = AArch64ISD::SQSHL_I;
16977 IsRightShift = false;
16978 break;
16979 case Intrinsic::aarch64_neon_uqshl:
16980 Opcode = AArch64ISD::UQSHL_I;
16981 IsRightShift = false;
16982 break;
16983 case Intrinsic::aarch64_neon_srshl:
16984 Opcode = AArch64ISD::SRSHR_I;
16985 IsRightShift = true;
16986 break;
16987 case Intrinsic::aarch64_neon_urshl:
16988 Opcode = AArch64ISD::URSHR_I;
16989 IsRightShift = true;
16990 break;
16991 case Intrinsic::aarch64_neon_sqshlu:
16992 Opcode = AArch64ISD::SQSHLU_I;
16993 IsRightShift = false;
16994 break;
16995 case Intrinsic::aarch64_neon_sshl:
16996 case Intrinsic::aarch64_neon_ushl:
16997 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
16998 // left shift in that case. Below, we only replace the current node with
16999 // VSHL if this condition is met.
17000 Opcode = AArch64ISD::VSHL;
17001 IsRightShift = false;
17002 break;
17005 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
17006 SDLoc dl(N);
17007 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
17008 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
17009 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
17010 SDLoc dl(N);
17011 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
17012 DAG.getConstant(ShiftAmount, dl, MVT::i32));
17015 return SDValue();
17018 // The CRC32[BH] instructions ignore the high bits of their data operand. Since
17019 // the intrinsics must be legal and take an i32, this means there's almost
17020 // certainly going to be a zext in the DAG which we can eliminate.
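// Sketch of the expected pattern (operands hypothetical): for crc32b,
//   (int_aarch64_crc32b acc, (and data, 0xff))
// can simply become (int_aarch64_crc32b acc, data), since the instruction
// only reads the low byte of its data operand anyway.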
17021 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
17022 SDValue AndN = N->getOperand(2);
17023 if (AndN.getOpcode() != ISD::AND)
17024 return SDValue();
17026 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
17027 if (!CMask || CMask->getZExtValue() != Mask)
17028 return SDValue();
17030 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
17031 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
17034 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
17035 SelectionDAG &DAG) {
17036 SDLoc dl(N);
17037 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
17038 DAG.getNode(Opc, dl,
17039 N->getOperand(1).getSimpleValueType(),
17040 N->getOperand(1)),
17041 DAG.getConstant(0, dl, MVT::i64));
17044 static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
17045 SDLoc DL(N);
17046 SDValue Op1 = N->getOperand(1);
17047 SDValue Op2 = N->getOperand(2);
17048 EVT ScalarTy = Op2.getValueType();
17049 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
17050 ScalarTy = MVT::i32;
17052 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
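// Worked example (values assumed): index(2, 3) on nxv4i32 yields
// <2, 5, 8, 11, ...>, built below as
//   add (mul (step_vector 1), (splat 3)), (splat 2)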
17053 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
17054 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
17055 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
17056 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
17057 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
17060 static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
17061 SDLoc dl(N);
17062 SDValue Scalar = N->getOperand(3);
17063 EVT ScalarTy = Scalar.getValueType();
17065 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
17066 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
17068 SDValue Passthru = N->getOperand(1);
17069 SDValue Pred = N->getOperand(2);
17070 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
17071 Pred, Scalar, Passthru);
17074 static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
17075 SDLoc dl(N);
17076 LLVMContext &Ctx = *DAG.getContext();
17077 EVT VT = N->getValueType(0);
17079 assert(VT.isScalableVector() && "Expected a scalable vector.");
17081 // Current lowering only supports the SVE-ACLE types.
17082 if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
17083 return SDValue();
17085 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
17086 unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8;
17087 EVT ByteVT =
17088 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
17090 // Convert everything to the domain of EXT (i.e. bytes).
17091 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
17092 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
17093 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
17094 DAG.getConstant(ElemSize, dl, MVT::i32));
17096 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
17097 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
17100 static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
17101 TargetLowering::DAGCombinerInfo &DCI,
17102 SelectionDAG &DAG) {
17103 if (DCI.isBeforeLegalize())
17104 return SDValue();
17106 SDValue Comparator = N->getOperand(3);
17107 if (Comparator.getOpcode() == AArch64ISD::DUP ||
17108 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
17109 unsigned IID = getIntrinsicID(N);
17110 EVT VT = N->getValueType(0);
17111 EVT CmpVT = N->getOperand(2).getValueType();
17112 SDValue Pred = N->getOperand(1);
17113 SDValue Imm;
17114 SDLoc DL(N);
17116 switch (IID) {
17117 default:
17118 llvm_unreachable("Called with wrong intrinsic!");
17119 break;
17121 // Signed comparisons
17122 case Intrinsic::aarch64_sve_cmpeq_wide:
17123 case Intrinsic::aarch64_sve_cmpne_wide:
17124 case Intrinsic::aarch64_sve_cmpge_wide:
17125 case Intrinsic::aarch64_sve_cmpgt_wide:
17126 case Intrinsic::aarch64_sve_cmplt_wide:
17127 case Intrinsic::aarch64_sve_cmple_wide: {
17128 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
17129 int64_t ImmVal = CN->getSExtValue();
17130 if (ImmVal >= -16 && ImmVal <= 15)
17131 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
17132 else
17133 return SDValue();
17135 break;
17137 // Unsigned comparisons
17138 case Intrinsic::aarch64_sve_cmphs_wide:
17139 case Intrinsic::aarch64_sve_cmphi_wide:
17140 case Intrinsic::aarch64_sve_cmplo_wide:
17141 case Intrinsic::aarch64_sve_cmpls_wide: {
17142 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
17143 uint64_t ImmVal = CN->getZExtValue();
17144 if (ImmVal <= 127)
17145 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
17146 else
17147 return SDValue();
17149 break;
17153 if (!Imm)
17154 return SDValue();
17156 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
17157 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
17158 N->getOperand(2), Splat, DAG.getCondCode(CC));
17161 return SDValue();
17164 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
17165 AArch64CC::CondCode Cond) {
17166 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17168 SDLoc DL(Op);
17169 assert(Op.getValueType().isScalableVector() &&
17170 TLI.isTypeLegal(Op.getValueType()) &&
17171 "Expected legal scalable vector type!");
17172 assert(Op.getValueType() == Pg.getValueType() &&
17173 "Expected same type for PTEST operands");
17175 // Ensure target-specific opcodes use a legal type.
17176 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
17177 SDValue TVal = DAG.getConstant(1, DL, OutVT);
17178 SDValue FVal = DAG.getConstant(0, DL, OutVT);
17180 // Ensure operands have type nxv16i1.
17181 if (Op.getValueType() != MVT::nxv16i1) {
17182 if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
17183 isZeroingInactiveLanes(Op))
17184 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
17185 else
17186 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
17187 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
17190 // Set condition code (CC) flags.
17191 SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op);
17193 // Convert CC to integer based on requested condition.
17194 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
17195 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
17196 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
17197 return DAG.getZExtOrTrunc(Res, DL, VT);
17200 static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
17201 SelectionDAG &DAG) {
17202 SDLoc DL(N);
17204 SDValue Pred = N->getOperand(1);
17205 SDValue VecToReduce = N->getOperand(2);
17207 // NOTE: The integer reduction's result type is not always linked to the
17208 // operand's element type so we construct it from the intrinsic's result type.
17209 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
17210 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
17212 // SVE reductions set the whole vector register with the first element
17213 // containing the reduction result, which we'll now extract.
17214 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17215 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
17216 Zero);
17219 static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
17220 SelectionDAG &DAG) {
17221 SDLoc DL(N);
17223 SDValue Pred = N->getOperand(1);
17224 SDValue VecToReduce = N->getOperand(2);
17226 EVT ReduceVT = VecToReduce.getValueType();
17227 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
17229 // SVE reductions set the whole vector register with the first element
17230 // containing the reduction result, which we'll now extract.
17231 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17232 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
17233 Zero);
17236 static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
17237 SelectionDAG &DAG) {
17238 SDLoc DL(N);
17240 SDValue Pred = N->getOperand(1);
17241 SDValue InitVal = N->getOperand(2);
17242 SDValue VecToReduce = N->getOperand(3);
17243 EVT ReduceVT = VecToReduce.getValueType();
17245 // Ordered reductions use the first lane of the result vector as the
17246 // reduction's initial value.
17247 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17248 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
17249 DAG.getUNDEF(ReduceVT), InitVal, Zero);
17251 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
17253 // SVE reductions set the whole vector register with the first element
17254 // containing the reduction result, which we'll now extract.
17255 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
17256 Zero);
17259 static bool isAllInactivePredicate(SDValue N) {
17260 // Look through cast.
17261 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
17262 N = N.getOperand(0);
17264 return ISD::isConstantSplatVectorAllZeros(N.getNode());
17267 static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
17268 unsigned NumElts = N.getValueType().getVectorMinNumElements();
17270 // Look through cast.
17271 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
17272 N = N.getOperand(0);
17273 // When reinterpreting from a type with fewer elements the "new" elements
17274 // are not active, so bail if they're likely to be used.
17275 if (N.getValueType().getVectorMinNumElements() < NumElts)
17276 return false;
17279 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
17280 return true;
17282 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
17283 // or smaller than the implicit element type represented by N.
17284 // NOTE: A larger element count implies a smaller element type.
17285 if (N.getOpcode() == AArch64ISD::PTRUE &&
17286 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
17287 return N.getValueType().getVectorMinNumElements() >= NumElts;
17289 // If we're compiling for a specific vector-length, we can check if the
17290 // pattern's VL equals that of the scalable vector at runtime.
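// Worked example (sizes assumed): when compiling with a fixed 256-bit SVE
// length, VScale = 256 / 128 = 2, so for an nxv4i1 predicate (NumElts == 4)
// a "ptrue p.s, vl8" pattern covers all 4 * 2 = 8 runtime lanes and is
// treated as all active.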
17291 if (N.getOpcode() == AArch64ISD::PTRUE) {
17292 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17293 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
17294 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
17295 if (MaxSVESize && MinSVESize == MaxSVESize) {
17296 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
17297 unsigned PatNumElts =
17298 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
17299 return PatNumElts == (NumElts * VScale);
17303 return false;
17306 // If a merged operation has no inactive lanes, we can relax it to a predicated
17307 // or unpredicated operation, which potentially allows better isel (perhaps
17308 // using immediate forms) or relaxes register reuse requirements.
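// For example (an illustrative sketch): with an all-active governing
// predicate,
//   (int_aarch64_sve_add (ptrue all), X, Y)
// can be emitted as a plain (add X, Y), while an operation without an
// unpredicated ISD form (e.g. sve.mul) is relaxed to its _PRED node while
// keeping the predicate operand.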
17309 static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
17310 SelectionDAG &DAG, bool UnpredOp = false,
17311 bool SwapOperands = false) {
17312 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
17313 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
17314 SDValue Pg = N->getOperand(1);
17315 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
17316 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
17318 // ISD way to specify an all active predicate.
17319 if (isAllActivePredicate(DAG, Pg)) {
17320 if (UnpredOp)
17321 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
17323 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
17326 // FUTURE: SplatVector(true)
17327 return SDValue();
17330 static SDValue performIntrinsicCombine(SDNode *N,
17331 TargetLowering::DAGCombinerInfo &DCI,
17332 const AArch64Subtarget *Subtarget) {
17333 SelectionDAG &DAG = DCI.DAG;
17334 unsigned IID = getIntrinsicID(N);
17335 switch (IID) {
17336 default:
17337 break;
17338 case Intrinsic::get_active_lane_mask: {
17339 SDValue Res = SDValue();
17340 EVT VT = N->getValueType(0);
17341 if (VT.isFixedLengthVector()) {
17342 // We can use the SVE whilelo instruction to lower this intrinsic by
17343 // creating the appropriate sequence of scalable vector operations and
17344 // then extracting a fixed-width subvector from the scalable vector.
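// A worked sketch of the sequence (a v4i1 result is assumed):
//   whilelo           -> nxv4i1
//   sign_extend       -> nxv4i32   (the promoted predicate type)
//   extract_subvector -> v4i32     (fixed-width slice at index 0)
//   truncate          -> v4i1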
17346 SDLoc DL(N);
17347 SDValue ID =
17348 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
17350 EVT WhileVT = EVT::getVectorVT(
17351 *DAG.getContext(), MVT::i1,
17352 ElementCount::getScalable(VT.getVectorNumElements()));
17354 // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
17355 EVT PromVT = getPromotedVTForPredicate(WhileVT);
17357 // Get the fixed-width equivalent of PromVT for extraction.
17358 EVT ExtVT =
17359 EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
17360 VT.getVectorElementCount());
17362 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
17363 N->getOperand(1), N->getOperand(2));
17364 Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
17365 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
17366 DAG.getConstant(0, DL, MVT::i64));
17367 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
17369 return Res;
17371 case Intrinsic::aarch64_neon_vcvtfxs2fp:
17372 case Intrinsic::aarch64_neon_vcvtfxu2fp:
17373 return tryCombineFixedPointConvert(N, DCI, DAG);
17374 case Intrinsic::aarch64_neon_saddv:
17375 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
17376 case Intrinsic::aarch64_neon_uaddv:
17377 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
17378 case Intrinsic::aarch64_neon_sminv:
17379 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
17380 case Intrinsic::aarch64_neon_uminv:
17381 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
17382 case Intrinsic::aarch64_neon_smaxv:
17383 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
17384 case Intrinsic::aarch64_neon_umaxv:
17385 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
17386 case Intrinsic::aarch64_neon_fmax:
17387 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
17388 N->getOperand(1), N->getOperand(2));
17389 case Intrinsic::aarch64_neon_fmin:
17390 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
17391 N->getOperand(1), N->getOperand(2));
17392 case Intrinsic::aarch64_neon_fmaxnm:
17393 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
17394 N->getOperand(1), N->getOperand(2));
17395 case Intrinsic::aarch64_neon_fminnm:
17396 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
17397 N->getOperand(1), N->getOperand(2));
17398 case Intrinsic::aarch64_neon_smull:
17399 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
17400 N->getOperand(1), N->getOperand(2));
17401 case Intrinsic::aarch64_neon_umull:
17402 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
17403 N->getOperand(1), N->getOperand(2));
17404 case Intrinsic::aarch64_neon_pmull:
17405 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
17406 N->getOperand(1), N->getOperand(2));
17407 case Intrinsic::aarch64_neon_sqdmull:
17408 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
17409 case Intrinsic::aarch64_neon_sqshl:
17410 case Intrinsic::aarch64_neon_uqshl:
17411 case Intrinsic::aarch64_neon_sqshlu:
17412 case Intrinsic::aarch64_neon_srshl:
17413 case Intrinsic::aarch64_neon_urshl:
17414 case Intrinsic::aarch64_neon_sshl:
17415 case Intrinsic::aarch64_neon_ushl:
17416 return tryCombineShiftImm(IID, N, DAG);
17417 case Intrinsic::aarch64_crc32b:
17418 case Intrinsic::aarch64_crc32cb:
17419 return tryCombineCRC32(0xff, N, DAG);
17420 case Intrinsic::aarch64_crc32h:
17421 case Intrinsic::aarch64_crc32ch:
17422 return tryCombineCRC32(0xffff, N, DAG);
17423 case Intrinsic::aarch64_sve_saddv:
17424 // There is no i64 version of SADDV because the sign is irrelevant.
17425 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
17426 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
17427 else
17428 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
17429 case Intrinsic::aarch64_sve_uaddv:
17430 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
17431 case Intrinsic::aarch64_sve_smaxv:
17432 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
17433 case Intrinsic::aarch64_sve_umaxv:
17434 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
17435 case Intrinsic::aarch64_sve_sminv:
17436 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
17437 case Intrinsic::aarch64_sve_uminv:
17438 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
17439 case Intrinsic::aarch64_sve_orv:
17440 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
17441 case Intrinsic::aarch64_sve_eorv:
17442 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
17443 case Intrinsic::aarch64_sve_andv:
17444 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
17445 case Intrinsic::aarch64_sve_index:
17446 return LowerSVEIntrinsicIndex(N, DAG);
17447 case Intrinsic::aarch64_sve_dup:
17448 return LowerSVEIntrinsicDUP(N, DAG);
17449 case Intrinsic::aarch64_sve_dup_x:
17450 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
17451 N->getOperand(1));
17452 case Intrinsic::aarch64_sve_ext:
17453 return LowerSVEIntrinsicEXT(N, DAG);
17454 case Intrinsic::aarch64_sve_mul:
17455 return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG);
17456 case Intrinsic::aarch64_sve_smulh:
17457 return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG);
17458 case Intrinsic::aarch64_sve_umulh:
17459 return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG);
17460 case Intrinsic::aarch64_sve_smin:
17461 return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
17462 case Intrinsic::aarch64_sve_umin:
17463 return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG);
17464 case Intrinsic::aarch64_sve_smax:
17465 return convertMergedOpToPredOp(N, AArch64ISD::SMAX_PRED, DAG);
17466 case Intrinsic::aarch64_sve_umax:
17467 return convertMergedOpToPredOp(N, AArch64ISD::UMAX_PRED, DAG);
17468 case Intrinsic::aarch64_sve_lsl:
17469 return convertMergedOpToPredOp(N, AArch64ISD::SHL_PRED, DAG);
17470 case Intrinsic::aarch64_sve_lsr:
17471 return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
17472 case Intrinsic::aarch64_sve_asr:
17473 return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
17474 case Intrinsic::aarch64_sve_fadd:
17475 return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG);
17476 case Intrinsic::aarch64_sve_fsub:
17477 return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG);
17478 case Intrinsic::aarch64_sve_fmul:
17479 return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG);
17480 case Intrinsic::aarch64_sve_add:
17481 return convertMergedOpToPredOp(N, ISD::ADD, DAG, true);
17482 case Intrinsic::aarch64_sve_sub:
17483 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true);
17484 case Intrinsic::aarch64_sve_subr:
17485 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
17486 case Intrinsic::aarch64_sve_and:
17487 return convertMergedOpToPredOp(N, ISD::AND, DAG, true);
17488 case Intrinsic::aarch64_sve_bic:
17489 return convertMergedOpToPredOp(N, AArch64ISD::BIC, DAG, true);
17490 case Intrinsic::aarch64_sve_eor:
17491 return convertMergedOpToPredOp(N, ISD::XOR, DAG, true);
17492 case Intrinsic::aarch64_sve_orr:
17493 return convertMergedOpToPredOp(N, ISD::OR, DAG, true);
17494 case Intrinsic::aarch64_sve_sabd:
17495 return convertMergedOpToPredOp(N, ISD::ABDS, DAG, true);
17496 case Intrinsic::aarch64_sve_uabd:
17497 return convertMergedOpToPredOp(N, ISD::ABDU, DAG, true);
17498 case Intrinsic::aarch64_sve_sqadd:
17499 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
17500 case Intrinsic::aarch64_sve_sqsub:
17501 return convertMergedOpToPredOp(N, ISD::SSUBSAT, DAG, true);
17502 case Intrinsic::aarch64_sve_uqadd:
17503 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
17504 case Intrinsic::aarch64_sve_uqsub:
17505 return convertMergedOpToPredOp(N, ISD::USUBSAT, DAG, true);
17506 case Intrinsic::aarch64_sve_sqadd_x:
17507 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
17508 N->getOperand(1), N->getOperand(2));
17509 case Intrinsic::aarch64_sve_sqsub_x:
17510 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
17511 N->getOperand(1), N->getOperand(2));
17512 case Intrinsic::aarch64_sve_uqadd_x:
17513 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
17514 N->getOperand(1), N->getOperand(2));
17515 case Intrinsic::aarch64_sve_uqsub_x:
17516 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
17517 N->getOperand(1), N->getOperand(2));
17518 case Intrinsic::aarch64_sve_asrd:
17519 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
17520 N->getOperand(1), N->getOperand(2), N->getOperand(3));
17521 case Intrinsic::aarch64_sve_cmphs:
17522 if (!N->getOperand(2).getValueType().isFloatingPoint())
17523 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
17524 N->getValueType(0), N->getOperand(1), N->getOperand(2),
17525 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
17526 break;
17527 case Intrinsic::aarch64_sve_cmphi:
17528 if (!N->getOperand(2).getValueType().isFloatingPoint())
17529 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
17530 N->getValueType(0), N->getOperand(1), N->getOperand(2),
17531 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
17532 break;
17533 case Intrinsic::aarch64_sve_fcmpge:
17534 case Intrinsic::aarch64_sve_cmpge:
17535 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
17536 N->getValueType(0), N->getOperand(1), N->getOperand(2),
17537 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
17538 break;
17539 case Intrinsic::aarch64_sve_fcmpgt:
17540 case Intrinsic::aarch64_sve_cmpgt:
17541 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
17542 N->getValueType(0), N->getOperand(1), N->getOperand(2),
17543 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
17544 break;
17545 case Intrinsic::aarch64_sve_fcmpeq:
17546 case Intrinsic::aarch64_sve_cmpeq:
17547 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
17548 N->getValueType(0), N->getOperand(1), N->getOperand(2),
17549 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
17550 break;
17551 case Intrinsic::aarch64_sve_fcmpne:
17552 case Intrinsic::aarch64_sve_cmpne:
17553 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
17554 N->getValueType(0), N->getOperand(1), N->getOperand(2),
17555 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
17556 break;
17557 case Intrinsic::aarch64_sve_fcmpuo:
17558 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
17559 N->getValueType(0), N->getOperand(1), N->getOperand(2),
17560 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
17561 break;
17562 case Intrinsic::aarch64_sve_fadda:
17563 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
17564 case Intrinsic::aarch64_sve_faddv:
17565 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
17566 case Intrinsic::aarch64_sve_fmaxnmv:
17567 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
17568 case Intrinsic::aarch64_sve_fmaxv:
17569 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
17570 case Intrinsic::aarch64_sve_fminnmv:
17571 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
17572 case Intrinsic::aarch64_sve_fminv:
17573 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
17574 case Intrinsic::aarch64_sve_sel:
17575 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
17576 N->getOperand(1), N->getOperand(2), N->getOperand(3));
17577 case Intrinsic::aarch64_sve_cmpeq_wide:
17578 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
17579 case Intrinsic::aarch64_sve_cmpne_wide:
17580 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
17581 case Intrinsic::aarch64_sve_cmpge_wide:
17582 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
17583 case Intrinsic::aarch64_sve_cmpgt_wide:
17584 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
17585 case Intrinsic::aarch64_sve_cmplt_wide:
17586 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
17587 case Intrinsic::aarch64_sve_cmple_wide:
17588 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
17589 case Intrinsic::aarch64_sve_cmphs_wide:
17590 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
17591 case Intrinsic::aarch64_sve_cmphi_wide:
17592 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
17593 case Intrinsic::aarch64_sve_cmplo_wide:
17594 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
17595 case Intrinsic::aarch64_sve_cmpls_wide:
17596 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
17597 case Intrinsic::aarch64_sve_ptest_any:
17598 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
17599 AArch64CC::ANY_ACTIVE);
17600 case Intrinsic::aarch64_sve_ptest_first:
17601 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
17602 AArch64CC::FIRST_ACTIVE);
17603 case Intrinsic::aarch64_sve_ptest_last:
17604 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
17605 AArch64CC::LAST_ACTIVE);
17607 return SDValue();
17610 static bool isCheapToExtend(const SDValue &N) {
17611 unsigned OC = N->getOpcode();
17612 return OC == ISD::LOAD || OC == ISD::MLOAD ||
17613 ISD::isConstantSplatVectorAllZeros(N.getNode());
17616 static SDValue
17617 performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
17618 SelectionDAG &DAG) {
17619 // If we have (sext (setcc A B)) and A and B are cheap to extend,
17620 // we can move the sext into the arguments and have the same result. For
17621 // example, if A and B are both loads, we can make those extending loads and
17622 // avoid an extra instruction. This pattern appears often in VLS code
17623 // generation where the inputs to the setcc have a different size to the
17624 // instruction that wants to use the result of the setcc.
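  // A sketch of the rewrite (types illustrative; a signed condition is
  // assumed, so sign-extension is used):
  //   (v8i16 (sign_extend (setcc (v8i8 (load A)), (v8i8 (load B)), setlt)))
  //     =>
  //   (setcc (v8i16 (sextload A)), (v8i16 (sextload B)), setlt)
  // For an unsigned condition the operands are zero-extended instead.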
17625 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
17626 N->getOperand(0)->getOpcode() == ISD::SETCC);
17627 const SDValue SetCC = N->getOperand(0);
17629 const SDValue CCOp0 = SetCC.getOperand(0);
17630 const SDValue CCOp1 = SetCC.getOperand(1);
17631 if (!CCOp0->getValueType(0).isInteger() ||
17632 !CCOp1->getValueType(0).isInteger())
17633 return SDValue();
17635 ISD::CondCode Code =
17636 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
17638 ISD::NodeType ExtType =
17639 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
17641 if (isCheapToExtend(SetCC.getOperand(0)) &&
17642 isCheapToExtend(SetCC.getOperand(1))) {
17643 const SDValue Ext1 =
17644 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
17645 const SDValue Ext2 =
17646 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
17648 return DAG.getSetCC(
17649 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
17650 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
17653 return SDValue();
17656 static SDValue performExtendCombine(SDNode *N,
17657 TargetLowering::DAGCombinerInfo &DCI,
17658 SelectionDAG &DAG) {
17659 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
17660 // we can convert that DUP into another extract_high (of a bigger DUP), which
17661 // helps the backend to decide that an sabdl2 would be useful, saving a real
17662 // extract_high operation.
17663 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
17664 (N->getOperand(0).getOpcode() == ISD::ABDU ||
17665 N->getOperand(0).getOpcode() == ISD::ABDS)) {
17666 SDNode *ABDNode = N->getOperand(0).getNode();
17667 SDValue NewABD =
17668 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
17669 if (!NewABD.getNode())
17670 return SDValue();
17672 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
17675 if (N->getValueType(0).isFixedLengthVector() &&
17676 N->getOpcode() == ISD::SIGN_EXTEND &&
17677 N->getOperand(0)->getOpcode() == ISD::SETCC)
17678 return performSignExtendSetCCCombine(N, DCI, DAG);
17680 return SDValue();
17683 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
17684 SDValue SplatVal, unsigned NumVecElts) {
17685 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
17686 Align OrigAlignment = St.getAlign();
17687 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
17689 // Create scalar stores. This is at least as good as the code sequence for a
17690 // split unaligned store which is a dup.s, ext.b, and two stores.
17691 // Most of the time the three stores should be replaced by store pair
17692 // instructions (stp).
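  // For example (illustrative), a splat store of w1 to a v4i32 at [x0] becomes
  //   str w1, [x0]; str w1, [x0, #4]; str w1, [x0, #8]; str w1, [x0, #12]
  // which the load/store optimizer is then expected to merge into
  //   stp w1, w1, [x0]; stp w1, w1, [x0, #8]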
17693 SDLoc DL(&St);
17694 SDValue BasePtr = St.getBasePtr();
17695 uint64_t BaseOffset = 0;
17697 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
17698 SDValue NewST1 =
17699 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
17700 OrigAlignment, St.getMemOperand()->getFlags());
17702 // As this is in ISel, we will not merge this add, which may degrade results.
17703 if (BasePtr->getOpcode() == ISD::ADD &&
17704 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
17705 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
17706 BasePtr = BasePtr->getOperand(0);
17709 unsigned Offset = EltOffset;
17710 while (--NumVecElts) {
17711 Align Alignment = commonAlignment(OrigAlignment, Offset);
17712 SDValue OffsetPtr =
17713 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
17714 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
17715 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
17716 PtrInfo.getWithOffset(Offset), Alignment,
17717 St.getMemOperand()->getFlags());
17718 Offset += EltOffset;
17720 return NewST1;
17723 // Returns an SVE type that ContentTy can be trivially sign or zero extended
17724 // into.
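// For example, nxv2i16 and nxv2f32 both map to nxv2i64, while nxv8i8 maps to
// nxv8i16 (see the switch below).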
17725 static MVT getSVEContainerType(EVT ContentTy) {
17726 assert(ContentTy.isSimple() && "No SVE containers for extended types");
17728 switch (ContentTy.getSimpleVT().SimpleTy) {
17729 default:
17730 llvm_unreachable("No known SVE container for this MVT type");
17731 case MVT::nxv2i8:
17732 case MVT::nxv2i16:
17733 case MVT::nxv2i32:
17734 case MVT::nxv2i64:
17735 case MVT::nxv2f32:
17736 case MVT::nxv2f64:
17737 return MVT::nxv2i64;
17738 case MVT::nxv4i8:
17739 case MVT::nxv4i16:
17740 case MVT::nxv4i32:
17741 case MVT::nxv4f32:
17742 return MVT::nxv4i32;
17743 case MVT::nxv8i8:
17744 case MVT::nxv8i16:
17745 case MVT::nxv8f16:
17746 case MVT::nxv8bf16:
17747 return MVT::nxv8i16;
17748 case MVT::nxv16i8:
17749 return MVT::nxv16i8;
17753 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
17754 SDLoc DL(N);
17755 EVT VT = N->getValueType(0);
17757 if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
17758 return SDValue();
17760 EVT ContainerVT = VT;
17761 if (ContainerVT.isInteger())
17762 ContainerVT = getSVEContainerType(ContainerVT);
17764 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
17765 SDValue Ops[] = { N->getOperand(0), // Chain
17766 N->getOperand(2), // Pg
17767 N->getOperand(3), // Base
17768 DAG.getValueType(VT) };
17770 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
17771 SDValue LoadChain = SDValue(Load.getNode(), 1);
17773 if (ContainerVT.isInteger() && (VT != ContainerVT))
17774 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
17776 return DAG.getMergeValues({ Load, LoadChain }, DL);
17779 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
17780 SDLoc DL(N);
17781 EVT VT = N->getValueType(0);
17782 EVT PtrTy = N->getOperand(3).getValueType();
17784 EVT LoadVT = VT;
17785 if (VT.isFloatingPoint())
17786 LoadVT = VT.changeTypeToInteger();
17788 auto *MINode = cast<MemIntrinsicSDNode>(N);
17789 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
17790 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
17791 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
17792 MINode->getOperand(2), PassThru,
17793 MINode->getMemoryVT(), MINode->getMemOperand(),
17794 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
17796 if (VT.isFloatingPoint()) {
17797 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
17798 return DAG.getMergeValues(Ops, DL);
17801 return L;
17804 template <unsigned Opcode>
17805 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
17806 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
17807 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
17808 "Unsupported opcode.");
17809 SDLoc DL(N);
17810 EVT VT = N->getValueType(0);
17812 EVT LoadVT = VT;
17813 if (VT.isFloatingPoint())
17814 LoadVT = VT.changeTypeToInteger();
17816 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
17817 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
17818 SDValue LoadChain = SDValue(Load.getNode(), 1);
17820 if (VT.isFloatingPoint())
17821 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
17823 return DAG.getMergeValues({Load, LoadChain}, DL);
17826 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
17827 SDLoc DL(N);
17828 SDValue Data = N->getOperand(2);
17829 EVT DataVT = Data.getValueType();
17830 EVT HwSrcVt = getSVEContainerType(DataVT);
17831 SDValue InputVT = DAG.getValueType(DataVT);
17833 if (DataVT.isFloatingPoint())
17834 InputVT = DAG.getValueType(HwSrcVt);
17836 SDValue SrcNew;
17837 if (Data.getValueType().isFloatingPoint())
17838 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
17839 else
17840 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
17842 SDValue Ops[] = { N->getOperand(0), // Chain
17843 SrcNew,
17844 N->getOperand(4), // Base
17845 N->getOperand(3), // Pg
17846 InputVT
17849 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
17852 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
17853 SDLoc DL(N);
17855 SDValue Data = N->getOperand(2);
17856 EVT DataVT = Data.getValueType();
17857 EVT PtrTy = N->getOperand(4).getValueType();
17859 if (DataVT.isFloatingPoint())
17860 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
17862 auto *MINode = cast<MemIntrinsicSDNode>(N);
17863 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
17864 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
17865 MINode->getMemoryVT(), MINode->getMemOperand(),
17866 ISD::UNINDEXED, false, false);
17869 /// Replace a vector store of a zero splat by scalar stores of WZR/XZR. The
17870 /// load store optimizer pass will merge them to store pair stores. This should
17871 /// be better than a movi to create the vector zero followed by a vector store
17872 /// if the zero constant is not re-used, since one instruction and one register
17873 /// live range will be removed.
17875 /// For example, the final generated code should be:
17877 /// stp xzr, xzr, [x0]
17879 /// instead of:
17881 /// movi v0.2d, #0
17882 /// str q0, [x0]
17884 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
17885 SDValue StVal = St.getValue();
17886 EVT VT = StVal.getValueType();
17888 // Avoid scalarizing zero splat stores for scalable vectors.
17889 if (VT.isScalableVector())
17890 return SDValue();
17892 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
17893 // 2, 3 or 4 i32 elements.
17894 int NumVecElts = VT.getVectorNumElements();
17895 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
17896 VT.getVectorElementType().getSizeInBits() == 64) ||
17897 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
17898 VT.getVectorElementType().getSizeInBits() == 32)))
17899 return SDValue();
17901 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
17902 return SDValue();
17904 // If the zero constant has more than one use then the vector store could be
17905 // better, since the constant mov will be amortized and stp q instructions
17906 // can be formed.
17907 if (!StVal.hasOneUse())
17908 return SDValue();
17910 // If the store is truncating then it's going down to i16 or smaller, which
17911 // means it can be implemented in a single store anyway.
17912 if (St.isTruncatingStore())
17913 return SDValue();
17915 // If the immediate offset of the address operand is too large for the stp
17916 // instruction, then bail out.
17917 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
17918 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
17919 if (Offset < -512 || Offset > 504)
17920 return SDValue();
17923 for (int I = 0; I < NumVecElts; ++I) {
17924 SDValue EltVal = StVal.getOperand(I);
17925 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
17926 return SDValue();
17929 // Use a CopyFromReg WZR/XZR here to prevent
17930 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
17931 SDLoc DL(&St);
17932 unsigned ZeroReg;
17933 EVT ZeroVT;
17934 if (VT.getVectorElementType().getSizeInBits() == 32) {
17935 ZeroReg = AArch64::WZR;
17936 ZeroVT = MVT::i32;
17937 } else {
17938 ZeroReg = AArch64::XZR;
17939 ZeroVT = MVT::i64;
17941 SDValue SplatVal =
17942 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
17943 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
17946 /// Replace a vector store of a scalar splat by scalar stores of the scalar
17947 /// value. The load store optimizer pass will merge them to store pair stores.
17948 /// This has better performance than a splat of the scalar followed by a split
17949 /// vector store. Even if the stores are not merged, it is four stores vs. a dup
17950 /// followed by an ext.b and two stores.
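/// A rough sketch of the intent (registers illustrative), for a v4i32 splat of
/// w1 stored at [x0]:
///   dup v0.4s, w1
///   str q0, [x0]     // or a split store when 16-byte alignment is not known
/// becomes four scalar "str w1" stores that the load/store optimizer can then
/// merge into two stp instructions.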
17951 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
17952 SDValue StVal = St.getValue();
17953 EVT VT = StVal.getValueType();
17955 // Don't replace floating point stores; they possibly won't be transformed to
17956 // stp because of the store pair suppress pass.
17957 if (VT.isFloatingPoint())
17958 return SDValue();
17960 // We can express a splat as store pair(s) for 2 or 4 elements.
17961 unsigned NumVecElts = VT.getVectorNumElements();
17962 if (NumVecElts != 4 && NumVecElts != 2)
17963 return SDValue();
17965 // If the store is truncating then it's going down to i16 or smaller, which
17966 // means it can be implemented in a single store anyway.
17967 if (St.isTruncatingStore())
17968 return SDValue();
17970 // Check that this is a splat.
17971 // Make sure that each of the relevant vector element locations are inserted
17972 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
17973 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
17974 SDValue SplatVal;
17975 for (unsigned I = 0; I < NumVecElts; ++I) {
17976 // Check for insert vector elements.
17977 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
17978 return SDValue();
17980 // Check that same value is inserted at each vector element.
17981 if (I == 0)
17982 SplatVal = StVal.getOperand(1);
17983 else if (StVal.getOperand(1) != SplatVal)
17984 return SDValue();
17986 // Check insert element index.
17987 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
17988 if (!CIndex)
17989 return SDValue();
17990 uint64_t IndexVal = CIndex->getZExtValue();
17991 if (IndexVal >= NumVecElts)
17992 return SDValue();
17993 IndexNotInserted.reset(IndexVal);
17995 StVal = StVal.getOperand(0);
17997 // Check that all vector element locations were inserted to.
17998 if (IndexNotInserted.any())
17999 return SDValue();
18001 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
18004 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18005 SelectionDAG &DAG,
18006 const AArch64Subtarget *Subtarget) {
18008 StoreSDNode *S = cast<StoreSDNode>(N);
18009 if (S->isVolatile() || S->isIndexed())
18010 return SDValue();
18012 SDValue StVal = S->getValue();
18013 EVT VT = StVal.getValueType();
18015 if (!VT.isFixedLengthVector())
18016 return SDValue();
18018 // If we get a splat of zeros, convert this vector store to a store of
18019 // scalars. They will be merged into store pairs of xzr thereby removing one
18020 // instruction and one register.
18021 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
18022 return ReplacedZeroSplat;
18024 // FIXME: The logic for deciding if an unaligned store should be split should
18025 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
18026 // a call to that function here.
18028 if (!Subtarget->isMisaligned128StoreSlow())
18029 return SDValue();
18031 // Don't split at -Oz.
18032 if (DAG.getMachineFunction().getFunction().hasMinSize())
18033 return SDValue();
18035 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
18036 // those up regresses performance on micro-benchmarks and olden/bh.
18037 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
18038 return SDValue();
18040 // Split unaligned 16B stores. They are terrible for performance.
18041 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
18042 // extensions can use this to mark that it does not want splitting to happen
18043 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
18044 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
18045 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
18046 S->getAlign() <= Align(2))
18047 return SDValue();
18049 // If we get a splat of a scalar convert this vector store to a store of
18050 // scalars. They will be merged into store pairs thereby removing two
18051 // instructions.
18052 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
18053 return ReplacedSplat;
18055 SDLoc DL(S);
18057 // Split VT into two.
18058 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18059 unsigned NumElts = HalfVT.getVectorNumElements();
18060 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
18061 DAG.getConstant(0, DL, MVT::i64));
18062 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
18063 DAG.getConstant(NumElts, DL, MVT::i64));
18064 SDValue BasePtr = S->getBasePtr();
18065 SDValue NewST1 =
18066 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
18067 S->getAlign(), S->getMemOperand()->getFlags());
18068 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
18069 DAG.getConstant(8, DL, MVT::i64));
18070 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
18071 S->getPointerInfo(), S->getAlign(),
18072 S->getMemOperand()->getFlags());
18075 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
18076 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
18078 // splice(pg, op1, undef) -> op1
18079 if (N->getOperand(2).isUndef())
18080 return N->getOperand(1);
18082 return SDValue();
18085 static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
18086 const AArch64Subtarget *Subtarget) {
18087 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
18088 N->getOpcode() == AArch64ISD::UUNPKLO) &&
18089 "Unexpected Opcode!");
18091 // uunpklo/hi undef -> undef
18092 if (N->getOperand(0).isUndef())
18093 return DAG.getUNDEF(N->getValueType(0));
18095 // If this is a masked load followed by an UUNPKLO, fold this into a masked
18096 // extending load. We can do this even if this is already a masked
18097 // {z,}extload.
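  // A sketch of the fold (element counts illustrative): for
  //   uunpklo (nxv16i8 masked_load(base, ptrue(vl8), zero/undef))
  // the unpacked low elements can instead be produced directly by a
  // zero-extending masked load of nxv8i16 with an nxv8i1 ptrue(vl8) predicate,
  // provided doubling the predicate's element size still fits within the
  // minimum SVE vector length.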
18098 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
18099 N->getOpcode() == AArch64ISD::UUNPKLO) {
18100 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
18101 SDValue Mask = MLD->getMask();
18102 SDLoc DL(N);
18104 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
18105 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
18106 (MLD->getPassThru()->isUndef() ||
18107 isZerosVector(MLD->getPassThru().getNode()))) {
18108 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
18109 unsigned PgPattern = Mask->getConstantOperandVal(0);
18110 EVT VT = N->getValueType(0);
18112 // Ensure we can double the size of the predicate pattern
18113 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
18114 if (NumElts &&
18115 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
18116 Mask =
18117 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
18118 SDValue PassThru = DAG.getConstant(0, DL, VT);
18119 SDValue NewLoad = DAG.getMaskedLoad(
18120 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
18121 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
18122 MLD->getAddressingMode(), ISD::ZEXTLOAD);
18124 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
18126 return NewLoad;
18131 return SDValue();
18134 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
18135 SDLoc DL(N);
18136 SDValue Op0 = N->getOperand(0);
18137 SDValue Op1 = N->getOperand(1);
18138 EVT ResVT = N->getValueType(0);
18140 // uzp1(x, undef) -> concat(truncate(x), undef)
18141 if (Op1.getOpcode() == ISD::UNDEF) {
18142 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
18143 switch (ResVT.getSimpleVT().SimpleTy) {
18144 default:
18145 break;
18146 case MVT::v16i8:
18147 BCVT = MVT::v8i16;
18148 HalfVT = MVT::v8i8;
18149 break;
18150 case MVT::v8i16:
18151 BCVT = MVT::v4i32;
18152 HalfVT = MVT::v4i16;
18153 break;
18154 case MVT::v4i32:
18155 BCVT = MVT::v2i64;
18156 HalfVT = MVT::v2i32;
18157 break;
18159 if (BCVT != MVT::Other) {
18160 SDValue BC = DAG.getBitcast(BCVT, Op0);
18161 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
18162 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
18163 DAG.getUNDEF(HalfVT));
18167 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
18168 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
18169 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
18170 SDValue X = Op0.getOperand(0).getOperand(0);
18171 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
18175 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
18176 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
18177 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
18178 SDValue Z = Op1.getOperand(0).getOperand(1);
18179 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
18183 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
18184 // Only implemented on little-endian subtargets.
18185 bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
18187 // This optimization only works on little endian.
18188 if (!IsLittleEndian)
18189 return SDValue();
18191 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
18192 return SDValue();
18194 auto getSourceOp = [](SDValue Operand) -> SDValue {
18195 const unsigned Opcode = Operand.getOpcode();
18196 if (Opcode == ISD::TRUNCATE)
18197 return Operand->getOperand(0);
18198 if (Opcode == ISD::BITCAST &&
18199 Operand->getOperand(0).getOpcode() == ISD::TRUNCATE)
18200 return Operand->getOperand(0)->getOperand(0);
18201 return SDValue();
18204 SDValue SourceOp0 = getSourceOp(Op0);
18205 SDValue SourceOp1 = getSourceOp(Op1);
18207 if (!SourceOp0 || !SourceOp1)
18208 return SDValue();
18210 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
18211 !SourceOp0.getValueType().isSimple())
18212 return SDValue();
18214 EVT ResultTy;
18216 switch (SourceOp0.getSimpleValueType().SimpleTy) {
18217 case MVT::v2i64:
18218 ResultTy = MVT::v4i32;
18219 break;
18220 case MVT::v4i32:
18221 ResultTy = MVT::v8i16;
18222 break;
18223 case MVT::v8i16:
18224 ResultTy = MVT::v16i8;
18225 break;
18226 default:
18227 return SDValue();
18230 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
18231 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
18232 SDValue UzpResult =
18233 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
18235 EVT BitcastResultTy;
18237 switch (ResVT.getSimpleVT().SimpleTy) {
18238 case MVT::v2i32:
18239 BitcastResultTy = MVT::v2i64;
18240 break;
18241 case MVT::v4i16:
18242 BitcastResultTy = MVT::v4i32;
18243 break;
18244 case MVT::v8i8:
18245 BitcastResultTy = MVT::v8i16;
18246 break;
18247 default:
18248 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
18251 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
18252 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
18255 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
18256 unsigned Opc = N->getOpcode();
18258 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
18259 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
18260 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
18261 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
18262 "Invalid opcode.");
18264 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
18265 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
18266 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
18267 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
18268 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
18269 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
18270 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
18271 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
18273 SDLoc DL(N);
18274 SDValue Chain = N->getOperand(0);
18275 SDValue Pg = N->getOperand(1);
18276 SDValue Base = N->getOperand(2);
18277 SDValue Offset = N->getOperand(3);
18278 SDValue Ty = N->getOperand(4);
18280 EVT ResVT = N->getValueType(0);
18282 const auto OffsetOpc = Offset.getOpcode();
18283 const bool OffsetIsZExt =
18284 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
18285 const bool OffsetIsSExt =
18286 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
18288 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
18289 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
18290 SDValue ExtPg = Offset.getOperand(0);
18291 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
18292 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
18294 // If the predicate for the sign- or zero-extended offset is the
18295 // same as the predicate used for this load and the sign-/zero-extension
18296 // was from 32 bits, the extension can be folded into the gather's addressing mode.
18297 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
18298 SDValue UnextendedOffset = Offset.getOperand(1);
18300 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
18301 if (Signed)
18302 NewOpc = getSignExtendedGatherOpcode(NewOpc);
18304 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
18305 {Chain, Pg, Base, UnextendedOffset, Ty});
18309 return SDValue();
18312 /// Optimize a vector shift instruction and its operand if the shifted-out
18313 /// bits are not used.
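/// For example (illustrative), in (vlshr (or X, #0x0f), #8) the OR only
/// affects bits that the shift discards, so SimplifyDemandedBits can reduce
/// the shift's operand to plain X.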
18314 static SDValue performVectorShiftCombine(SDNode *N,
18315 const AArch64TargetLowering &TLI,
18316 TargetLowering::DAGCombinerInfo &DCI) {
18317 assert(N->getOpcode() == AArch64ISD::VASHR ||
18318 N->getOpcode() == AArch64ISD::VLSHR);
18320 SDValue Op = N->getOperand(0);
18321 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
18323 unsigned ShiftImm = N->getConstantOperandVal(1);
18324 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
18326 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
18327 APInt DemandedMask = ~ShiftedOutBits;
18329 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
18330 return SDValue(N, 0);
18332 return SDValue();
18335 static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
18336 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
18337 // This transform works in partnership with performSetCCPunpkCombine to
18338 // remove unnecessary transfer of predicates into standard registers and back
18339 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
18340 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
18341 MVT::i1) {
18342 SDValue CC = N->getOperand(0)->getOperand(0);
18343 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
18344 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
18345 DAG.getVectorIdxConstant(0, SDLoc(N)));
18346 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
18349 return SDValue();
18352 /// Target-specific DAG combine function for post-increment LD1 (lane) and
18353 /// post-increment LD1R.
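/// For example (illustrative), a scalar load inserted into lane 1 of a v4i32,
/// with the pointer separately advanced by 4 bytes, can be selected as a
/// single post-indexed "ld1 { v0.s }[1], [x0], #4".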
18354 static SDValue performPostLD1Combine(SDNode *N,
18355 TargetLowering::DAGCombinerInfo &DCI,
18356 bool IsLaneOp) {
18357 if (DCI.isBeforeLegalizeOps())
18358 return SDValue();
18360 SelectionDAG &DAG = DCI.DAG;
18361 EVT VT = N->getValueType(0);
18363 if (!VT.is128BitVector() && !VT.is64BitVector())
18364 return SDValue();
18366 unsigned LoadIdx = IsLaneOp ? 1 : 0;
18367 SDNode *LD = N->getOperand(LoadIdx).getNode();
18368 // If it is not a LOAD, we cannot do this combine.
18369 if (LD->getOpcode() != ISD::LOAD)
18370 return SDValue();
18372 // The vector lane must be a constant in the LD1LANE opcode.
18373 SDValue Lane;
18374 if (IsLaneOp) {
18375 Lane = N->getOperand(2);
18376 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
18377 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
18378 return SDValue();
18381 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
18382 EVT MemVT = LoadSDN->getMemoryVT();
18383 // Check if memory operand is the same type as the vector element.
18384 if (MemVT != VT.getVectorElementType())
18385 return SDValue();
18387 // Check if there are other uses. If so, do not combine as it will introduce
18388 // an extra load.
18389 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
18390 ++UI) {
18391 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
18392 continue;
18393 if (*UI != N)
18394 return SDValue();
18397 SDValue Addr = LD->getOperand(1);
18398 SDValue Vector = N->getOperand(0);
18399 // Search for a use of the address operand that is an increment.
18400 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
18401 Addr.getNode()->use_end(); UI != UE; ++UI) {
18402 SDNode *User = *UI;
18403 if (User->getOpcode() != ISD::ADD
18404 || UI.getUse().getResNo() != Addr.getResNo())
18405 continue;
18407 // If the increment is a constant, it must match the memory ref size.
18408 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
18409 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
18410 uint32_t IncVal = CInc->getZExtValue();
18411 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
18412 if (IncVal != NumBytes)
18413 continue;
18414 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
18417 // To avoid cycle construction, make sure that neither the load nor the add
18418 // is a predecessor of the other or of the Vector.
18419 SmallPtrSet<const SDNode *, 32> Visited;
18420 SmallVector<const SDNode *, 16> Worklist;
18421 Visited.insert(Addr.getNode());
18422 Worklist.push_back(User);
18423 Worklist.push_back(LD);
18424 Worklist.push_back(Vector.getNode());
18425 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
18426 SDNode::hasPredecessorHelper(User, Visited, Worklist))
18427 continue;
18429 SmallVector<SDValue, 8> Ops;
18430 Ops.push_back(LD->getOperand(0)); // Chain
18431 if (IsLaneOp) {
18432 Ops.push_back(Vector); // The vector to be inserted
18433 Ops.push_back(Lane); // The lane to be inserted in the vector
18435 Ops.push_back(Addr);
18436 Ops.push_back(Inc);
18438 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
18439 SDVTList SDTys = DAG.getVTList(Tys);
18440 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
18441 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
18442 MemVT,
18443 LoadSDN->getMemOperand());
18445 // Update the uses.
18446 SDValue NewResults[] = {
18447 SDValue(LD, 0), // The result of load
18448 SDValue(UpdN.getNode(), 2) // Chain
18450 DCI.CombineTo(LD, NewResults);
18451 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
18452 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
18454 break;
18456 return SDValue();
18459 /// Simplify ``Addr`` given that the top byte of it is ignored by HW during
18460 /// address translation.
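/// For example (illustrative), an address computed as
///   (and X, #0x00ffffffffffffff)
/// that is only consumed by loads and stores can be simplified to plain X,
/// since the hardware ignores the masked-off top byte anyway.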
18461 static bool performTBISimplification(SDValue Addr,
18462 TargetLowering::DAGCombinerInfo &DCI,
18463 SelectionDAG &DAG) {
18464 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
18465 KnownBits Known;
18466 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
18467 !DCI.isBeforeLegalizeOps());
18468 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18469 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
18470 DCI.CommitTargetLoweringOpt(TLO);
18471 return true;
18473 return false;
18476 static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
18477 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
18478 "Expected STORE dag node in input!");
18480 if (auto Store = dyn_cast<StoreSDNode>(N)) {
18481 if (!Store->isTruncatingStore() || Store->isIndexed())
18482 return SDValue();
18483 SDValue Ext = Store->getValue();
18484 auto ExtOpCode = Ext.getOpcode();
18485 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
18486 ExtOpCode != ISD::ANY_EXTEND)
18487 return SDValue();
18488 SDValue Orig = Ext->getOperand(0);
18489 if (Store->getMemoryVT() != Orig.getValueType())
18490 return SDValue();
18491 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
18492 Store->getBasePtr(), Store->getMemOperand());
18495 return SDValue();
18498 // Perform TBI simplification if supported by the target, and try to break up
18499 // non-temporal loads larger than 256 bits for odd types so that 256-bit LDNP
18500 // (Q-register pair) load instructions can be selected.
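// For example (illustrative), a non-temporal v20i16 (320-bit) load can be
// split into a 256-bit v16i16 load plus a v4i16 load of the remaining
// elements, which are then concatenated back into the original type.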
18501 static SDValue performLOADCombine(SDNode *N,
18502 TargetLowering::DAGCombinerInfo &DCI,
18503 SelectionDAG &DAG,
18504 const AArch64Subtarget *Subtarget) {
18505 if (Subtarget->supportsAddressTopByteIgnored())
18506 performTBISimplification(N->getOperand(1), DCI, DAG);
18508 LoadSDNode *LD = cast<LoadSDNode>(N);
18509 EVT MemVT = LD->getMemoryVT();
18510 if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
18511 return SDValue(N, 0);
18513 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
18514 MemVT.getSizeInBits() % 256 == 0 ||
18515 256 % MemVT.getScalarSizeInBits() != 0)
18516 return SDValue(N, 0);
18518 SDLoc DL(LD);
18519 SDValue Chain = LD->getChain();
18520 SDValue BasePtr = LD->getBasePtr();
18521 SDNodeFlags Flags = LD->getFlags();
18522 SmallVector<SDValue, 4> LoadOps;
18523 SmallVector<SDValue, 4> LoadOpsChain;
18524 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
18525 // and a scalar/vector load of less than 256 bits. This way we can utilize
18526 // 256-bit loads and reduce the number of load instructions generated.
18527 MVT NewVT =
18528 MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
18529 256 / MemVT.getVectorElementType().getSizeInBits());
18530 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
18531 // Create the 256-bit loads, at byte offsets 0, 32, ..., (Num256Loads - 1) * 32.
18532 for (unsigned I = 0; I < Num256Loads; I++) {
18533 unsigned PtrOffset = I * 32;
18534 SDValue NewPtr = DAG.getMemBasePlusOffset(
18535 BasePtr, TypeSize::Fixed(PtrOffset), DL, Flags);
18536 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
18537 SDValue NewLoad = DAG.getLoad(
18538 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
18539 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
18540 LoadOps.push_back(NewLoad);
18541 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
18544 // Process the remaining bits of the load operation.
18545 // This is done by creating an UNDEF vector to match the size of the
18546 // 256-bit loads and inserting the remaining load into it. We extract the
18547 // original load type at the end using an EXTRACT_SUBVECTOR node.
18548 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
18549 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
18550 MVT RemainingVT = MVT::getVectorVT(
18551 MemVT.getVectorElementType().getSimpleVT(),
18552 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
18553 SDValue NewPtr =
18554 DAG.getMemBasePlusOffset(BasePtr, TypeSize::Fixed(PtrOffset), DL, Flags);
18555 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
18556 SDValue RemainingLoad =
18557 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
18558 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
18559 LD->getMemOperand()->getFlags(), LD->getAAInfo());
18560 SDValue UndefVector = DAG.getUNDEF(NewVT);
18561 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
18562 SDValue ExtendedRemainingLoad =
18563 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
18564 {UndefVector, RemainingLoad, InsertIdx});
18565 LoadOps.push_back(ExtendedRemainingLoad);
18566 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
18567 EVT ConcatVT =
18568 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18569 LoadOps.size() * NewVT.getVectorNumElements());
18570 SDValue ConcatVectors =
18571 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
18572 // Extract the original vector type size.
18573 SDValue ExtractSubVector =
18574 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
18575 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
18576 SDValue TokenFactor =
18577 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
18578 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
18581 static SDValue performSTORECombine(SDNode *N,
18582 TargetLowering::DAGCombinerInfo &DCI,
18583 SelectionDAG &DAG,
18584 const AArch64Subtarget *Subtarget) {
18585 StoreSDNode *ST = cast<StoreSDNode>(N);
18586 SDValue Chain = ST->getChain();
18587 SDValue Value = ST->getValue();
18588 SDValue Ptr = ST->getBasePtr();
18590 // If this is an FP_ROUND followed by a store, fold this into a truncating
18591 // store. We can do this even if this is already a truncstore.
18592 // We purposefully don't care about legality of the nodes here as we know
18593 // they can be split down into something legal.
18594 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
18595 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
18596 Subtarget->useSVEForFixedLengthVectors() &&
18597 Value.getValueType().isFixedLengthVector() &&
18598 Value.getValueType().getFixedSizeInBits() >=
18599 Subtarget->getMinSVEVectorSizeInBits())
18600 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
18601 ST->getMemoryVT(), ST->getMemOperand());
18603 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
18604 return Split;
18606 if (Subtarget->supportsAddressTopByteIgnored() &&
18607 performTBISimplification(N->getOperand(2), DCI, DAG))
18608 return SDValue(N, 0);
18610 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
18611 return Store;
18613 return SDValue();
18616 static SDValue performMSTORECombine(SDNode *N,
18617 TargetLowering::DAGCombinerInfo &DCI,
18618 SelectionDAG &DAG,
18619 const AArch64Subtarget *Subtarget) {
18620 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
18621 SDValue Value = MST->getValue();
18622 SDValue Mask = MST->getMask();
18623 SDLoc DL(N);
18625 // If this is a UZP1 followed by a masked store, fold this into a masked
18626 // truncating store. We can do this even if this is already a masked
18627 // truncstore.
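  // A sketch of the pattern (types illustrative): a masked store of
  //   nxv16i8 (uzp1 (bitcast nxv8i16 X to nxv16i8), ...)
  // whose ptrue predicate only covers the elements that come from X can be
  // rewritten as a truncating masked store of X itself, using a predicate
  // with half as many (double-width) elements.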
18628 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
18629 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
18630 Value.getValueType().isInteger()) {
18631 Value = Value.getOperand(0);
18632 if (Value.getOpcode() == ISD::BITCAST) {
18633 EVT HalfVT =
18634 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
18635 EVT InVT = Value.getOperand(0).getValueType();
18637 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
18638 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
18639 unsigned PgPattern = Mask->getConstantOperandVal(0);
18641 // Ensure we can double the size of the predicate pattern
18642 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
18643 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
18644 MinSVESize) {
18645 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
18646 PgPattern);
18647 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
18648 MST->getBasePtr(), MST->getOffset(), Mask,
18649 MST->getMemoryVT(), MST->getMemOperand(),
18650 MST->getAddressingMode(),
18651 /*IsTruncating=*/true);
18657 return SDValue();
18660 /// \return true if part of the index was folded into the Base.
18661 static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
18662 SDLoc DL, SelectionDAG &DAG) {
18663 // This function assumes a vector of i64 indices.
18664 EVT IndexVT = Index.getValueType();
18665 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
18666 return false;
18668 // Simplify:
18669 // BasePtr = Ptr
18670 // Index = X + splat(Offset)
18671 // ->
18672 // BasePtr = Ptr + Offset * scale.
18673 // Index = X
18674 if (Index.getOpcode() == ISD::ADD) {
18675 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
18676 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
18677 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
18678 Index = Index.getOperand(0);
18679 return true;
18683 // Simplify:
18684 // BasePtr = Ptr
18685 // Index = (X + splat(Offset)) << splat(Shift)
18686 // ->
18687 // BasePtr = Ptr + (Offset << Shift) * scale
18688 // Index = X << splat(shift)
18689 if (Index.getOpcode() == ISD::SHL &&
18690 Index.getOperand(0).getOpcode() == ISD::ADD) {
18691 SDValue Add = Index.getOperand(0);
18692 SDValue ShiftOp = Index.getOperand(1);
18693 SDValue OffsetOp = Add.getOperand(1);
18694 if (auto Shift = DAG.getSplatValue(ShiftOp))
18695 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
18696 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
18697 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
18698 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
18699 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
18700 Add.getOperand(0), ShiftOp);
18701 return true;
18705 return false;
18708 // Analyse the specified address returning true if a more optimal addressing
18709 // mode is available. When returning true all parameters are updated to reflect
18710 // their recommended values.
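// For example (illustrative), a gather whose index is step_vector(i64 4) can
// instead use a 32-bit step_vector(i32 4), enabling the 32-bit scaled-offset
// addressing modes, provided the largest offset reachable for the maximum SVE
// vector length still fits in an i32.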
18711 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
18712 SDValue &BasePtr, SDValue &Index,
18713 SelectionDAG &DAG) {
18714 // Try to iteratively fold parts of the index into the base pointer to
18715 // simplify the index as much as possible.
18716 bool Changed = false;
18717 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
18718 Changed = true;
18720 // Only consider element types that are pointer sized as smaller types can
18721 // be easily promoted.
18722 EVT IndexVT = Index.getValueType();
18723 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
18724 return Changed;
18726 // Can indices be trivially shrunk?
18727 EVT DataVT = N->getOperand(1).getValueType();
18728 // Don't attempt to shrink the index for fixed vectors of 64-bit data, since it
18729 // will later be re-extended to 64 bits in legalization.
18730 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
18731 return Changed;
18732 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
18733 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
18734 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
18735 return true;
18738 // Match:
18739 // Index = step(const)
18740 int64_t Stride = 0;
18741 if (Index.getOpcode() == ISD::STEP_VECTOR) {
18742 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
18744 // Match:
18745 // Index = step(const) << shift(const)
18746 else if (Index.getOpcode() == ISD::SHL &&
18747 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
18748 SDValue RHS = Index.getOperand(1);
18749 if (auto *Shift =
18750 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
18751 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
18752 Stride = Step << Shift->getZExtValue();
18756 // Return early because no supported pattern is found.
18757 if (Stride == 0)
18758 return Changed;
18760 if (Stride < std::numeric_limits<int32_t>::min() ||
18761 Stride > std::numeric_limits<int32_t>::max())
18762 return Changed;
18764 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
18765 unsigned MaxVScale =
18766 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
18767 int64_t LastElementOffset =
18768 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
18770 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
18771 LastElementOffset > std::numeric_limits<int32_t>::max())
18772 return Changed;
18774 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
18775 // Stride is not scaled by 'Scale' explicitly here, because that scaling
18776 // happens as part of the gather/scatter addressing mode.
18777 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
18778 return true;
18781 static SDValue performMaskedGatherScatterCombine(
18782 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
18783 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
18784 assert(MGS && "Can only combine gather load or scatter store nodes");
18786 if (!DCI.isBeforeLegalize())
18787 return SDValue();
18789 SDLoc DL(MGS);
18790 SDValue Chain = MGS->getChain();
18791 SDValue Scale = MGS->getScale();
18792 SDValue Index = MGS->getIndex();
18793 SDValue Mask = MGS->getMask();
18794 SDValue BasePtr = MGS->getBasePtr();
18795 ISD::MemIndexType IndexType = MGS->getIndexType();
18797 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
18798 return SDValue();
18800 // A more optimal index was found, so rebuild the gather/scatter node with the
18801 // updated operands to use an Index that's more legalisation friendly.
18802 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
18803 SDValue PassThru = MGT->getPassThru();
18804 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
18805 return DAG.getMaskedGather(
18806 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
18807 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
18809 auto *MSC = cast<MaskedScatterSDNode>(MGS);
18810 SDValue Data = MSC->getValue();
18811 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
18812 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
18813 Ops, MSC->getMemOperand(), IndexType,
18814 MSC->isTruncatingStore());
18817 /// Target-specific DAG combine function for NEON load/store intrinsics
18818 /// to merge base address updates.
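/// For example (illustrative), an @llvm.aarch64.neon.ld2 of two v4i32 vectors
/// whose address is subsequently advanced by 32 bytes (the size of the two
/// vectors) can be selected as a post-indexed "ld2 { v0.4s, v1.4s }, [x0], #32".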
18819 static SDValue performNEONPostLDSTCombine(SDNode *N,
18820 TargetLowering::DAGCombinerInfo &DCI,
18821 SelectionDAG &DAG) {
18822 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
18823 return SDValue();
18825 unsigned AddrOpIdx = N->getNumOperands() - 1;
18826 SDValue Addr = N->getOperand(AddrOpIdx);
18828 // Search for a use of the address operand that is an increment.
18829 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
18830 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
18831 SDNode *User = *UI;
18832 if (User->getOpcode() != ISD::ADD ||
18833 UI.getUse().getResNo() != Addr.getResNo())
18834 continue;
18836 // Check that the add is independent of the load/store. Otherwise, folding
18837 // it would create a cycle.
18838 SmallPtrSet<const SDNode *, 32> Visited;
18839 SmallVector<const SDNode *, 16> Worklist;
18840 Visited.insert(Addr.getNode());
18841 Worklist.push_back(N);
18842 Worklist.push_back(User);
18843 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
18844 SDNode::hasPredecessorHelper(User, Visited, Worklist))
18845 continue;
18847 // Find the new opcode for the updating load/store.
18848 bool IsStore = false;
18849 bool IsLaneOp = false;
18850 bool IsDupOp = false;
18851 unsigned NewOpc = 0;
18852 unsigned NumVecs = 0;
18853 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
18854 switch (IntNo) {
18855 default: llvm_unreachable("unexpected intrinsic for Neon base update");
18856 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
18857 NumVecs = 2; break;
18858 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
18859 NumVecs = 3; break;
18860 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
18861 NumVecs = 4; break;
18862 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
18863 NumVecs = 2; IsStore = true; break;
18864 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
18865 NumVecs = 3; IsStore = true; break;
18866 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
18867 NumVecs = 4; IsStore = true; break;
18868 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
18869 NumVecs = 2; break;
18870 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
18871 NumVecs = 3; break;
18872 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
18873 NumVecs = 4; break;
18874 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
18875 NumVecs = 2; IsStore = true; break;
18876 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
18877 NumVecs = 3; IsStore = true; break;
18878 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
18879 NumVecs = 4; IsStore = true; break;
18880 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
18881 NumVecs = 2; IsDupOp = true; break;
18882 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
18883 NumVecs = 3; IsDupOp = true; break;
18884 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
18885 NumVecs = 4; IsDupOp = true; break;
18886 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
18887 NumVecs = 2; IsLaneOp = true; break;
18888 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
18889 NumVecs = 3; IsLaneOp = true; break;
18890 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
18891 NumVecs = 4; IsLaneOp = true; break;
18892 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
18893 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
18894 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
18895 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
18896 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
18897 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
18900 EVT VecTy;
18901 if (IsStore)
18902 VecTy = N->getOperand(2).getValueType();
18903 else
18904 VecTy = N->getValueType(0);
18906 // If the increment is a constant, it must match the memory ref size.
18907 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
18908 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
18909 uint32_t IncVal = CInc->getZExtValue();
18910 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
18911 if (IsLaneOp || IsDupOp)
18912 NumBytes /= VecTy.getVectorNumElements();
18913 if (IncVal != NumBytes)
18914 continue;
18915 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
18917 SmallVector<SDValue, 8> Ops;
18918 Ops.push_back(N->getOperand(0)); // Incoming chain
18919 // Lane loads and stores take a vector list as input.
18920 if (IsLaneOp || IsStore)
18921 for (unsigned i = 2; i < AddrOpIdx; ++i)
18922 Ops.push_back(N->getOperand(i));
18923 Ops.push_back(Addr); // Base register
18924 Ops.push_back(Inc);
18926 // Return Types.
18927 EVT Tys[6];
18928 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
18929 unsigned n;
18930 for (n = 0; n < NumResultVecs; ++n)
18931 Tys[n] = VecTy;
18932 Tys[n++] = MVT::i64; // Type of write back register
18933 Tys[n] = MVT::Other; // Type of the chain
18934 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
18936 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
18937 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
18938 MemInt->getMemoryVT(),
18939 MemInt->getMemOperand());
18941 // Update the uses.
18942 std::vector<SDValue> NewResults;
18943 for (unsigned i = 0; i < NumResultVecs; ++i) {
18944 NewResults.push_back(SDValue(UpdN.getNode(), i));
18946 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
18947 DCI.CombineTo(N, NewResults);
18948 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
18950 break;
18952 return SDValue();
18955 // Checks to see if the value is the prescribed width and returns information
18956 // about its extension mode.
18957 static
18958 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
18959 ExtType = ISD::NON_EXTLOAD;
18960 switch(V.getNode()->getOpcode()) {
18961 default:
18962 return false;
18963 case ISD::LOAD: {
18964 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
18965 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
18966 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
18967 ExtType = LoadNode->getExtensionType();
18968 return true;
18970 return false;
18972 case ISD::AssertSext: {
18973 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
18974 if ((TypeNode->getVT() == MVT::i8 && width == 8)
18975 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
18976 ExtType = ISD::SEXTLOAD;
18977 return true;
18979 return false;
18981 case ISD::AssertZext: {
18982 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
18983 if ((TypeNode->getVT() == MVT::i8 && width == 8)
18984 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
18985 ExtType = ISD::ZEXTLOAD;
18986 return true;
18988 return false;
18990 case ISD::Constant:
18991 case ISD::TargetConstant: {
18992 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
18993 1LL << (width - 1);
18997 return true;
19000 // This function does a whole lot of voodoo to determine if the tests are
19001 // equivalent without and with a mask. Essentially what happens is that given a
19002 // DAG resembling:
19004 // +-------------+ +-------------+ +-------------+ +-------------+
19005 // | Input | | AddConstant | | CompConstant| | CC |
19006 // +-------------+ +-------------+ +-------------+ +-------------+
19007 // | | | |
19008 // V V | +----------+
19009 // +-------------+ +----+ | |
19010 // | ADD | |0xff| | |
19011 // +-------------+ +----+ | |
19012 // | | | |
19013 // V V | |
19014 // +-------------+ | |
19015 // | AND | | |
19016 // +-------------+ | |
19017 // | | |
19018 // +-----+ | |
19019 // | | |
19020 // V V V
19021 // +-------------+
19022 // | CMP |
19023 // +-------------+
19025 // The AND node may be safely removed for some combinations of inputs. In
19026 // particular we need to take into account the extension type of the Input,
19027 // the exact values of AddConstant, CompConstant, and CC, along with the nominal
19028 // width of the input (this can work for inputs of any width; the above graph
19029 // is specific to 8 bits).
19031 // The specific equations were worked out by generating output tables for each
19032 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
19033 // problem was simplified by working with 4-bit inputs, which means we only
19034 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
19035 // extension (8..15), 8 patterns unique to sign extension (-8..-1), and 8
19036 // patterns present in both extensions (0..7). For every distinct pair of
19037 // AddConstant and CompConstant bit patterns we can consider the masked and
19038 // unmasked versions to be equivalent if the result of this function is true
19039 // for all 16 distinct bit patterns for the current extension type of Input (w0).
19041 // sub w8, w0, w1
19042 // and w10, w8, #0x0f
19043 // cmp w8, w2
19044 // cset w9, AArch64CC
19045 // cmp w10, w2
19046 // cset w11, AArch64CC
19047 // cmp w9, w11
19048 // cset w0, eq
19049 // ret
19051 // Since the above function shows when the outputs are equivalent it defines
19052 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
19053 // would be expensive to run during compiles. The equations below were written
19054 // in a test harness that confirmed they give outputs equivalent to the above
19055 // function for all inputs, so they can be used to determine whether the
19056 // removal is legal instead.
19058 // isEquivalentMaskless() is the test for whether the AND can be removed,
19059 // factored out of the DAG recognition because the DAG can take several forms.
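//
// A hand-worked example (illustrative): with width == 8, a zero-extending load
// (so no adjustment of AddConstant below), CC == EQ or NE, AddConstant == 1 and
// CompConstant == -1, the first clause of the EQ/NE case
// (AddConstant > 0 && CompConstant < 0) holds, so isEquivalentMaskless returns
// true and the AND may be removed.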
19061 static bool isEquivalentMaskless(unsigned CC, unsigned width,
19062 ISD::LoadExtType ExtType, int AddConstant,
19063 int CompConstant) {
19064 // By being careful about our equations and only writing them in terms of
19065 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
19066 // make them generally applicable to all bit widths.
19067 int MaxUInt = (1 << width);
19069 // For the purposes of these comparisons sign extending the type is
19070 // equivalent to zero extending the add and displacing it by half the integer
19071 // width. Provided we are careful and make sure our equations are valid over
19072 // the whole range we can just adjust the input and avoid writing equations
19073 // for sign extended inputs.
19074 if (ExtType == ISD::SEXTLOAD)
19075 AddConstant -= (1 << (width-1));
19077 switch(CC) {
19078 case AArch64CC::LE:
19079 case AArch64CC::GT:
19080 if ((AddConstant == 0) ||
19081 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
19082 (AddConstant >= 0 && CompConstant < 0) ||
19083 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
19084 return true;
19085 break;
19086 case AArch64CC::LT:
19087 case AArch64CC::GE:
19088 if ((AddConstant == 0) ||
19089 (AddConstant >= 0 && CompConstant <= 0) ||
19090 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
19091 return true;
19092 break;
19093 case AArch64CC::HI:
19094 case AArch64CC::LS:
19095 if ((AddConstant >= 0 && CompConstant < 0) ||
19096 (AddConstant <= 0 && CompConstant >= -1 &&
19097 CompConstant < AddConstant + MaxUInt))
19098 return true;
19099 break;
19100 case AArch64CC::PL:
19101 case AArch64CC::MI:
19102 if ((AddConstant == 0) ||
19103 (AddConstant > 0 && CompConstant <= 0) ||
19104 (AddConstant < 0 && CompConstant <= AddConstant))
19105 return true;
19106 break;
19107 case AArch64CC::LO:
19108 case AArch64CC::HS:
19109 if ((AddConstant >= 0 && CompConstant <= 0) ||
19110 (AddConstant <= 0 && CompConstant >= 0 &&
19111 CompConstant <= AddConstant + MaxUInt))
19112 return true;
19113 break;
19114 case AArch64CC::EQ:
19115 case AArch64CC::NE:
19116 if ((AddConstant > 0 && CompConstant < 0) ||
19117 (AddConstant < 0 && CompConstant >= 0 &&
19118 CompConstant < AddConstant + MaxUInt) ||
19119 (AddConstant >= 0 && CompConstant >= 0 &&
19120 CompConstant >= AddConstant) ||
19121 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
19122 return true;
19123 break;
19124 case AArch64CC::VS:
19125 case AArch64CC::VC:
19126 case AArch64CC::AL:
19127 case AArch64CC::NV:
19128 return true;
19129 case AArch64CC::Invalid:
19130 break;
19133 return false;
19136 static
19137 SDValue performCONDCombine(SDNode *N,
19138 TargetLowering::DAGCombinerInfo &DCI,
19139 SelectionDAG &DAG, unsigned CCIndex,
19140 unsigned CmpIndex) {
19141 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
19142 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
19143 unsigned CondOpcode = SubsNode->getOpcode();
19145 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0))
19146 return SDValue();
19148 // There is a SUBS feeding this condition. Is it fed by a mask we can
19149 // use?
19151 SDNode *AndNode = SubsNode->getOperand(0).getNode();
19152 unsigned MaskBits = 0;
19154 if (AndNode->getOpcode() != ISD::AND)
19155 return SDValue();
19157 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
19158 uint32_t CNV = CN->getZExtValue();
19159 if (CNV == 255)
19160 MaskBits = 8;
19161 else if (CNV == 65535)
19162 MaskBits = 16;
19165 if (!MaskBits)
19166 return SDValue();
19168 SDValue AddValue = AndNode->getOperand(0);
19170 if (AddValue.getOpcode() != ISD::ADD)
19171 return SDValue();
19173 // The basic dag structure is correct, grab the inputs and validate them.
19175 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
19176 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
19177 SDValue SubsInputValue = SubsNode->getOperand(1);
19179 // The mask is present and the provenance of all the values is a smaller type,
19180 // let's see whether the mask is superfluous.
19182 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
19183 !isa<ConstantSDNode>(SubsInputValue.getNode()))
19184 return SDValue();
19186 ISD::LoadExtType ExtType;
19188 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
19189 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
19190 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
19191 return SDValue();
19193 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
19194 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
19195 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
19196 return SDValue();
19198 // The AND is not necessary, remove it.
19200 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
19201 SubsNode->getValueType(1));
19202 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
19204 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
19205 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
19207 return SDValue(N, 0);
19210 // Optimize compare with zero and branch.
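// For example (illustrative): a BRCOND on (SUBS x, 0) with condition NE becomes
// (CBNZ x, dest), and with condition EQ becomes (CBZ x, dest).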
19211 static SDValue performBRCONDCombine(SDNode *N,
19212 TargetLowering::DAGCombinerInfo &DCI,
19213 SelectionDAG &DAG) {
19214 MachineFunction &MF = DAG.getMachineFunction();
19215 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
19216 // will not be produced, as they are conditional branch instructions that do
19217 // not set flags.
19218 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
19219 return SDValue();
19221 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
19222 N = NV.getNode();
19223 SDValue Chain = N->getOperand(0);
19224 SDValue Dest = N->getOperand(1);
19225 SDValue CCVal = N->getOperand(2);
19226 SDValue Cmp = N->getOperand(3);
19228 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
19229 unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
19230 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
19231 return SDValue();
19233 unsigned CmpOpc = Cmp.getOpcode();
19234 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
19235 return SDValue();
19237 // Only attempt folding if there is only one use of the flag and no use of the
19238 // value.
19239 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
19240 return SDValue();
19242 SDValue LHS = Cmp.getOperand(0);
19243 SDValue RHS = Cmp.getOperand(1);
19245 assert(LHS.getValueType() == RHS.getValueType() &&
19246 "Expected the value type to be the same for both operands!");
19247 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
19248 return SDValue();
19250 if (isNullConstant(LHS))
19251 std::swap(LHS, RHS);
19253 if (!isNullConstant(RHS))
19254 return SDValue();
19256 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
19257 LHS.getOpcode() == ISD::SRL)
19258 return SDValue();
19260 // Fold the compare into the branch instruction.
19261 SDValue BR;
19262 if (CC == AArch64CC::EQ)
19263 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
19264 else
19265 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
19267 // Do not add new nodes to DAG combiner worklist.
19268 DCI.CombineTo(N, BR, false);
19270 return SDValue();
19273 static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
19274 unsigned CC = N->getConstantOperandVal(2);
19275 SDValue SUBS = N->getOperand(3);
19276 SDValue Zero, CTTZ;
19278 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
19279 Zero = N->getOperand(0);
19280 CTTZ = N->getOperand(1);
19281 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
19282 Zero = N->getOperand(1);
19283 CTTZ = N->getOperand(0);
19284 } else
19285 return SDValue();
19287 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
19288 (CTTZ.getOpcode() == ISD::TRUNCATE &&
19289 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
19290 return SDValue();
19292 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
19293 "Illegal type in CTTZ folding");
19295 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
19296 return SDValue();
19298 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
19299 ? CTTZ.getOperand(0).getOperand(0)
19300 : CTTZ.getOperand(0);
19302 if (X != SUBS.getOperand(0))
19303 return SDValue();
19305 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
19306 ? CTTZ.getOperand(0).getValueSizeInBits()
19307 : CTTZ.getValueSizeInBits();
19308 SDValue BitWidthMinusOne =
19309 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
19310 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
19311 BitWidthMinusOne);
19314 // (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
19315 // (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
19316 // Where x and y are constants
19318 // (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
19319 // (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
19320 // Where x and y are constants
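//
// A concrete instance (illustrative): (CSEL l r EQ (CMP (CSEL 1 0 GT cond) 0))
// compares against y (the constant 0), so cc2 (GT) is inverted and the result
// is (CSEL l r LE cond).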
19321 static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
19322 SDValue L = Op->getOperand(0);
19323 SDValue R = Op->getOperand(1);
19324 AArch64CC::CondCode OpCC =
19325 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
19327 SDValue OpCmp = Op->getOperand(3);
19328 if (!isCMP(OpCmp))
19329 return SDValue();
19331 SDValue CmpLHS = OpCmp.getOperand(0);
19332 SDValue CmpRHS = OpCmp.getOperand(1);
19334 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
19335 std::swap(CmpLHS, CmpRHS);
19336 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
19337 return SDValue();
19339 SDValue X = CmpLHS->getOperand(0);
19340 SDValue Y = CmpLHS->getOperand(1);
19341 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y)) {
19342 return SDValue();
19345 AArch64CC::CondCode CC =
19346 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
19347 SDValue Cond = CmpLHS->getOperand(3);
19349 if (CmpRHS == Y)
19350 CC = AArch64CC::getInvertedCondCode(CC);
19351 else if (CmpRHS != X)
19352 return SDValue();
19354 if (OpCC == AArch64CC::NE)
19355 CC = AArch64CC::getInvertedCondCode(CC);
19356 else if (OpCC != AArch64CC::EQ)
19357 return SDValue();
19359 SDLoc DL(Op);
19360 EVT VT = Op->getValueType(0);
19362 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
19363 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
19366 // Optimize CSEL instructions
19367 static SDValue performCSELCombine(SDNode *N,
19368 TargetLowering::DAGCombinerInfo &DCI,
19369 SelectionDAG &DAG) {
19370 // CSEL x, x, cc -> x
19371 if (N->getOperand(0) == N->getOperand(1))
19372 return N->getOperand(0);
19374 if (SDValue R = foldCSELOfCSEL(N, DAG))
19375 return R;
19377 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
19378 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
19379 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
19380 return Folded;
19382 return performCONDCombine(N, DCI, DAG, 2, 3);
19385 // Try to re-use an already extended operand of a vector SetCC feeding an
19386 // extended select. Doing so avoids requiring another full extension of the
19387 // SET_CC result when lowering the select.
19388 static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
19389 EVT Op0MVT = Op->getOperand(0).getValueType();
19390 if (!Op0MVT.isVector() || Op->use_empty())
19391 return SDValue();
19393 // Make sure that all uses of Op are VSELECTs with result matching types where
19394 // the result type has a larger element type than the SetCC operand.
19395 SDNode *FirstUse = *Op->use_begin();
19396 if (FirstUse->getOpcode() != ISD::VSELECT)
19397 return SDValue();
19398 EVT UseMVT = FirstUse->getValueType(0);
19399 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
19400 return SDValue();
19401 if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
19402 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
19404 return SDValue();
19406 APInt V;
19407 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
19408 return SDValue();
19410 SDLoc DL(Op);
19411 SDValue Op0ExtV;
19412 SDValue Op1ExtV;
19413 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
19414 // Check if the first operand of the SET_CC is already extended. If it is,
19415 // split the SET_CC and re-use the extended version of the operand.
19416 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
19417 Op->getOperand(0));
19418 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
19419 Op->getOperand(0));
19420 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
19421 Op0ExtV = SDValue(Op0SExt, 0);
19422 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
19423 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
19424 Op0ExtV = SDValue(Op0ZExt, 0);
19425 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
19426 } else
19427 return SDValue();
19429 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
19430 Op0ExtV, Op1ExtV, Op->getOperand(2));
19433 static SDValue performSETCCCombine(SDNode *N,
19434 TargetLowering::DAGCombinerInfo &DCI,
19435 SelectionDAG &DAG) {
19436 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
19437 SDValue LHS = N->getOperand(0);
19438 SDValue RHS = N->getOperand(1);
19439 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
19440 SDLoc DL(N);
19441 EVT VT = N->getValueType(0);
19443 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
19444 return V;
19446 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
19447 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
19448 LHS->getOpcode() == AArch64ISD::CSEL &&
19449 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
19450 LHS->hasOneUse()) {
19451 // Invert CSEL's condition.
19452 auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2));
19453 auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue());
19454 auto NewCond = getInvertedCondCode(OldCond);
19456 // csel 0, 1, !cond, X
19457 SDValue CSEL =
19458 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
19459 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
19460 LHS.getOperand(3));
19461 return DAG.getZExtOrTrunc(CSEL, DL, VT);
19464 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
19465 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
19466 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
19467 LHS->hasOneUse()) {
19468 EVT TstVT = LHS->getValueType(0);
19469 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
19470 // This pattern will be optimized better in emitComparison.
19471 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
19472 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
19473 DAG.getConstant(TstImm, DL, TstVT));
19474 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
19478 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
19479 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
19480 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
19481 (Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
19482 LHS->getOpcode() == ISD::BITCAST) {
19483 EVT ToVT = LHS->getValueType(0);
19484 EVT FromVT = LHS->getOperand(0).getValueType();
19485 if (FromVT.isFixedLengthVector() &&
19486 FromVT.getVectorElementType() == MVT::i1) {
19487 LHS = DAG.getNode(ISD::VECREDUCE_OR, DL, MVT::i1, LHS->getOperand(0));
19488 LHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ToVT, LHS);
19489 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
19493 return SDValue();
19496 // Replace a flag-setting operator (eg ANDS) with the generic version
19497 // (eg AND) if the flag is unused.
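// For example (illustrative): an (ANDS x, y) whose NZCV result is unused is
// rewritten as (AND x, y) with a constant 0 standing in for the dead flag
// value, while an existing identical (AND x, y) node has its uses redirected to
// this node's value result.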
19498 static SDValue performFlagSettingCombine(SDNode *N,
19499 TargetLowering::DAGCombinerInfo &DCI,
19500 unsigned GenericOpcode) {
19501 SDLoc DL(N);
19502 SDValue LHS = N->getOperand(0);
19503 SDValue RHS = N->getOperand(1);
19504 EVT VT = N->getValueType(0);
19506 // If the flag result isn't used, convert back to a generic opcode.
19507 if (!N->hasAnyUseOfValue(1)) {
19508 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
19509 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
19510 DL);
19513 // Combine identical generic nodes into this node, re-using the result.
19514 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
19515 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
19516 DCI.CombineTo(Generic, SDValue(N, 0));
19518 return SDValue();
19521 static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
19522 // setcc_merge_zero pred
19523 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
19524 // => extract_subvector (inner setcc_merge_zero)
19525 SDValue Pred = N->getOperand(0);
19526 SDValue LHS = N->getOperand(1);
19527 SDValue RHS = N->getOperand(2);
19528 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
19530 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
19531 LHS->getOpcode() != ISD::SIGN_EXTEND)
19532 return SDValue();
19534 SDValue Extract = LHS->getOperand(0);
19535 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19536 Extract->getValueType(0) != N->getValueType(0) ||
19537 Extract->getConstantOperandVal(1) != 0)
19538 return SDValue();
19540 SDValue InnerSetCC = Extract->getOperand(0);
19541 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
19542 return SDValue();
19544 // By this point we've effectively got
19545 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
19546 // lanes are already zero then the trunc(sext()) sequence is redundant and we
19547 // can operate on A directly.
19548 SDValue InnerPred = InnerSetCC.getOperand(0);
19549 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
19550 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
19551 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
19552 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
19553 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
19554 return Extract;
19556 return SDValue();
19559 static SDValue
19560 performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
19561 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
19562 "Unexpected opcode!");
19564 SelectionDAG &DAG = DCI.DAG;
19565 SDValue Pred = N->getOperand(0);
19566 SDValue LHS = N->getOperand(1);
19567 SDValue RHS = N->getOperand(2);
19568 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
19570 if (SDValue V = performSetCCPunpkCombine(N, DAG))
19571 return V;
19573 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
19574 LHS->getOpcode() == ISD::SIGN_EXTEND &&
19575 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
19576 // setcc_merge_zero(
19577 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
19578 // => setcc_merge_zero(pred, ...)
19579 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
19580 LHS->getOperand(0)->getOperand(0) == Pred)
19581 return LHS->getOperand(0);
19583 // setcc_merge_zero(
19584 // all_active, extend(nxvNi1 ...), != splat(0))
19585 // -> nxvNi1 ...
19586 if (isAllActivePredicate(DAG, Pred))
19587 return LHS->getOperand(0);
19589 // setcc_merge_zero(
19590 // pred, extend(nxvNi1 ...), != splat(0))
19591 // -> nxvNi1 and(pred, ...)
19592 if (DCI.isAfterLegalizeDAG())
19593 // Do this after legalization to allow more folds on setcc_merge_zero
19594 // to be recognized.
19595 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
19596 LHS->getOperand(0), Pred);
19599 return SDValue();
19602 // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
19603 // as well as whether the test should be inverted. This code is required to
19604 // catch these cases (as opposed to standard dag combines) because
19605 // AArch64ISD::TBZ is matched during legalization.
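//
// For example (illustrative): (tbz (srl x, 3), 1) tests bit 1 + 3 == 4 of x and
// so becomes (tbz x, 4), while (tbz (xor x, -1), b) becomes (tbnz x, b) via the
// Invert flag.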
19606 static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
19607 SelectionDAG &DAG) {
19609 if (!Op->hasOneUse())
19610 return Op;
19612 // We don't handle undef/constant-fold cases below, as they should have
19613 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
19614 // etc.)
19616 // (tbz (trunc x), b) -> (tbz x, b)
19617 // This case is just here to enable more of the below cases to be caught.
19618 if (Op->getOpcode() == ISD::TRUNCATE &&
19619 Bit < Op->getValueType(0).getSizeInBits()) {
19620 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
19623 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
19624 if (Op->getOpcode() == ISD::ANY_EXTEND &&
19625 Bit < Op->getOperand(0).getValueSizeInBits()) {
19626 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
19629 if (Op->getNumOperands() != 2)
19630 return Op;
19632 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
19633 if (!C)
19634 return Op;
19636 switch (Op->getOpcode()) {
19637 default:
19638 return Op;
19640 // (tbz (and x, m), b) -> (tbz x, b)
19641 case ISD::AND:
19642 if ((C->getZExtValue() >> Bit) & 1)
19643 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
19644 return Op;
19646 // (tbz (shl x, c), b) -> (tbz x, b-c)
19647 case ISD::SHL:
19648 if (C->getZExtValue() <= Bit &&
19649 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
19650 Bit = Bit - C->getZExtValue();
19651 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
19653 return Op;
19655 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
19656 case ISD::SRA:
19657 Bit = Bit + C->getZExtValue();
19658 if (Bit >= Op->getValueType(0).getSizeInBits())
19659 Bit = Op->getValueType(0).getSizeInBits() - 1;
19660 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
19662 // (tbz (srl x, c), b) -> (tbz x, b+c)
19663 case ISD::SRL:
19664 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
19665 Bit = Bit + C->getZExtValue();
19666 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
19668 return Op;
19670 // (tbz (xor x, -1), b) -> (tbnz x, b)
19671 case ISD::XOR:
19672 if ((C->getZExtValue() >> Bit) & 1)
19673 Invert = !Invert;
19674 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
19678 // Optimize test single bit zero/non-zero and branch.
19679 static SDValue performTBZCombine(SDNode *N,
19680 TargetLowering::DAGCombinerInfo &DCI,
19681 SelectionDAG &DAG) {
19682 unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
19683 bool Invert = false;
19684 SDValue TestSrc = N->getOperand(1);
19685 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
19687 if (TestSrc == NewTestSrc)
19688 return SDValue();
19690 unsigned NewOpc = N->getOpcode();
19691 if (Invert) {
19692 if (NewOpc == AArch64ISD::TBZ)
19693 NewOpc = AArch64ISD::TBNZ;
19694 else {
19695 assert(NewOpc == AArch64ISD::TBNZ);
19696 NewOpc = AArch64ISD::TBZ;
19700 SDLoc DL(N);
19701 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
19702 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
19705 // Swap vselect operands where it may allow a predicated operation to achieve
19706 // the `sel`.
19708 // (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
19709 // => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
19710 static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
19711 auto SelectA = N->getOperand(1);
19712 auto SelectB = N->getOperand(2);
19713 auto NTy = N->getValueType(0);
19715 if (!NTy.isScalableVector())
19716 return SDValue();
19717 SDValue SetCC = N->getOperand(0);
19718 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
19719 return SDValue();
19721 switch (SelectB.getOpcode()) {
19722 default:
19723 return SDValue();
19724 case ISD::FMUL:
19725 case ISD::FSUB:
19726 case ISD::FADD:
19727 break;
19729 if (SelectA != SelectB.getOperand(0))
19730 return SDValue();
19732 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
19733 ISD::CondCode InverseCC =
19734 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
19735 auto InverseSetCC =
19736 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
19737 SetCC.getOperand(1), InverseCC);
19739 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
19740 {InverseSetCC, SelectB, SelectA});
19743 // vselect (v1i1 setcc) ->
19744 // vselect (v1iXX setcc) (XX is the size of the compared operand type)
19745 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
19746 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
19747 // such VSELECT.
19748 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
19749 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
19750 return SwapResult;
19752 SDValue N0 = N->getOperand(0);
19753 EVT CCVT = N0.getValueType();
19755 if (isAllActivePredicate(DAG, N0))
19756 return N->getOperand(1);
19758 if (isAllInactivePredicate(N0))
19759 return N->getOperand(2);
19761 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
19762 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
19763 // supported types.
19764 SDValue SetCC = N->getOperand(0);
19765 if (SetCC.getOpcode() == ISD::SETCC &&
19766 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
19767 SDValue CmpLHS = SetCC.getOperand(0);
19768 EVT VT = CmpLHS.getValueType();
19769 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
19770 SDNode *SplatLHS = N->getOperand(1).getNode();
19771 SDNode *SplatRHS = N->getOperand(2).getNode();
19772 APInt SplatLHSVal;
19773 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
19774 VT.isSimple() &&
19775 is_contained(
19776 makeArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
19777 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
19778 VT.getSimpleVT().SimpleTy) &&
19779 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
19780 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
19781 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
19782 unsigned NumElts = VT.getVectorNumElements();
19783 SmallVector<SDValue, 8> Ops(
19784 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
19785 VT.getScalarType()));
19786 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
19788 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
19789 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
19790 return Or;
19794 if (N0.getOpcode() != ISD::SETCC ||
19795 CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
19796 CCVT.getVectorElementType() != MVT::i1)
19797 return SDValue();
19799 EVT ResVT = N->getValueType(0);
19800 EVT CmpVT = N0.getOperand(0).getValueType();
19801 // Only combine when the result type is of the same size as the compared
19802 // operands.
19803 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
19804 return SDValue();
19806 SDValue IfTrue = N->getOperand(1);
19807 SDValue IfFalse = N->getOperand(2);
19808 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
19809 N0.getOperand(0), N0.getOperand(1),
19810 cast<CondCodeSDNode>(N0.getOperand(2))->get());
19811 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
19812 IfTrue, IfFalse);
19815 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
19816 /// the compare-mask instructions rather than going via NZCV, even if LHS and
19817 /// RHS are really scalar. This replaces any scalar setcc in the above pattern
19818 /// with a vector one followed by a DUP shuffle on the result.
19819 static SDValue performSelectCombine(SDNode *N,
19820 TargetLowering::DAGCombinerInfo &DCI) {
19821 SelectionDAG &DAG = DCI.DAG;
19822 SDValue N0 = N->getOperand(0);
19823 EVT ResVT = N->getValueType(0);
19825 if (N0.getOpcode() != ISD::SETCC)
19826 return SDValue();
19828 if (ResVT.isScalableVector())
19829 return SDValue();
19831 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
19832 // scalar SetCCResultType. We also don't expect vectors, because we assume
19833 // that selects fed by vector SETCCs are canonicalized to VSELECT.
19834 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
19835 "Scalar-SETCC feeding SELECT has unexpected result type!");
19837 // If NumMaskElts == 0, the comparison is larger than the select result. The
19838 // largest real NEON comparison is 64 bits per lane, which means the result is
19839 // at most 32 bits and an illegal vector. Just bail out for now.
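// For example (illustrative): a v2i16 select (32 bits) fed by an f64 compare
// yields 32 / 64 == 0 mask elements, so we bail out below.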
19840 EVT SrcVT = N0.getOperand(0).getValueType();
19842 // Don't try to do this optimization when the setcc itself has i1 operands.
19843 // There are no legal vectors of i1, so this would be pointless.
19844 if (SrcVT == MVT::i1)
19845 return SDValue();
19847 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
19848 if (!ResVT.isVector() || NumMaskElts == 0)
19849 return SDValue();
19851 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
19852 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
19854 // Also bail out if the vector CCVT isn't the same size as ResVT.
19855 // This can happen if the SETCC operand size doesn't divide the ResVT size
19856 // (e.g., f64 vs v3f32).
19857 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
19858 return SDValue();
19860 // Make sure we didn't create illegal types, if we're not supposed to.
19861 assert(DCI.isBeforeLegalize() ||
19862 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
19864 // First perform a vector comparison, where lane 0 is the one we're interested
19865 // in.
19866 SDLoc DL(N0);
19867 SDValue LHS =
19868 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
19869 SDValue RHS =
19870 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
19871 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
19873 // Now duplicate the comparison mask we want across all other lanes.
19874 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
19875 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
19876 Mask = DAG.getNode(ISD::BITCAST, DL,
19877 ResVT.changeVectorElementTypeToInteger(), Mask);
19879 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
19882 static SDValue performDUPCombine(SDNode *N,
19883 TargetLowering::DAGCombinerInfo &DCI) {
19884 EVT VT = N->getValueType(0);
19885 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
19886 // 128-bit vector version.
19887 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
19888 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
19889 if (SDNode *LN = DCI.DAG.getNodeIfExists(
19890 N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) {
19891 SDLoc DL(N);
19892 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
19893 DCI.DAG.getConstant(0, DL, MVT::i64));
19897 return performPostLD1Combine(N, DCI, false);
19900 /// Get rid of unnecessary NVCASTs (that don't change the type).
19901 static SDValue performNVCASTCombine(SDNode *N) {
19902 if (N->getValueType(0) == N->getOperand(0).getValueType())
19903 return N->getOperand(0);
19905 return SDValue();
19908 // If all users of the globaladdr are of the form (globaladdr + constant), find
19909 // the smallest constant, fold it into the globaladdr's offset and rewrite the
19910 // globaladdr as (globaladdr + constant) - constant.
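//
// For example (illustrative): if the only uses are (add G, 8) and (add G, 12),
// MinOffset is 8, and (assuming the bounds checks below pass) G is rewritten as
// (sub (globaladdr G + 8), 8).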
19911 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
19912 const AArch64Subtarget *Subtarget,
19913 const TargetMachine &TM) {
19914 auto *GN = cast<GlobalAddressSDNode>(N);
19915 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
19916 AArch64II::MO_NO_FLAG)
19917 return SDValue();
19919 uint64_t MinOffset = -1ull;
19920 for (SDNode *N : GN->uses()) {
19921 if (N->getOpcode() != ISD::ADD)
19922 return SDValue();
19923 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
19924 if (!C)
19925 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
19926 if (!C)
19927 return SDValue();
19928 MinOffset = std::min(MinOffset, C->getZExtValue());
19930 uint64_t Offset = MinOffset + GN->getOffset();
19932 // Require that the new offset is larger than the existing one. Otherwise, we
19933 // can end up oscillating between two possible DAGs, for example,
19934 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
19935 if (Offset <= uint64_t(GN->getOffset()))
19936 return SDValue();
19938 // Check whether folding this offset is legal. It must not go out of bounds of
19939 // the referenced object to avoid violating the code model, and must be
19940 // smaller than 2^20 because this is the largest offset expressible in all
19941 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
19942 // stores an immediate signed 21 bit offset.)
19944 // This check also prevents us from folding negative offsets, which will end
19945 // up being treated in the same way as large positive ones. They could also
19946 // cause code model violations, and aren't really common enough to matter.
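// (A signed 21-bit field covers [-2^20, 2^20 - 1], hence the 1 << 20 bound
// below.)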
19947 if (Offset >= (1 << 20))
19948 return SDValue();
19950 const GlobalValue *GV = GN->getGlobal();
19951 Type *T = GV->getValueType();
19952 if (!T->isSized() ||
19953 Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
19954 return SDValue();
19956 SDLoc DL(GN);
19957 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
19958 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
19959 DAG.getConstant(MinOffset, DL, MVT::i64));
19962 // Turns the vector of indices into a vector of byte offsets by scaling Offset
19963 // by (BitWidth / 8).
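// For example (illustrative): for 32-bit elements each index is shifted left by
// Log2_32(32 / 8) == 2, i.e. multiplied by 4 to give a byte offset.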
19964 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
19965 SDLoc DL, unsigned BitWidth) {
19966 assert(Offset.getValueType().isScalableVector() &&
19967 "This method is only for scalable vectors of offsets");
19969 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
19970 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
19972 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
19975 /// Check if the value of \p OffsetInBytes can be used as an immediate for
19976 /// the gather load/prefetch and scatter store instructions with vector base and
19977 /// immediate offset addressing mode:
19979 /// [<Zn>.[S|D]{, #<imm>}]
19981 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
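///
/// For example (illustrative): with 4-byte elements the valid immediates are
/// 0, 4, 8, ..., 124; an offset of 6 fails the multiple-of-size check and an
/// offset of 132 (4 * 33) fails the range check.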
19982 inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
19983 unsigned ScalarSizeInBytes) {
19984 // The immediate is not a multiple of the scalar size.
19985 if (OffsetInBytes % ScalarSizeInBytes)
19986 return false;
19988 // The immediate is out of range.
19989 if (OffsetInBytes / ScalarSizeInBytes > 31)
19990 return false;
19992 return true;
19995 /// Check if the value of \p Offset represents a valid immediate for the SVE
19996 /// gather load/prefetch and scatter store instructions with vector base and
19997 /// immediate offset addressing mode:
19999 /// [<Zn>.[S|D]{, #<imm>}]
20001 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
20002 static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
20003 unsigned ScalarSizeInBytes) {
20004 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
20005 return OffsetConst && isValidImmForSVEVecImmAddrMode(
20006 OffsetConst->getZExtValue(), ScalarSizeInBytes);
20009 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
20010 unsigned Opcode,
20011 bool OnlyPackedOffsets = true) {
20012 const SDValue Src = N->getOperand(2);
20013 const EVT SrcVT = Src->getValueType(0);
20014 assert(SrcVT.isScalableVector() &&
20015 "Scatter stores are only possible for SVE vectors");
20017 SDLoc DL(N);
20018 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
20020 // Make sure that source data will fit into an SVE register
20021 if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
20022 return SDValue();
20024 // For FPs, ACLE only supports _packed_ single and double precision types.
20025 if (SrcElVT.isFloatingPoint())
20026 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
20027 return SDValue();
20029 // Depending on the addressing mode, this is either a pointer or a vector of
20030 // pointers (that fits into one register)
20031 SDValue Base = N->getOperand(4);
20032 // Depending on the addressing mode, this is either a single offset or a
20033 // vector of offsets (that fits into one register)
20034 SDValue Offset = N->getOperand(5);
20036 // For "scalar + vector of indices", just scale the indices. This only
20037 // applies to non-temporal scatters because there's no instruction that takes
20038 // indices.
20039 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
20040 Offset =
20041 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
20042 Opcode = AArch64ISD::SSTNT1_PRED;
20045 // In the case of non-temporal scatter stores there's only one SVE instruction
20046 // per data-size: "scalar + vector", i.e.
20047 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
20048 // Since we do have intrinsics that allow the arguments to be in a different
20049 // order, we may need to swap them to match the spec.
20050 if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
20051 std::swap(Base, Offset);
20053 // SST1_IMM requires that the offset is an immediate that is:
20054 // * a multiple of #SizeInBytes,
20055 // * in the range [0, 31 x #SizeInBytes],
20056 // where #SizeInBytes is the size in bytes of the stored items. For
20057 // immediates outside that range and non-immediate scalar offsets use SST1 or
20058 // SST1_UXTW instead.
20059 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
20060 if (!isValidImmForSVEVecImmAddrMode(Offset,
20061 SrcVT.getScalarSizeInBits() / 8)) {
20062 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
20063 Opcode = AArch64ISD::SST1_UXTW_PRED;
20064 else
20065 Opcode = AArch64ISD::SST1_PRED;
20067 std::swap(Base, Offset);
20071 auto &TLI = DAG.getTargetLoweringInfo();
20072 if (!TLI.isTypeLegal(Base.getValueType()))
20073 return SDValue();
20075 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
20076 // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
20077 // nxv2i64. Legalize accordingly.
20078 if (!OnlyPackedOffsets &&
20079 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
20080 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
20082 if (!TLI.isTypeLegal(Offset.getValueType()))
20083 return SDValue();
20085 // Source value type that is representable in hardware
20086 EVT HwSrcVt = getSVEContainerType(SrcVT);
20088 // Keep the original type of the input data to store - this is needed to be
20089 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
20090 // FP values we want the integer equivalent, so just use HwSrcVt.
20091 SDValue InputVT = DAG.getValueType(SrcVT);
20092 if (SrcVT.isFloatingPoint())
20093 InputVT = DAG.getValueType(HwSrcVt);
20095 SDVTList VTs = DAG.getVTList(MVT::Other);
20096 SDValue SrcNew;
20098 if (Src.getValueType().isFloatingPoint())
20099 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
20100 else
20101 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
20103 SDValue Ops[] = {N->getOperand(0), // Chain
20104 SrcNew,
20105 N->getOperand(3), // Pg
20106 Base,
20107 Offset,
20108 InputVT};
20110 return DAG.getNode(Opcode, DL, VTs, Ops);
20113 static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
20114 unsigned Opcode,
20115 bool OnlyPackedOffsets = true) {
20116 const EVT RetVT = N->getValueType(0);
20117 assert(RetVT.isScalableVector() &&
20118 "Gather loads are only possible for SVE vectors");
20120 SDLoc DL(N);
20122 // Make sure that the loaded data will fit into an SVE register
20123 if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
20124 return SDValue();
20126 // Depending on the addressing mode, this is either a pointer or a vector of
20127 // pointers (that fits into one register)
20128 SDValue Base = N->getOperand(3);
20129 // Depending on the addressing mode, this is either a single offset or a
20130 // vector of offsets (that fits into one register)
20131 SDValue Offset = N->getOperand(4);
20133 // For "scalar + vector of indices", just scale the indices. This only
20134 // applies to non-temporal gathers because there's no instruction that takes
20135 // indices.
20136 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
20137 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
20138 RetVT.getScalarSizeInBits());
20139 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
20142 // In the case of non-temporal gather loads there's only one SVE instruction
20143 // per data-size: "scalar + vector", i.e.
20144 // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
20145 // Since we do have intrinsics that allow the arguments to be in a different
20146 // order, we may need to swap them to match the spec.
20147 if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
20148 Offset.getValueType().isVector())
20149 std::swap(Base, Offset);
20151 // GLD{FF}1_IMM requires that the offset is an immediate that is:
20152 // * a multiple of #SizeInBytes,
20153 // * in the range [0, 31 x #SizeInBytes],
20154 // where #SizeInBytes is the size in bytes of the loaded items. For
20155 // immediates outside that range and non-immediate scalar offsets use
20156 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
20157 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
20158 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
20159 if (!isValidImmForSVEVecImmAddrMode(Offset,
20160 RetVT.getScalarSizeInBits() / 8)) {
20161 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
20162 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
20163 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
20164 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
20165 else
20166 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
20167 ? AArch64ISD::GLD1_MERGE_ZERO
20168 : AArch64ISD::GLDFF1_MERGE_ZERO;
20170 std::swap(Base, Offset);
20174 auto &TLI = DAG.getTargetLoweringInfo();
20175 if (!TLI.isTypeLegal(Base.getValueType()))
20176 return SDValue();
20178 // Some gather load variants allow unpacked offsets, but only as nxv2i32
20179 // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
20180 // nxv2i64. Legalize accordingly.
20181 if (!OnlyPackedOffsets &&
20182 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
20183 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
20185 // Return value type that is representable in hardware
20186 EVT HwRetVt = getSVEContainerType(RetVT);
20188 // Keep the original output value type around - this is needed to be able to
20189 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
20190 // values we want the integer equivalent, so just use HwRetVt.
20191 SDValue OutVT = DAG.getValueType(RetVT);
20192 if (RetVT.isFloatingPoint())
20193 OutVT = DAG.getValueType(HwRetVt);
20195 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
20196 SDValue Ops[] = {N->getOperand(0), // Chain
20197 N->getOperand(2), // Pg
20198 Base, Offset, OutVT};
20200 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
20201 SDValue LoadChain = SDValue(Load.getNode(), 1);
20203 if (RetVT.isInteger() && (RetVT != HwRetVt))
20204 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
20206 // If the original return value was FP, bitcast accordingly. Doing it here
20207 // means that we can avoid adding TableGen patterns for FPs.
20208 if (RetVT.isFloatingPoint())
20209 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
20211 return DAG.getMergeValues({Load, LoadChain}, DL);
20214 static SDValue
20215 performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20216 SelectionDAG &DAG) {
20217 SDLoc DL(N);
20218 SDValue Src = N->getOperand(0);
20219 unsigned Opc = Src->getOpcode();
20221 // Sign extend of an unsigned unpack -> signed unpack
20222 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
20224 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
20225 : AArch64ISD::SUNPKLO;
20227 // Push the sign extend to the operand of the unpack
20228 // This is necessary where, for example, the operand of the unpack
20229 // is another unpack:
20230 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
20231 // ->
20232 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
20233 // ->
20234 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
20235 SDValue ExtOp = Src->getOperand(0);
20236 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
20237 EVT EltTy = VT.getVectorElementType();
20238 (void)EltTy;
20240 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
20241 "Sign extending from an invalid type");
20243 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
20245 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
20246 ExtOp, DAG.getValueType(ExtVT));
20248 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
20251 if (DCI.isBeforeLegalizeOps())
20252 return SDValue();
20254 if (!EnableCombineMGatherIntrinsics)
20255 return SDValue();
20257 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
20258 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
20259 unsigned NewOpc;
20260 unsigned MemVTOpNum = 4;
20261 switch (Opc) {
20262 case AArch64ISD::LD1_MERGE_ZERO:
20263 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
20264 MemVTOpNum = 3;
20265 break;
20266 case AArch64ISD::LDNF1_MERGE_ZERO:
20267 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
20268 MemVTOpNum = 3;
20269 break;
20270 case AArch64ISD::LDFF1_MERGE_ZERO:
20271 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
20272 MemVTOpNum = 3;
20273 break;
20274 case AArch64ISD::GLD1_MERGE_ZERO:
20275 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
20276 break;
20277 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
20278 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
20279 break;
20280 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
20281 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
20282 break;
20283 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
20284 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
20285 break;
20286 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
20287 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
20288 break;
20289 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
20290 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
20291 break;
20292 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
20293 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
20294 break;
20295 case AArch64ISD::GLDFF1_MERGE_ZERO:
20296 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
20297 break;
20298 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
20299 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
20300 break;
20301 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
20302 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
20303 break;
20304 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
20305 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
20306 break;
20307 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
20308 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
20309 break;
20310 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
20311 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
20312 break;
20313 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
20314 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
20315 break;
20316 case AArch64ISD::GLDNT1_MERGE_ZERO:
20317 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
20318 break;
20319 default:
20320 return SDValue();
20323 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
20324 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
20326 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
20327 return SDValue();
20329 EVT DstVT = N->getValueType(0);
20330 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
20332 SmallVector<SDValue, 5> Ops;
20333 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
20334 Ops.push_back(Src->getOperand(I));
20336 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
20337 DCI.CombineTo(N, ExtLoad);
20338 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
20340 // Return N so it doesn't get rechecked
20341 return SDValue(N, 0);
20344 /// Legalize the gather prefetch (scalar + vector addressing mode) when the
20345 /// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
20346 /// != nxv2i32) do not need legalization.
20347 static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
20348 const unsigned OffsetPos = 4;
20349 SDValue Offset = N->getOperand(OffsetPos);
20351 // Not an unpacked vector, bail out.
20352 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
20353 return SDValue();
20355 // Extend the unpacked offset vector to 64-bit lanes.
20356 SDLoc DL(N);
20357 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
20358 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
20359 // Replace the offset operand with the 64-bit one.
20360 Ops[OffsetPos] = Offset;
20362 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
20365 /// Combines a node carrying the intrinsic
20366 /// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
20367 /// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
20368 /// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
20369 /// SVE gather prefetch instruction with vector plus immediate addressing mode.
20370 static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
20371 unsigned ScalarSizeInBytes) {
20372 const unsigned ImmPos = 4, OffsetPos = 3;
20373 // No need to combine the node if the immediate is valid...
20374 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
20375 return SDValue();
20377 // ...otherwise swap the offset base with the offset...
20378 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
20379 std::swap(Ops[ImmPos], Ops[OffsetPos]);
20380 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
20381 // `aarch64_sve_prfb_gather_uxtw_index`.
20382 SDLoc DL(N);
20383 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
20384 MVT::i64);
20386 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
20389 // Return true if the vector operation can guarantee only the first lane of its
20390 // result contains data, with all bits in other lanes set to zero.
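//
// For example (illustrative): the predicated reductions below, such as
// AArch64ISD::UADDV_PRED, produce their scalar result in lane 0 with the
// remaining lanes zeroed.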
20391 static bool isLanes1toNKnownZero(SDValue Op) {
20392 switch (Op.getOpcode()) {
20393 default:
20394 return false;
20395 case AArch64ISD::ANDV_PRED:
20396 case AArch64ISD::EORV_PRED:
20397 case AArch64ISD::FADDA_PRED:
20398 case AArch64ISD::FADDV_PRED:
20399 case AArch64ISD::FMAXNMV_PRED:
20400 case AArch64ISD::FMAXV_PRED:
20401 case AArch64ISD::FMINNMV_PRED:
20402 case AArch64ISD::FMINV_PRED:
20403 case AArch64ISD::ORV_PRED:
20404 case AArch64ISD::SADDV_PRED:
20405 case AArch64ISD::SMAXV_PRED:
20406 case AArch64ISD::SMINV_PRED:
20407 case AArch64ISD::UADDV_PRED:
20408 case AArch64ISD::UMAXV_PRED:
20409 case AArch64ISD::UMINV_PRED:
20410 return true;
20414 static SDValue removeRedundantInsertVectorElt(SDNode *N) {
20415 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
20416 SDValue InsertVec = N->getOperand(0);
20417 SDValue InsertElt = N->getOperand(1);
20418 SDValue InsertIdx = N->getOperand(2);
20420 // We only care about inserts into the first element...
20421 if (!isNullConstant(InsertIdx))
20422 return SDValue();
20423 // ...of a zero'd vector...
20424 if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
20425 return SDValue();
20426 // ...where the inserted data was previously extracted...
20427 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
20428 return SDValue();
20430 SDValue ExtractVec = InsertElt.getOperand(0);
20431 SDValue ExtractIdx = InsertElt.getOperand(1);
20433 // ...from the first element of a vector.
20434 if (!isNullConstant(ExtractIdx))
20435 return SDValue();
20437 // If we get here we are effectively trying to zero lanes 1-N of a vector.
20439 // Ensure there's no type conversion going on.
20440 if (N->getValueType(0) != ExtractVec.getValueType())
20441 return SDValue();
20443 if (!isLanes1toNKnownZero(ExtractVec))
20444 return SDValue();
20446 // The explicit zeroing is redundant.
20447 return ExtractVec;
20450 static SDValue
20451 performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
20452 if (SDValue Res = removeRedundantInsertVectorElt(N))
20453 return Res;
20455 return performPostLD1Combine(N, DCI, true);
20458 static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
20459 EVT Ty = N->getValueType(0);
20460 if (Ty.isInteger())
20461 return SDValue();
20463 EVT IntTy = Ty.changeVectorElementTypeToInteger();
20464 EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
20465 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
20466 IntTy.getVectorElementType().getScalarSizeInBits())
20467 return SDValue();
20469 SDLoc DL(N);
20470 SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
20471 DL, ExtIntTy);
20472 SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
20473 DL, ExtIntTy);
20474 SDValue Idx = N->getOperand(2);
20475 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
20476 SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
20477 return DAG.getBitcast(Ty, Trunc);
20480 static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
20481 TargetLowering::DAGCombinerInfo &DCI,
20482 const AArch64Subtarget *Subtarget) {
20483 SDValue N0 = N->getOperand(0);
20484 EVT VT = N->getValueType(0);
20486 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
20487 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
20488 return SDValue();
20490 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
20491 // We purposefully don't care about legality of the nodes here as we know
20492 // they can be split down into something legal.
20493 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
20494 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
20495 VT.isFixedLengthVector() &&
20496 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
20497 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
20498 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
20499 LN0->getChain(), LN0->getBasePtr(),
20500 N0.getValueType(), LN0->getMemOperand());
20501 DCI.CombineTo(N, ExtLoad);
20502 DCI.CombineTo(
20503 N0.getNode(),
20504 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
20505 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
20506 ExtLoad.getValue(1));
20507 return SDValue(N, 0); // Return N so it doesn't get rechecked!
20510 return SDValue();
20513 static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
20514 const AArch64Subtarget *Subtarget) {
20515 EVT VT = N->getValueType(0);
20517 // Don't expand for NEON, SVE2 or SME
20518 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
20519 return SDValue();
20521 SDLoc DL(N);
20523 SDValue Mask = N->getOperand(0);
20524 SDValue In1 = N->getOperand(1);
20525 SDValue In2 = N->getOperand(2);
20527 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
20528 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
20529 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
20530 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
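// If a DUPLANE128 duplicates a bitcast 128-bit subvector inserted into undef,
// push the bitcast past the duplication: insert and duplicate the original
// subvector in its own element type, then bitcast the result.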
20533 static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
20534 EVT VT = N->getValueType(0);
20536 SDValue Insert = N->getOperand(0);
20537 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
20538 return SDValue();
20540 if (!Insert.getOperand(0).isUndef())
20541 return SDValue();
20543 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
20544 uint64_t IdxDupLane = N->getConstantOperandVal(1);
20545 if (IdxInsert != 0 || IdxDupLane != 0)
20546 return SDValue();
20548 SDValue Bitcast = Insert.getOperand(1);
20549 if (Bitcast.getOpcode() != ISD::BITCAST)
20550 return SDValue();
20552 SDValue Subvec = Bitcast.getOperand(0);
20553 EVT SubvecVT = Subvec.getValueType();
20554 if (!SubvecVT.is128BitVector())
20555 return SDValue();
20556 EVT NewSubvecVT =
20557 getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
20559 SDLoc DL(N);
20560 SDValue NewInsert =
20561 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
20562 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
20563 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
20564 NewInsert, N->getOperand(1));
20565 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
20568 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
20569 DAGCombinerInfo &DCI) const {
20570 SelectionDAG &DAG = DCI.DAG;
20571 switch (N->getOpcode()) {
20572 default:
20573 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
20574 break;
20575 case ISD::ADD:
20576 case ISD::SUB:
20577 return performAddSubCombine(N, DCI, DAG);
20578 case ISD::BUILD_VECTOR:
20579 return performBuildVectorCombine(N, DCI, DAG);
20580 case AArch64ISD::ANDS:
20581 return performFlagSettingCombine(N, DCI, ISD::AND);
20582 case AArch64ISD::ADC:
20583 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
20584 return R;
20585 return foldADCToCINC(N, DAG);
20586 case AArch64ISD::SBC:
20587 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
20588 case AArch64ISD::ADCS:
20589 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
20590 return R;
20591 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
20592 case AArch64ISD::SBCS:
20593 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
20594 return R;
20595 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
20596 case ISD::XOR:
20597 return performXorCombine(N, DAG, DCI, Subtarget);
20598 case ISD::MUL:
20599 return performMulCombine(N, DAG, DCI, Subtarget);
20600 case ISD::SINT_TO_FP:
20601 case ISD::UINT_TO_FP:
20602 return performIntToFpCombine(N, DAG, Subtarget);
20603 case ISD::FP_TO_SINT:
20604 case ISD::FP_TO_UINT:
20605 case ISD::FP_TO_SINT_SAT:
20606 case ISD::FP_TO_UINT_SAT:
20607 return performFpToIntCombine(N, DAG, DCI, Subtarget);
20608 case ISD::FDIV:
20609 return performFDivCombine(N, DAG, DCI, Subtarget);
20610 case ISD::OR:
20611 return performORCombine(N, DCI, Subtarget);
20612 case ISD::AND:
20613 return performANDCombine(N, DCI);
20614 case ISD::INTRINSIC_WO_CHAIN:
20615 return performIntrinsicCombine(N, DCI, Subtarget);
20616 case ISD::ANY_EXTEND:
20617 case ISD::ZERO_EXTEND:
20618 case ISD::SIGN_EXTEND:
20619 return performExtendCombine(N, DCI, DAG);
20620 case ISD::SIGN_EXTEND_INREG:
20621 return performSignExtendInRegCombine(N, DCI, DAG);
20622 case ISD::CONCAT_VECTORS:
20623 return performConcatVectorsCombine(N, DCI, DAG);
20624 case ISD::EXTRACT_SUBVECTOR:
20625 return performExtractSubvectorCombine(N, DCI, DAG);
20626 case ISD::INSERT_SUBVECTOR:
20627 return performInsertSubvectorCombine(N, DCI, DAG);
20628 case ISD::SELECT:
20629 return performSelectCombine(N, DCI);
20630 case ISD::VSELECT:
20631 return performVSelectCombine(N, DCI.DAG);
20632 case ISD::SETCC:
20633 return performSETCCCombine(N, DCI, DAG);
20634 case ISD::LOAD:
20635 return performLOADCombine(N, DCI, DAG, Subtarget);
20636 case ISD::STORE:
20637 return performSTORECombine(N, DCI, DAG, Subtarget);
20638 case ISD::MSTORE:
20639 return performMSTORECombine(N, DCI, DAG, Subtarget);
20640 case ISD::MGATHER:
20641 case ISD::MSCATTER:
20642 return performMaskedGatherScatterCombine(N, DCI, DAG);
20643 case ISD::VECTOR_SPLICE:
20644 return performSVESpliceCombine(N, DAG);
20645 case ISD::FP_EXTEND:
20646 return performFPExtendCombine(N, DAG, DCI, Subtarget);
20647 case AArch64ISD::BRCOND:
20648 return performBRCONDCombine(N, DCI, DAG);
20649 case AArch64ISD::TBNZ:
20650 case AArch64ISD::TBZ:
20651 return performTBZCombine(N, DCI, DAG);
20652 case AArch64ISD::CSEL:
20653 return performCSELCombine(N, DCI, DAG);
20654 case AArch64ISD::DUP:
20655 return performDUPCombine(N, DCI);
20656 case AArch64ISD::DUPLANE128:
20657 return performDupLane128Combine(N, DAG);
20658 case AArch64ISD::NVCAST:
20659 return performNVCASTCombine(N);
20660 case AArch64ISD::SPLICE:
20661 return performSpliceCombine(N, DAG);
20662 case AArch64ISD::UUNPKLO:
20663 case AArch64ISD::UUNPKHI:
20664 return performUnpackCombine(N, DAG, Subtarget);
20665 case AArch64ISD::UZP1:
20666 return performUzpCombine(N, DAG);
20667 case AArch64ISD::SETCC_MERGE_ZERO:
20668 return performSetccMergeZeroCombine(N, DCI);
20669 case AArch64ISD::GLD1_MERGE_ZERO:
20670 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
20671 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
20672 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
20673 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
20674 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
20675 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
20676 case AArch64ISD::GLD1S_MERGE_ZERO:
20677 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
20678 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
20679 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
20680 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
20681 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
20682 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
20683 return performGLD1Combine(N, DAG);
20684 case AArch64ISD::VASHR:
20685 case AArch64ISD::VLSHR:
20686 return performVectorShiftCombine(N, *this, DCI);
20687 case AArch64ISD::SUNPKLO:
20688 return performSunpkloCombine(N, DAG);
20689 case AArch64ISD::BSP:
20690 return performBSPExpandForSVE(N, DAG, Subtarget);
20691 case ISD::INSERT_VECTOR_ELT:
20692 return performInsertVectorEltCombine(N, DCI);
20693 case ISD::EXTRACT_VECTOR_ELT:
20694 return performExtractVectorEltCombine(N, DCI, Subtarget);
20695 case ISD::VECREDUCE_ADD:
20696 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
20697 case AArch64ISD::UADDV:
20698 return performUADDVCombine(N, DAG);
20699 case AArch64ISD::SMULL:
20700 case AArch64ISD::UMULL:
20701 case AArch64ISD::PMULL:
20702 return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG);
20703 case ISD::INTRINSIC_VOID:
20704 case ISD::INTRINSIC_W_CHAIN:
20705 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
20706 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
20707 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
20708 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
20709 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
20710 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
20711 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
20712 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
20713 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
20714 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
20715 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
20716 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
20717 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
20718 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
20719 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
20720 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
20721 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
20722 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
20723 case Intrinsic::aarch64_neon_ld2:
20724 case Intrinsic::aarch64_neon_ld3:
20725 case Intrinsic::aarch64_neon_ld4:
20726 case Intrinsic::aarch64_neon_ld1x2:
20727 case Intrinsic::aarch64_neon_ld1x3:
20728 case Intrinsic::aarch64_neon_ld1x4:
20729 case Intrinsic::aarch64_neon_ld2lane:
20730 case Intrinsic::aarch64_neon_ld3lane:
20731 case Intrinsic::aarch64_neon_ld4lane:
20732 case Intrinsic::aarch64_neon_ld2r:
20733 case Intrinsic::aarch64_neon_ld3r:
20734 case Intrinsic::aarch64_neon_ld4r:
20735 case Intrinsic::aarch64_neon_st2:
20736 case Intrinsic::aarch64_neon_st3:
20737 case Intrinsic::aarch64_neon_st4:
20738 case Intrinsic::aarch64_neon_st1x2:
20739 case Intrinsic::aarch64_neon_st1x3:
20740 case Intrinsic::aarch64_neon_st1x4:
20741 case Intrinsic::aarch64_neon_st2lane:
20742 case Intrinsic::aarch64_neon_st3lane:
20743 case Intrinsic::aarch64_neon_st4lane:
20744 return performNEONPostLDSTCombine(N, DCI, DAG);
20745 case Intrinsic::aarch64_sve_ldnt1:
20746 return performLDNT1Combine(N, DAG);
20747 case Intrinsic::aarch64_sve_ld1rq:
20748 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
20749 case Intrinsic::aarch64_sve_ld1ro:
20750 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
20751 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
20752 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
20753 case Intrinsic::aarch64_sve_ldnt1_gather:
20754 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
20755 case Intrinsic::aarch64_sve_ldnt1_gather_index:
20756 return performGatherLoadCombine(N, DAG,
20757 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
20758 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
20759 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
20760 case Intrinsic::aarch64_sve_ld1:
20761 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
20762 case Intrinsic::aarch64_sve_ldnf1:
20763 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
20764 case Intrinsic::aarch64_sve_ldff1:
20765 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
20766 case Intrinsic::aarch64_sve_st1:
20767 return performST1Combine(N, DAG);
20768 case Intrinsic::aarch64_sve_stnt1:
20769 return performSTNT1Combine(N, DAG);
20770 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
20771 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
20772 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
20773 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
20774 case Intrinsic::aarch64_sve_stnt1_scatter:
20775 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
20776 case Intrinsic::aarch64_sve_stnt1_scatter_index:
20777 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
20778 case Intrinsic::aarch64_sve_ld1_gather:
20779 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
20780 case Intrinsic::aarch64_sve_ld1_gather_index:
20781 return performGatherLoadCombine(N, DAG,
20782 AArch64ISD::GLD1_SCALED_MERGE_ZERO);
20783 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
20784 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
20785 /*OnlyPackedOffsets=*/false);
20786 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
20787 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
20788 /*OnlyPackedOffsets=*/false);
20789 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
20790 return performGatherLoadCombine(N, DAG,
20791 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
20792 /*OnlyPackedOffsets=*/false);
20793 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
20794 return performGatherLoadCombine(N, DAG,
20795 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
20796 /*OnlyPackedOffsets=*/false);
20797 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
20798 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
20799 case Intrinsic::aarch64_sve_ldff1_gather:
20800 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
20801 case Intrinsic::aarch64_sve_ldff1_gather_index:
20802 return performGatherLoadCombine(N, DAG,
20803 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
20804 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
20805 return performGatherLoadCombine(N, DAG,
20806 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
20807 /*OnlyPackedOffsets=*/false);
20808 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
20809 return performGatherLoadCombine(N, DAG,
20810 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
20811 /*OnlyPackedOffsets=*/false);
20812 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
20813 return performGatherLoadCombine(N, DAG,
20814 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
20815 /*OnlyPackedOffsets=*/false);
20816 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
20817 return performGatherLoadCombine(N, DAG,
20818 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
20819 /*OnlyPackedOffsets=*/false);
20820 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
20821 return performGatherLoadCombine(N, DAG,
20822 AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
20823 case Intrinsic::aarch64_sve_st1_scatter:
20824 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
20825 case Intrinsic::aarch64_sve_st1_scatter_index:
20826 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
20827 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
20828 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
20829 /*OnlyPackedOffsets=*/false);
20830 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
20831 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
20832 /*OnlyPackedOffsets=*/false);
20833 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
20834 return performScatterStoreCombine(N, DAG,
20835 AArch64ISD::SST1_SXTW_SCALED_PRED,
20836 /*OnlyPackedOffsets=*/false);
20837 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
20838 return performScatterStoreCombine(N, DAG,
20839 AArch64ISD::SST1_UXTW_SCALED_PRED,
20840 /*OnlyPackedOffsets=*/false);
20841 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
20842 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
20843 case Intrinsic::aarch64_rndr:
20844 case Intrinsic::aarch64_rndrrs: {
20845 unsigned IntrinsicID =
20846 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
20847 auto Register =
20848 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
20849 : AArch64SysReg::RNDRRS);
20850 SDLoc DL(N);
20851 SDValue A = DAG.getNode(
20852 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
20853 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
20854 SDValue B = DAG.getNode(
20855 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
20856 DAG.getConstant(0, DL, MVT::i32),
20857 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
20858 return DAG.getMergeValues(
20859 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
20861 default:
20862 break;
20864 break;
20865 case ISD::GlobalAddress:
20866 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
20868 return SDValue();
20871 // Check whether the return value is used only as a return value, as otherwise
20872 // we can't perform a tail-call. In particular, we need to check for
20873 // target ISD nodes that are returns and any other "odd" constructs
20874 // that the generic analysis code won't necessarily catch.
20875 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
20876 SDValue &Chain) const {
20877 if (N->getNumValues() != 1)
20878 return false;
20879 if (!N->hasNUsesOfValue(1, 0))
20880 return false;
20882 SDValue TCChain = Chain;
20883 SDNode *Copy = *N->use_begin();
20884 if (Copy->getOpcode() == ISD::CopyToReg) {
20885 // If the copy has a glue operand, we conservatively assume it isn't safe to
20886 // perform a tail call.
20887 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
20888 MVT::Glue)
20889 return false;
20890 TCChain = Copy->getOperand(0);
20891 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
20892 return false;
20894 bool HasRet = false;
20895 for (SDNode *Node : Copy->uses()) {
20896 if (Node->getOpcode() != AArch64ISD::RET_FLAG)
20897 return false;
20898 HasRet = true;
20901 if (!HasRet)
20902 return false;
20904 Chain = TCChain;
20905 return true;
20908 // Return whether an instruction can potentially be optimized to a tail
20909 // call. This will cause the optimizers to attempt to move, or duplicate,
20910 // return instructions to help enable tail call optimizations for this
20911 // instruction.
20912 bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
20913 return CI->isTailCall();
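// Decompose Op, an ADD or SUB of a pointer and a constant, into the Base and
// Offset used by pre/post-indexed addressing, and record in IsInc whether the
// offset is added or subtracted.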
20916 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
20917 SDValue &Offset,
20918 ISD::MemIndexedMode &AM,
20919 bool &IsInc,
20920 SelectionDAG &DAG) const {
20921 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
20922 return false;
20924 Base = Op->getOperand(0);
20925 // All of the indexed addressing mode instructions take a signed
20926 // 9-bit immediate offset.
20927 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
20928 int64_t RHSC = RHS->getSExtValue();
20929 if (Op->getOpcode() == ISD::SUB)
20930 RHSC = -(uint64_t)RHSC;
20931 if (!isInt<9>(RHSC))
20932 return false;
20933 IsInc = (Op->getOpcode() == ISD::ADD);
20934 Offset = Op->getOperand(1);
20935 return true;
20937 return false;
20940 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
20941 SDValue &Offset,
20942 ISD::MemIndexedMode &AM,
20943 SelectionDAG &DAG) const {
20944 EVT VT;
20945 SDValue Ptr;
20946 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
20947 VT = LD->getMemoryVT();
20948 Ptr = LD->getBasePtr();
20949 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
20950 VT = ST->getMemoryVT();
20951 Ptr = ST->getBasePtr();
20952 } else
20953 return false;
20955 bool IsInc;
20956 if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
20957 return false;
20958 AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
20959 return true;
20962 bool AArch64TargetLowering::getPostIndexedAddressParts(
20963 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
20964 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
20965 EVT VT;
20966 SDValue Ptr;
20967 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
20968 VT = LD->getMemoryVT();
20969 Ptr = LD->getBasePtr();
20970 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
20971 VT = ST->getMemoryVT();
20972 Ptr = ST->getBasePtr();
20973 } else
20974 return false;
20976 bool IsInc;
20977 if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
20978 return false;
20979 // Post-indexing updates the base, so it's not a valid transform
20980 // if that's not the same as the load's pointer.
20981 if (Ptr != Base)
20982 return false;
20983 AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
20984 return true;
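// Replace the results of an illegal BITCAST: scalable FP-to-int casts of
// unpacked types go via the wider SVE container followed by a truncate, while
// f16/bf16-to-i16 casts go through an f32 subregister insert and an i32
// bitcast.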
20987 void AArch64TargetLowering::ReplaceBITCASTResults(
20988 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
20989 SDLoc DL(N);
20990 SDValue Op = N->getOperand(0);
20991 EVT VT = N->getValueType(0);
20992 EVT SrcVT = Op.getValueType();
20994 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
20995 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
20996 "Expected fp->int bitcast!");
20998 // Bitcasting between unpacked vector types of different element counts is
20999 // not a NOP because the live elements are laid out differently.
21000 // 01234567
21001 // e.g. nxv2i32 = XX??XX??
21002 // nxv4f16 = X?X?X?X?
21003 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
21004 return;
21006 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
21007 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
21008 return;
21011 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
21012 return;
21014 Op = SDValue(
21015 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
21016 DAG.getUNDEF(MVT::i32), Op,
21017 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
21018 0);
21019 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
21020 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
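// Replace a 256-bit (F)ADD of a value with a <1,0,3,2,...> shuffle of itself
// by an ADDP of the value's two 128-bit halves, then shuffle the pairwise
// sums back into the lane order the original node produced.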
21023 static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
21024 SelectionDAG &DAG,
21025 const AArch64Subtarget *Subtarget) {
21026 EVT VT = N->getValueType(0);
21027 if (!VT.is256BitVector() ||
21028 (VT.getScalarType().isFloatingPoint() &&
21029 !N->getFlags().hasAllowReassociation()) ||
21030 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()))
21031 return;
21033 SDValue X = N->getOperand(0);
21034 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
21035 if (!Shuf) {
21036 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
21037 X = N->getOperand(1);
21038 if (!Shuf)
21039 return;
21042 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
21043 return;
21045 // Check the mask is 1,0,3,2,5,4,...
21046 ArrayRef<int> Mask = Shuf->getMask();
21047 for (int I = 0, E = Mask.size(); I < E; I++)
21048 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
21049 return;
21051 SDLoc DL(N);
21052 auto LoHi = DAG.SplitVector(X, DL);
21053 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
21054 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
21055 LoHi.first, LoHi.second);
21057 // Shuffle the elements back into order.
21058 SmallVector<int> NMask;
21059 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
21060 NMask.push_back(I);
21061 NMask.push_back(I);
21063 Results.push_back(
21064 DAG.getVectorShuffle(VT, DL,
21065 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
21066 DAG.getUNDEF(LoHi.first.getValueType())),
21067 DAG.getUNDEF(VT), NMask));
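// Legalize a reduction whose operand type is too wide by splitting the
// operand in half, combining the halves element-wise with InterOp, and then
// reducing across the narrower vector with AcrossOp.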
21070 static void ReplaceReductionResults(SDNode *N,
21071 SmallVectorImpl<SDValue> &Results,
21072 SelectionDAG &DAG, unsigned InterOp,
21073 unsigned AcrossOp) {
21074 EVT LoVT, HiVT;
21075 SDValue Lo, Hi;
21076 SDLoc dl(N);
21077 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
21078 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
21079 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
21080 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
21081 Results.push_back(SplitVal);
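// Split an i128 value into its low and high i64 halves, returned as {Lo, Hi}.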
21084 static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
21085 SDLoc DL(N);
21086 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
21087 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
21088 DAG.getNode(ISD::SRL, DL, MVT::i128, N,
21089 DAG.getConstant(64, DL, MVT::i64)));
21090 return std::make_pair(Lo, Hi);
21093 void AArch64TargetLowering::ReplaceExtractSubVectorResults(
21094 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
21095 SDValue In = N->getOperand(0);
21096 EVT InVT = In.getValueType();
21098 // Common code will handle these just fine.
21099 if (!InVT.isScalableVector() || !InVT.isInteger())
21100 return;
21102 SDLoc DL(N);
21103 EVT VT = N->getValueType(0);
21105 // The following checks bail if this is not a halving operation.
21107 ElementCount ResEC = VT.getVectorElementCount();
21109 if (InVT.getVectorElementCount() != (ResEC * 2))
21110 return;
21112 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
21113 if (!CIndex)
21114 return;
21116 unsigned Index = CIndex->getZExtValue();
21117 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
21118 return;
21120 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
21121 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
21123 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
21124 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
21127 // Create an even/odd pair of X registers holding integer value V.
21128 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
21129 SDLoc dl(V.getNode());
21130 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
21131 SDValue VHi = DAG.getAnyExtOrTrunc(
21132 DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
21133 dl, MVT::i64);
21134 if (DAG.getDataLayout().isBigEndian())
21135 std::swap(VLo, VHi);
21136 SDValue RegClass =
21137 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
21138 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
21139 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
21140 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
21141 return SDValue(
21142 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
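// Lower a 128-bit ATOMIC_CMP_SWAP either to a CASP instruction (when LSE or
// outlined atomics are available) or to a CMP_SWAP_128* pseudo that is later
// expanded to an exclusive load/store loop, choosing the variant that matches
// the operation's memory ordering.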
21145 static void ReplaceCMP_SWAP_128Results(SDNode *N,
21146 SmallVectorImpl<SDValue> &Results,
21147 SelectionDAG &DAG,
21148 const AArch64Subtarget *Subtarget) {
21149 assert(N->getValueType(0) == MVT::i128 &&
21150 "AtomicCmpSwap on types less than 128 should be legal");
21152 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
21153 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
21154 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
21155 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
21156 SDValue Ops[] = {
21157 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
21158 createGPRPairNode(DAG, N->getOperand(3)), // Store value
21159 N->getOperand(1), // Ptr
21160 N->getOperand(0), // Chain in
21161 };
21163 unsigned Opcode;
21164 switch (MemOp->getMergedOrdering()) {
21165 case AtomicOrdering::Monotonic:
21166 Opcode = AArch64::CASPX;
21167 break;
21168 case AtomicOrdering::Acquire:
21169 Opcode = AArch64::CASPAX;
21170 break;
21171 case AtomicOrdering::Release:
21172 Opcode = AArch64::CASPLX;
21173 break;
21174 case AtomicOrdering::AcquireRelease:
21175 case AtomicOrdering::SequentiallyConsistent:
21176 Opcode = AArch64::CASPALX;
21177 break;
21178 default:
21179 llvm_unreachable("Unexpected ordering!");
21182 MachineSDNode *CmpSwap = DAG.getMachineNode(
21183 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
21184 DAG.setNodeMemRefs(CmpSwap, {MemOp});
21186 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
21187 if (DAG.getDataLayout().isBigEndian())
21188 std::swap(SubReg1, SubReg2);
21189 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
21190 SDValue(CmpSwap, 0));
21191 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
21192 SDValue(CmpSwap, 0));
21193 Results.push_back(
21194 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
21195 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
21196 return;
21199 unsigned Opcode;
21200 switch (MemOp->getMergedOrdering()) {
21201 case AtomicOrdering::Monotonic:
21202 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
21203 break;
21204 case AtomicOrdering::Acquire:
21205 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
21206 break;
21207 case AtomicOrdering::Release:
21208 Opcode = AArch64::CMP_SWAP_128_RELEASE;
21209 break;
21210 case AtomicOrdering::AcquireRelease:
21211 case AtomicOrdering::SequentiallyConsistent:
21212 Opcode = AArch64::CMP_SWAP_128;
21213 break;
21214 default:
21215 llvm_unreachable("Unexpected ordering!");
21218 auto Desired = splitInt128(N->getOperand(2), DAG);
21219 auto New = splitInt128(N->getOperand(3), DAG);
21220 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
21221 New.first, New.second, N->getOperand(0)};
21222 SDNode *CmpSwap = DAG.getMachineNode(
21223 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
21224 Ops);
21225 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
21227 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
21228 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
21229 Results.push_back(SDValue(CmpSwap, 3));
21232 void AArch64TargetLowering::ReplaceNodeResults(
21233 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
21234 switch (N->getOpcode()) {
21235 default:
21236 llvm_unreachable("Don't know how to custom expand this");
21237 case ISD::BITCAST:
21238 ReplaceBITCASTResults(N, Results, DAG);
21239 return;
21240 case ISD::VECREDUCE_ADD:
21241 case ISD::VECREDUCE_SMAX:
21242 case ISD::VECREDUCE_SMIN:
21243 case ISD::VECREDUCE_UMAX:
21244 case ISD::VECREDUCE_UMIN:
21245 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
21246 return;
21247 case ISD::ADD:
21248 case ISD::FADD:
21249 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
21250 return;
21252 case ISD::CTPOP:
21253 case ISD::PARITY:
21254 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
21255 Results.push_back(Result);
21256 return;
21257 case AArch64ISD::SADDV:
21258 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
21259 return;
21260 case AArch64ISD::UADDV:
21261 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
21262 return;
21263 case AArch64ISD::SMINV:
21264 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
21265 return;
21266 case AArch64ISD::UMINV:
21267 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
21268 return;
21269 case AArch64ISD::SMAXV:
21270 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
21271 return;
21272 case AArch64ISD::UMAXV:
21273 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
21274 return;
21275 case ISD::FP_TO_UINT:
21276 case ISD::FP_TO_SINT:
21277 case ISD::STRICT_FP_TO_SINT:
21278 case ISD::STRICT_FP_TO_UINT:
21279 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
21280 // Let normal code take care of it by not adding anything to Results.
21281 return;
21282 case ISD::ATOMIC_CMP_SWAP:
21283 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
21284 return;
21285 case ISD::ATOMIC_LOAD:
21286 case ISD::LOAD: {
21287 MemSDNode *LoadNode = cast<MemSDNode>(N);
21288 EVT MemVT = LoadNode->getMemoryVT();
21289 // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
21290 // targets.
21291 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
21292 MemVT.getSizeInBits() == 256u &&
21293 (MemVT.getScalarSizeInBits() == 8u ||
21294 MemVT.getScalarSizeInBits() == 16u ||
21295 MemVT.getScalarSizeInBits() == 32u ||
21296 MemVT.getScalarSizeInBits() == 64u)) {
21298 SDValue Result = DAG.getMemIntrinsicNode(
21299 AArch64ISD::LDNP, SDLoc(N),
21300 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
21301 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
21302 MVT::Other}),
21303 {LoadNode->getChain(), LoadNode->getBasePtr()},
21304 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
21306 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
21307 Result.getValue(0), Result.getValue(1));
21308 Results.append({Pair, Result.getValue(2) /* Chain */});
21309 return;
21312 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
21313 LoadNode->getMemoryVT() != MVT::i128) {
21314 // Non-volatile, non-atomic loads are optimized later in AArch64's load/store
21315 // optimizer.
21316 return;
21319 if (SDValue(N, 0).getValueType() == MVT::i128) {
21320 SDValue Result = DAG.getMemIntrinsicNode(
21321 AArch64ISD::LDP, SDLoc(N),
21322 DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
21323 {LoadNode->getChain(), LoadNode->getBasePtr()},
21324 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
21326 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
21327 Result.getValue(0), Result.getValue(1));
21328 Results.append({Pair, Result.getValue(2) /* Chain */});
21330 return;
21332 case ISD::EXTRACT_SUBVECTOR:
21333 ReplaceExtractSubVectorResults(N, Results, DAG);
21334 return;
21335 case ISD::INSERT_SUBVECTOR:
21336 case ISD::CONCAT_VECTORS:
21337 // Custom lowering has been requested for INSERT_SUBVECTOR and
21338 // CONCAT_VECTORS -- but delegate to common code for result type
21339 // legalisation
21340 return;
21341 case ISD::INTRINSIC_WO_CHAIN: {
21342 EVT VT = N->getValueType(0);
21343 assert((VT == MVT::i8 || VT == MVT::i16) &&
21344 "custom lowering for unexpected type");
21346 ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
21347 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
21348 switch (IntID) {
21349 default:
21350 return;
21351 case Intrinsic::aarch64_sve_clasta_n: {
21352 SDLoc DL(N);
21353 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
21354 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
21355 N->getOperand(1), Op2, N->getOperand(3));
21356 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
21357 return;
21359 case Intrinsic::aarch64_sve_clastb_n: {
21360 SDLoc DL(N);
21361 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
21362 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
21363 N->getOperand(1), Op2, N->getOperand(3));
21364 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
21365 return;
21367 case Intrinsic::aarch64_sve_lasta: {
21368 SDLoc DL(N);
21369 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
21370 N->getOperand(1), N->getOperand(2));
21371 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
21372 return;
21374 case Intrinsic::aarch64_sve_lastb: {
21375 SDLoc DL(N);
21376 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
21377 N->getOperand(1), N->getOperand(2));
21378 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
21379 return;
21386 bool AArch64TargetLowering::useLoadStackGuardNode() const {
21387 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
21388 return TargetLowering::useLoadStackGuardNode();
21389 return true;
21392 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
21393 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
21394 // reciprocal if there are three or more FDIVs.
21395 return 3;
21398 TargetLoweringBase::LegalizeTypeAction
21399 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
21400 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
21401 // v4i16 and v2i32, rather than promoting them.
21402 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
21403 VT == MVT::v1f32)
21404 return TypeWidenVector;
21406 return TargetLoweringBase::getPreferredVectorAction(VT);
21409 // In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
21410 // provided the address is 16-byte aligned.
21411 bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
21412 if (!Subtarget->hasLSE2())
21413 return false;
21415 if (auto LI = dyn_cast<LoadInst>(I))
21416 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
21417 LI->getAlign() >= Align(16);
21419 if (auto SI = dyn_cast<StoreInst>(I))
21420 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
21421 SI->getAlign() >= Align(16);
21423 return false;
21426 bool AArch64TargetLowering::shouldInsertFencesForAtomic(
21427 const Instruction *I) const {
21428 return isOpSuitableForLDPSTP(I);
21431 // Loads and stores less than 128 bits are already atomic; ones above that
21432 // are doomed anyway, so defer to the default libcall and blame the OS when
21433 // things go wrong.
21434 TargetLoweringBase::AtomicExpansionKind
21435 AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
21436 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21437 if (Size != 128 || isOpSuitableForLDPSTP(SI))
21438 return AtomicExpansionKind::None;
21439 return AtomicExpansionKind::Expand;
21442 // Loads and stores less than 128 bits are already atomic; ones above that
21443 // are doomed anyway, so defer to the default libcall and blame the OS when
21444 // things go wrong.
21445 TargetLowering::AtomicExpansionKind
21446 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
21447 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21449 if (Size != 128 || isOpSuitableForLDPSTP(LI))
21450 return AtomicExpansionKind::None;
21452 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21453 // implement atomicrmw without spilling. If the target address is also on the
21454 // stack and close enough to the spill slot, this can lead to a situation
21455 // where the monitor always gets cleared and the atomic operation can never
21456 // succeed. So at -O0 lower this operation to a CAS loop.
21457 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
21458 return AtomicExpansionKind::CmpXChg;
21460 return AtomicExpansionKind::LLSC;
21463 // For the real atomic operations, we have ldxr/stxr up to 128 bits.
21464 TargetLowering::AtomicExpansionKind
21465 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
21466 if (AI->isFloatingPointOperation())
21467 return AtomicExpansionKind::CmpXChg;
21469 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21470 if (Size > 128) return AtomicExpansionKind::None;
21472 // Nand is not supported in LSE.
21473 // Leave 128 bits to LLSC or CmpXChg.
21474 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
21475 if (Subtarget->hasLSE())
21476 return AtomicExpansionKind::None;
21477 if (Subtarget->outlineAtomics()) {
21478 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
21479 // Don't outline them unless
21480 // (1) high level <atomic> support approved:
21481 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
21482 // (2) low level libgcc and compiler-rt support implemented by:
21483 // min/max outline atomics helpers
21484 if (AI->getOperation() != AtomicRMWInst::Min &&
21485 AI->getOperation() != AtomicRMWInst::Max &&
21486 AI->getOperation() != AtomicRMWInst::UMin &&
21487 AI->getOperation() != AtomicRMWInst::UMax) {
21488 return AtomicExpansionKind::None;
21493 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21494 // implement atomicrmw without spilling. If the target address is also on the
21495 // stack and close enough to the spill slot, this can lead to a situation
21496 // where the monitor always gets cleared and the atomic operation can never
21497 // succeed. So at -O0 lower this operation to a CAS loop.
21498 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
21499 return AtomicExpansionKind::CmpXChg;
21501 return AtomicExpansionKind::LLSC;
21504 TargetLowering::AtomicExpansionKind
21505 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
21506 AtomicCmpXchgInst *AI) const {
21507 // If subtarget has LSE, leave cmpxchg intact for codegen.
21508 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
21509 return AtomicExpansionKind::None;
21510 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21511 // implement cmpxchg without spilling. If the address being exchanged is also
21512 // on the stack and close enough to the spill slot, this can lead to a
21513 // situation where the monitor always gets cleared and the atomic operation
21514 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21515 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
21516 return AtomicExpansionKind::None;
21518 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
21519 // it.
21520 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
21521 if (Size > 64)
21522 return AtomicExpansionKind::None;
21524 return AtomicExpansionKind::LLSC;
21527 Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
21528 Type *ValueTy, Value *Addr,
21529 AtomicOrdering Ord) const {
21530 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21531 bool IsAcquire = isAcquireOrStronger(Ord);
21533 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp
21534 // intrinsic must return {i64, i64} and we have to recombine them into a
21535 // single i128 here.
21536 if (ValueTy->getPrimitiveSizeInBits() == 128) {
21537 Intrinsic::ID Int =
21538 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
21539 Function *Ldxr = Intrinsic::getDeclaration(M, Int);
21541 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
21542 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
21544 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21545 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
21546 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21547 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21548 return Builder.CreateOr(
21549 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
21552 Type *Tys[] = { Addr->getType() };
21553 Intrinsic::ID Int =
21554 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
21555 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
21557 const DataLayout &DL = M->getDataLayout();
21558 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
21559 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
21560 CI->addParamAttr(
21561 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
21562 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
21564 return Builder.CreateBitCast(Trunc, ValueTy);
21567 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
21568 IRBuilderBase &Builder) const {
21569 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21570 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
21573 Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
21574 Value *Val, Value *Addr,
21575 AtomicOrdering Ord) const {
21576 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21577 bool IsRelease = isReleaseOrStronger(Ord);
21579 // Since the intrinsics must have legal type, the i128 intrinsics take two
21580 // parameters: "i64, i64". We must marshal Val into the appropriate form
21581 // before the call.
21582 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
21583 Intrinsic::ID Int =
21584 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
21585 Function *Stxr = Intrinsic::getDeclaration(M, Int);
21586 Type *Int64Ty = Type::getInt64Ty(M->getContext());
21588 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
21589 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
21590 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
21591 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
21594 Intrinsic::ID Int =
21595 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
21596 Type *Tys[] = { Addr->getType() };
21597 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
21599 const DataLayout &DL = M->getDataLayout();
21600 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
21601 Val = Builder.CreateBitCast(Val, IntValTy);
21603 CallInst *CI = Builder.CreateCall(
21604 Stxr, {Builder.CreateZExtOrBitCast(
21605 Val, Stxr->getFunctionType()->getParamType(0)),
21606 Addr});
21607 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
21608 Attribute::ElementType, Val->getType()));
21609 return CI;
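// An argument needs consecutive registers if it is a scalable type wider than
// 128 bits, or an array type whose flattened members all share the same value
// type.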
21612 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
21613 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
21614 const DataLayout &DL) const {
21615 if (!Ty->isArrayTy()) {
21616 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
21617 return TySize.isScalable() && TySize.getKnownMinSize() > 128;
21620 // All non-aggregate members of the type must have the same type.
21621 SmallVector<EVT> ValueVTs;
21622 ComputeValueVTs(*this, DL, Ty, ValueVTs);
21623 return all_equal(ValueVTs);
21626 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
21627 EVT) const {
21628 return false;
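// Return a pointer at the given byte offset from the thread pointer
// (llvm.thread_pointer); used to address platform-reserved TLS slots.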
21631 static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
21632 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
21633 Function *ThreadPointerFunc =
21634 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
21635 return IRB.CreatePointerCast(
21636 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
21637 Offset),
21638 IRB.getInt8PtrTy()->getPointerTo(0));
21641 Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
21642 // Android provides a fixed TLS slot for the stack cookie. See the definition
21643 // of TLS_SLOT_STACK_GUARD in
21644 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
21645 if (Subtarget->isTargetAndroid())
21646 return UseTlsOffset(IRB, 0x28);
21648 // Fuchsia is similar.
21649 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
21650 if (Subtarget->isTargetFuchsia())
21651 return UseTlsOffset(IRB, -0x10);
21653 return TargetLowering::getIRStackGuard(IRB);
21656 void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
21657 // MSVC CRT provides functionalities for stack protection.
21658 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
21659 // MSVC CRT has a global variable holding security cookie.
21660 M.getOrInsertGlobal("__security_cookie",
21661 Type::getInt8PtrTy(M.getContext()));
21663 // MSVC CRT has a function to validate security cookie.
21664 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
21665 Subtarget->getSecurityCheckCookieName(),
21666 Type::getVoidTy(M.getContext()), Type::getInt8PtrTy(M.getContext()));
21667 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
21668 F->setCallingConv(CallingConv::Win64);
21669 F->addParamAttr(0, Attribute::AttrKind::InReg);
21671 return;
21673 TargetLowering::insertSSPDeclarations(M);
21676 Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
21677 // MSVC CRT has a global variable holding security cookie.
21678 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
21679 return M.getGlobalVariable("__security_cookie");
21680 return TargetLowering::getSDagStackGuard(M);
21683 Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
21684 // MSVC CRT has a function to validate security cookie.
21685 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
21686 return M.getFunction(Subtarget->getSecurityCheckCookieName());
21687 return TargetLowering::getSSPStackGuardCheck(M);
21690 Value *
21691 AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
21692 // Android provides a fixed TLS slot for the SafeStack pointer. See the
21693 // definition of TLS_SLOT_SAFESTACK in
21694 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
21695 if (Subtarget->isTargetAndroid())
21696 return UseTlsOffset(IRB, 0x48);
21698 // Fuchsia is similar.
21699 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
21700 if (Subtarget->isTargetFuchsia())
21701 return UseTlsOffset(IRB, -0x8);
21703 return TargetLowering::getSafeStackPointerLocation(IRB);
21706 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
21707 const Instruction &AndI) const {
21708 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
21709 // this is likely to fold the and/cmp/br into a single tbz instruction. It
21710 // may be beneficial to sink in other cases, but we would have to check that
21711 // the cmp would not get folded into the br to form a cbz for these to be
21712 // beneficial.
21713 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
21714 if (!Mask)
21715 return false;
21716 return Mask->getValue().isPowerOf2();
21719 bool AArch64TargetLowering::
21720 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
21721 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
21722 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
21723 SelectionDAG &DAG) const {
21724 // Does baseline recommend not to perform the fold by default?
21725 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
21726 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
21727 return false;
21728 // Else, if this is a vector shift, prefer 'shl'.
21729 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
21732 bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
21733 SDNode *N) const {
21734 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
21735 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
21736 return false;
21737 return true;
21740 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
21741 // Update IsSplitCSR in AArch64FunctionInfo.
21742 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
21743 AFI->setIsSplitCSR(true);
21746 void AArch64TargetLowering::insertCopiesSplitCSR(
21747 MachineBasicBlock *Entry,
21748 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
21749 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
21750 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
21751 if (!IStart)
21752 return;
21754 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21755 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
21756 MachineBasicBlock::iterator MBBI = Entry->begin();
21757 for (const MCPhysReg *I = IStart; *I; ++I) {
21758 const TargetRegisterClass *RC = nullptr;
21759 if (AArch64::GPR64RegClass.contains(*I))
21760 RC = &AArch64::GPR64RegClass;
21761 else if (AArch64::FPR64RegClass.contains(*I))
21762 RC = &AArch64::FPR64RegClass;
21763 else
21764 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
21766 Register NewVR = MRI->createVirtualRegister(RC);
21767 // Create copy from CSR to a virtual register.
21768 // FIXME: this currently does not emit CFI pseudo-instructions; it works
21769 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
21770 // nounwind. If we want to generalize this later, we may need to emit
21771 // CFI pseudo-instructions.
21772 assert(Entry->getParent()->getFunction().hasFnAttribute(
21773 Attribute::NoUnwind) &&
21774 "Function should be nounwind in insertCopiesSplitCSR!");
21775 Entry->addLiveIn(*I);
21776 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
21777 .addReg(*I);
21779 // Insert the copy-back instructions right before the terminator.
21780 for (auto *Exit : Exits)
21781 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
21782 TII->get(TargetOpcode::COPY), *I)
21783 .addReg(NewVR);
21787 bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
21788 // Integer division on AArch64 is expensive. However, when aggressively
21789 // optimizing for code size, we prefer to use a div instruction, as it is
21790 // usually smaller than the alternative sequence.
21791 // The exception to this is vector division. Since AArch64 doesn't have vector
21792 // integer division, leaving the division as-is is a loss even in terms of
21793 // size, because it will have to be scalarized, while the alternative code
21794 // sequence can be performed in vector form.
21795 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
21796 return OptSize && !VT.isVector();
21799 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
21800 // We want inc-of-add for scalars and sub-of-not for vectors.
21801 return VT.isScalarInteger();
21804 bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
21805 EVT VT) const {
21806 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
21807 // legalize.
21808 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
21809 return false;
21810 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
21813 bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
21814 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
21817 unsigned
21818 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
21819 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
21820 return getPointerTy(DL).getSizeInBits();
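// Otherwise the AAPCS64 va_list is a struct of three pointers and two 32-bit
// offsets.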
21822 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
21825 void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
21826 MachineFrameInfo &MFI = MF.getFrameInfo();
21827 // If we have any vulnerable SVE stack objects then the stack protector
21828 // needs to be placed at the top of the SVE stack area, as the SVE locals
21829 // are placed above the other locals, so we allocate it as if it were a
21830 // scalable vector.
21831 // FIXME: It may be worthwhile having a specific interface for this rather
21832 // than doing it here in finalizeLowering.
21833 if (MFI.hasStackProtectorIndex()) {
21834 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
21835 if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
21836 MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
21837 MFI.setStackID(MFI.getStackProtectorIndex(),
21838 TargetStackID::ScalableVector);
21839 MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
21840 break;
21844 MFI.computeMaxCallFrameSize(MF);
21845 TargetLoweringBase::finalizeLowering(MF);
21848 // Unlike X86, we let frame lowering assign offsets to all catch objects.
21849 bool AArch64TargetLowering::needsFixedCatchObjects() const {
21850 return false;
21853 bool AArch64TargetLowering::shouldLocalize(
21854 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
21855 auto &MF = *MI.getMF();
21856 auto &MRI = MF.getRegInfo();
21857 auto maxUses = [](unsigned RematCost) {
21858 // A cost of 1 means remats are basically free.
21859 if (RematCost == 1)
21860 return std::numeric_limits<unsigned>::max();
21861 if (RematCost == 2)
21862 return 2U;
21864 // Remat is too expensive, only sink if there's one user.
21865 if (RematCost > 2)
21866 return 1U;
21867 llvm_unreachable("Unexpected remat cost");
21870 switch (MI.getOpcode()) {
21871 case TargetOpcode::G_GLOBAL_VALUE: {
21872 // On Darwin, TLS global vars get selected into function calls, which
21874 // we don't want localized, as they can get moved into the middle of
21874 // another call sequence.
21875 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
21876 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
21877 return false;
21878 break;
21880 case TargetOpcode::G_CONSTANT: {
21881 auto *CI = MI.getOperand(1).getCImm();
21882 APInt Imm = CI->getValue();
21883 InstructionCost Cost = TTI->getIntImmCost(
21884 Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
21885 assert(Cost.isValid() && "Expected a valid imm cost");
21887 unsigned RematCost = *Cost.getValue();
21888 Register Reg = MI.getOperand(0).getReg();
21889 unsigned MaxUses = maxUses(RematCost);
21890 // Don't pass the UINT_MAX sentinel value to hasAtMostUserInstrs().
21891 if (MaxUses == std::numeric_limits<unsigned>::max())
21892 --MaxUses;
21893 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
21895 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
21896 // localizable.
21897 case AArch64::ADRP:
21898 case AArch64::G_ADD_LOW:
21899 return true;
21900 default:
21901 break;
21903 return TargetLoweringBase::shouldLocalize(MI, TTI);
21906 bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
21907 if (isa<ScalableVectorType>(Inst.getType()))
21908 return true;
21910 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
21911 if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
21912 return true;
21914 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
21915 if (isa<ScalableVectorType>(AI->getAllocatedType()))
21916 return true;
21919 return false;
21922 // Return the largest legal scalable vector type that matches VT's element type.
21923 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
21924 assert(VT.isFixedLengthVector() &&
21925 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21926 "Expected legal fixed length vector!");
21927 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
21928 default:
21929 llvm_unreachable("unexpected element type for SVE container");
21930 case MVT::i8:
21931 return EVT(MVT::nxv16i8);
21932 case MVT::i16:
21933 return EVT(MVT::nxv8i16);
21934 case MVT::i32:
21935 return EVT(MVT::nxv4i32);
21936 case MVT::i64:
21937 return EVT(MVT::nxv2i64);
21938 case MVT::f16:
21939 return EVT(MVT::nxv8f16);
21940 case MVT::f32:
21941 return EVT(MVT::nxv4f32);
21942 case MVT::f64:
21943 return EVT(MVT::nxv2f64);
21947 // Return a PTRUE with active lanes corresponding to the extent of VT.
21948 static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
21949 EVT VT) {
21950 assert(VT.isFixedLengthVector() &&
21951 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21952 "Expected legal fixed length vector!");
21954 Optional<unsigned> PgPattern =
21955 getSVEPredPatternFromNumElements(VT.getVectorNumElements());
21956 assert(PgPattern && "Unexpected element count for SVE predicate");
21958 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
21959 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
21960 // variants of instructions when available.
21961 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
21962 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
21963 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
21964 if (MaxSVESize && MinSVESize == MaxSVESize &&
21965 MaxSVESize == VT.getSizeInBits())
21966 PgPattern = AArch64SVEPredPattern::all;
21968 MVT MaskVT;
21969 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
21970 default:
21971 llvm_unreachable("unexpected element type for SVE predicate");
21972 case MVT::i8:
21973 MaskVT = MVT::nxv16i1;
21974 break;
21975 case MVT::i16:
21976 case MVT::f16:
21977 MaskVT = MVT::nxv8i1;
21978 break;
21979 case MVT::i32:
21980 case MVT::f32:
21981 MaskVT = MVT::nxv4i1;
21982 break;
21983 case MVT::i64:
21984 case MVT::f64:
21985 MaskVT = MVT::nxv2i1;
21986 break;
21989 return getPTrue(DAG, DL, MaskVT, *PgPattern);
21992 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
21993 EVT VT) {
21994 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21995 "Expected legal scalable vector!");
21996 auto PredTy = VT.changeVectorElementType(MVT::i1);
21997 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
22000 static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
22001 if (VT.isFixedLengthVector())
22002 return getPredicateForFixedLengthVector(DAG, DL, VT);
22004 return getPredicateForScalableVector(DAG, DL, VT);
22007 // Grow V to consume an entire SVE register.
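// For example, a v4i32 value becomes an nxv4i32 value whose first four lanes
// hold the original data and whose remaining lanes are undefined.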
22008 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
22009 assert(VT.isScalableVector() &&
22010 "Expected to convert into a scalable vector!");
22011 assert(V.getValueType().isFixedLengthVector() &&
22012 "Expected a fixed length vector operand!");
22013 SDLoc DL(V);
22014 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22015 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
22018 // Shrink V so it's just big enough to maintain a VT's worth of data.
22019 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
22020 assert(VT.isFixedLengthVector() &&
22021 "Expected to convert into a fixed length vector!");
22022 assert(V.getValueType().isScalableVector() &&
22023 "Expected a scalable vector operand!");
22024 SDLoc DL(V);
22025 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22026 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
22029 // Convert all fixed length vector loads larger than NEON to masked_loads.
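// Roughly, for a v8f32 load this produces:
//   pg: nxv4i1  = ptrue vl8
//   ld: nxv4i32 = masked_load chain, base, offset, pg, undef
//   fp: nxv4f32 = bitcast ld
//   rs: v8f32   = extract_subvector fp, 0
// (floating point loads are performed through an integer container).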
22030 SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
22031 SDValue Op, SelectionDAG &DAG) const {
22032 auto Load = cast<LoadSDNode>(Op);
22034 SDLoc DL(Op);
22035 EVT VT = Op.getValueType();
22036 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22037 EVT LoadVT = ContainerVT;
22038 EVT MemVT = Load->getMemoryVT();
22040 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
22042 if (VT.isFloatingPoint()) {
22043 LoadVT = ContainerVT.changeTypeToInteger();
22044 MemVT = MemVT.changeTypeToInteger();
22047 SDValue NewLoad = DAG.getMaskedLoad(
22048 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
22049 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
22050 Load->getAddressingMode(), Load->getExtensionType());
22052 SDValue Result = NewLoad;
22053 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
22054 EVT ExtendVT = ContainerVT.changeVectorElementType(
22055 Load->getMemoryVT().getVectorElementType());
22057 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
22058 Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
22059 Pg, Result, DAG.getUNDEF(ContainerVT));
22060 } else if (VT.isFloatingPoint()) {
22061 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
22064 Result = convertFromScalableVector(DAG, VT, Result);
22065 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
22066 return DAG.getMergeValues(MergedValues, DL);
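// Turn a fixed length vector boolean mask into an SVE predicate: an all-ones
// mask becomes the governing PTRUE directly; anything else is widened to the
// container type and compared against zero with SETCC_MERGE_ZERO.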
22069 static SDValue convertFixedMaskToScalableVector(SDValue Mask,
22070 SelectionDAG &DAG) {
22071 SDLoc DL(Mask);
22072 EVT InVT = Mask.getValueType();
22073 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
22075 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
22077 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
22078 return Pg;
22080 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
22081 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
22083 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
22084 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
22087 // Convert all fixed length vector masked loads larger than NEON to SVE masked loads.
22088 SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
22089 SDValue Op, SelectionDAG &DAG) const {
22090 auto Load = cast<MaskedLoadSDNode>(Op);
22092 SDLoc DL(Op);
22093 EVT VT = Op.getValueType();
22094 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22096 SDValue Mask = convertFixedMaskToScalableVector(Load->getMask(), DAG);
22098 SDValue PassThru;
22099 bool IsPassThruZeroOrUndef = false;
22101 if (Load->getPassThru()->isUndef()) {
22102 PassThru = DAG.getUNDEF(ContainerVT);
22103 IsPassThruZeroOrUndef = true;
22104 } else {
22105 if (ContainerVT.isInteger())
22106 PassThru = DAG.getConstant(0, DL, ContainerVT);
22107 else
22108 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
22109 if (isZerosVector(Load->getPassThru().getNode()))
22110 IsPassThruZeroOrUndef = true;
22113 SDValue NewLoad = DAG.getMaskedLoad(
22114 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
22115 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
22116 Load->getAddressingMode(), Load->getExtensionType());
22118 SDValue Result = NewLoad;
22119 if (!IsPassThruZeroOrUndef) {
22120 SDValue OldPassThru =
22121 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
22122 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
22125 Result = convertFromScalableVector(DAG, VT, Result);
22126 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
22127 return DAG.getMergeValues(MergedValues, DL);
22130 // Convert all fixed length vector stores larger than NEON to masked_stores.
22131 SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
22132 SDValue Op, SelectionDAG &DAG) const {
22133 auto Store = cast<StoreSDNode>(Op);
22135 SDLoc DL(Op);
22136 EVT VT = Store->getValue().getValueType();
22137 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22138 EVT MemVT = Store->getMemoryVT();
22140 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
22141 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
22143 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
22144 EVT TruncVT = ContainerVT.changeVectorElementType(
22145 Store->getMemoryVT().getVectorElementType());
22146 MemVT = MemVT.changeTypeToInteger();
22147 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
22148 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
22149 DAG.getUNDEF(TruncVT));
22150 NewValue =
22151 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
22152 } else if (VT.isFloatingPoint()) {
22153 MemVT = MemVT.changeTypeToInteger();
22154 NewValue =
22155 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
22158 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
22159 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
22160 Store->getMemOperand(), Store->getAddressingMode(),
22161 Store->isTruncatingStore());
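// Convert fixed length vector masked stores into SVE masked stores by
// widening both the stored value and the mask to scalable container types.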
22164 SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
22165 SDValue Op, SelectionDAG &DAG) const {
22166 auto *Store = cast<MaskedStoreSDNode>(Op);
22168 SDLoc DL(Op);
22169 EVT VT = Store->getValue().getValueType();
22170 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22172 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
22173 SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
22175 return DAG.getMaskedStore(
22176 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
22177 Mask, Store->getMemoryVT(), Store->getMemOperand(),
22178 Store->getAddressingMode(), Store->isTruncatingStore());
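// Lower fixed length vector integer divides. Signed divides by a power-of-two
// splat use AArch64ISD::SRAD_MERGE_OP1 (the ASRD instruction); i32/i64
// element types map directly onto SDIV_PRED/UDIV_PRED; i8/i16 divides are
// widened first because SVE only provides 32-bit and 64-bit integer divides.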
22181 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
22182 SDValue Op, SelectionDAG &DAG) const {
22183 SDLoc dl(Op);
22184 EVT VT = Op.getValueType();
22185 EVT EltVT = VT.getVectorElementType();
22187 bool Signed = Op.getOpcode() == ISD::SDIV;
22188 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
22190 bool Negated;
22191 uint64_t SplatVal;
22192 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
22193 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22194 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
22195 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
22197 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
22198 SDValue Res = DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
22199 if (Negated)
22200 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
22202 return convertFromScalableVector(DAG, VT, Res);
22205 // Scalable vector i32/i64 DIV is supported.
22206 if (EltVT == MVT::i32 || EltVT == MVT::i64)
22207 return LowerToPredicatedOp(Op, DAG, PredOpcode);
22209 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
22210 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22211 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
22212 EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
22213 EVT ScalableWidenedVT = getContainerForFixedLengthVector(DAG, FixedWidenedVT);
22215 // If this is not a full vector, extend, div, and truncate it.
22216 EVT WidenedVT = VT.widenIntegerVectorElementType(*DAG.getContext());
22217 if (DAG.getTargetLoweringInfo().isTypeLegal(WidenedVT)) {
22218 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22219 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(0));
22220 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(1));
22221 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0, Op1);
22222 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
22225 // Convert the operands to scalable vectors.
22226 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
22227 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
22229 // Extend the scalable operands.
22230 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
22231 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
22232 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0);
22233 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1);
22234 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0);
22235 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1);
22237 // Convert back to fixed vectors so the DIV can be further lowered.
22238 Op0Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op0Lo);
22239 Op1Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op1Lo);
22240 Op0Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op0Hi);
22241 Op1Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op1Hi);
22242 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
22243 Op0Lo, Op1Lo);
22244 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
22245 Op0Hi, Op1Hi);
22247 // Convert again to scalable vectors to truncate.
22248 ResultLo = convertToScalableVector(DAG, ScalableWidenedVT, ResultLo);
22249 ResultHi = convertToScalableVector(DAG, ScalableWidenedVT, ResultHi);
22250 SDValue ScalableResult = DAG.getNode(AArch64ISD::UZP1, dl, ContainerVT,
22251 ResultLo, ResultHi);
22253 return convertFromScalableVector(DAG, VT, ScalableResult);
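// Lower fixed length vector sign/zero extends by repeatedly unpacking the low
// half of the container with SUNPKLO/UUNPKLO until the desired element width
// is reached.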
22256 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
22257 SDValue Op, SelectionDAG &DAG) const {
22258 EVT VT = Op.getValueType();
22259 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22261 SDLoc DL(Op);
22262 SDValue Val = Op.getOperand(0);
22263 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
22264 Val = convertToScalableVector(DAG, ContainerVT, Val);
22266 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
22267 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
22269 // Repeatedly unpack Val until the result is of the desired element type.
22270 switch (ContainerVT.getSimpleVT().SimpleTy) {
22271 default:
22272 llvm_unreachable("unimplemented container type");
22273 case MVT::nxv16i8:
22274 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
22275 if (VT.getVectorElementType() == MVT::i16)
22276 break;
22277 [[fallthrough]];
22278 case MVT::nxv8i16:
22279 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
22280 if (VT.getVectorElementType() == MVT::i32)
22281 break;
22282 [[fallthrough]];
22283 case MVT::nxv4i32:
22284 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
22285 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
22286 break;
22289 return convertFromScalableVector(DAG, VT, Val);
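// Lower fixed length vector truncates by repeatedly bitcasting to the next
// narrower element type and using UZP1 of the value with itself to keep only
// the low half of each wider element, until the desired element width is
// reached.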
22292 SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
22293 SDValue Op, SelectionDAG &DAG) const {
22294 EVT VT = Op.getValueType();
22295 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22297 SDLoc DL(Op);
22298 SDValue Val = Op.getOperand(0);
22299 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
22300 Val = convertToScalableVector(DAG, ContainerVT, Val);
22302 // Repeatedly truncate Val until the result is of the desired element type.
22303 switch (ContainerVT.getSimpleVT().SimpleTy) {
22304 default:
22305 llvm_unreachable("unimplemented container type");
22306 case MVT::nxv2i64:
22307 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
22308 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
22309 if (VT.getVectorElementType() == MVT::i32)
22310 break;
22311 [[fallthrough]];
22312 case MVT::nxv4i32:
22313 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
22314 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
22315 if (VT.getVectorElementType() == MVT::i16)
22316 break;
22317 [[fallthrough]];
22318 case MVT::nxv8i16:
22319 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
22320 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
22321 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
22322 break;
22325 return convertFromScalableVector(DAG, VT, Val);
22328 SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
22329 SDValue Op, SelectionDAG &DAG) const {
22330 EVT VT = Op.getValueType();
22331 EVT InVT = Op.getOperand(0).getValueType();
22332 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
22334 SDLoc DL(Op);
22335 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
22336 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
22338 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
22341 SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
22342 SDValue Op, SelectionDAG &DAG) const {
22343 EVT VT = Op.getValueType();
22344 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22346 SDLoc DL(Op);
22347 EVT InVT = Op.getOperand(0).getValueType();
22348 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
22349 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
22351 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
22352 Op.getOperand(1), Op.getOperand(2));
22354 return convertFromScalableVector(DAG, VT, ScalableRes);
22357 // Convert vector operation 'Op' to an equivalent predicated operation whereby
22358 // the original operation's type is used to construct a suitable predicate.
22359 // NOTE: The results for inactive lanes are undefined.
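// For example, lowering a fixed length ISD::FADD on v4f32 with NewOp set to
// AArch64ISD::FADD_PRED roughly produces
//   FADD_PRED(ptrue_vl4, insert_subvector(undef, x, 0),
//             insert_subvector(undef, y, 0))
// on nxv4f32, from which the v4f32 result is extracted again.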
22360 SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
22361 SelectionDAG &DAG,
22362 unsigned NewOp) const {
22363 EVT VT = Op.getValueType();
22364 SDLoc DL(Op);
22365 auto Pg = getPredicateForVector(DAG, DL, VT);
22367 if (VT.isFixedLengthVector()) {
22368 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
22369 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22371 // Create list of operands by converting existing ones to scalable types.
22372 SmallVector<SDValue, 4> Operands = {Pg};
22373 for (const SDValue &V : Op->op_values()) {
22374 if (isa<CondCodeSDNode>(V)) {
22375 Operands.push_back(V);
22376 continue;
22379 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
22380 EVT VTArg = VTNode->getVT().getVectorElementType();
22381 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
22382 Operands.push_back(DAG.getValueType(NewVTArg));
22383 continue;
22386 assert(isTypeLegal(V.getValueType()) &&
22387 "Expected only legal fixed-width types");
22388 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
22391 if (isMergePassthruOpcode(NewOp))
22392 Operands.push_back(DAG.getUNDEF(ContainerVT));
22394 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
22395 return convertFromScalableVector(DAG, VT, ScalableRes);
22398 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
22400 SmallVector<SDValue, 4> Operands = {Pg};
22401 for (const SDValue &V : Op->op_values()) {
22402 assert((!V.getValueType().isVector() ||
22403 V.getValueType().isScalableVector()) &&
22404 "Only scalable vectors are supported!");
22405 Operands.push_back(V);
22408 if (isMergePassthruOpcode(NewOp))
22409 Operands.push_back(DAG.getUNDEF(VT));
22411 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
22414 // If a fixed length vector operation has no side effects when applied to
22415 // undefined elements, we can safely use scalable vectors to perform the same
22416 // operation without needing to worry about predication.
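// For example, a fixed length ISD::AND is simply re-issued as ISD::AND on the
// scalable container type; whatever the inactive lanes produce is discarded
// when the result is shrunk back to the fixed length type.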
22417 SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
22418 SelectionDAG &DAG) const {
22419 EVT VT = Op.getValueType();
22420 assert(useSVEForFixedLengthVectorVT(VT) &&
22421 "Only expected to lower fixed length vector operation!");
22422 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22424 // Create list of operands by converting existing ones to scalable types.
22425 SmallVector<SDValue, 4> Ops;
22426 for (const SDValue &V : Op->op_values()) {
22427 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
22429 // Pass through non-vector operands.
22430 if (!V.getValueType().isVector()) {
22431 Ops.push_back(V);
22432 continue;
22435 // "cast" fixed length vector to a scalable vector.
22436 assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
22437 "Only fixed length vectors are supported!");
22438 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
22441 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
22442 return convertFromScalableVector(DAG, VT, ScalableRes);
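// Lower a strictly-ordered floating point reduction (VECREDUCE_SEQ_FADD) via
// the SVE FADDA instruction: the incoming accumulator is placed in lane 0 and
// FADDA_PRED folds the vector operand into it in lane order.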
22445 SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
22446 SelectionDAG &DAG) const {
22447 SDLoc DL(ScalarOp);
22448 SDValue AccOp = ScalarOp.getOperand(0);
22449 SDValue VecOp = ScalarOp.getOperand(1);
22450 EVT SrcVT = VecOp.getValueType();
22451 EVT ResVT = SrcVT.getVectorElementType();
22453 EVT ContainerVT = SrcVT;
22454 if (SrcVT.isFixedLengthVector()) {
22455 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
22456 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
22459 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
22460 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22462 // Place the scalar accumulator into lane 0 of a scalable vector.
22463 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
22464 DAG.getUNDEF(ContainerVT), AccOp, Zero);
22466 // Perform reduction.
22467 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
22468 Pg, AccOp, VecOp);
22470 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
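// Lower reductions of i1 vectors (predicates): OR and AND reductions become
// PTEST-based checks, while XOR reductions count the active lanes with CNTP
// and take the parity of the result.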
22473 SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
22474 SelectionDAG &DAG) const {
22475 SDLoc DL(ReduceOp);
22476 SDValue Op = ReduceOp.getOperand(0);
22477 EVT OpVT = Op.getValueType();
22478 EVT VT = ReduceOp.getValueType();
22480 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
22481 return SDValue();
22483 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
22485 switch (ReduceOp.getOpcode()) {
22486 default:
22487 return SDValue();
22488 case ISD::VECREDUCE_OR:
22489 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
22490 // The predicate can be 'Op' because
22491 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
22492 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
22493 else
22494 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
22495 case ISD::VECREDUCE_AND: {
22496 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
22497 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
22499 case ISD::VECREDUCE_XOR: {
22500 SDValue ID =
22501 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
22502 if (OpVT == MVT::nxv1i1) {
22503 // Emulate a CNTP on .Q using .D and a different governing predicate.
22504 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
22505 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
22507 SDValue Cntp =
22508 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
22509 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
22513 return SDValue();
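// Lower integer and floating point vector reductions to predicated SVE
// reduction nodes, extracting lane 0 of the (possibly widened) result and
// truncating it back to the scalar type the VECREDUCE node expects.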
22516 SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
22517 SDValue ScalarOp,
22518 SelectionDAG &DAG) const {
22519 SDLoc DL(ScalarOp);
22520 SDValue VecOp = ScalarOp.getOperand(0);
22521 EVT SrcVT = VecOp.getValueType();
22523 if (useSVEForFixedLengthVectorVT(
22524 SrcVT,
22525 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
22526 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
22527 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
22530 // UADDV always returns an i64 result.
22531 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
22532 SrcVT.getVectorElementType();
22533 EVT RdxVT = SrcVT;
22534 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
22535 RdxVT = getPackedSVEVectorVT(ResVT);
22537 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
22538 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
22539 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
22540 Rdx, DAG.getConstant(0, DL, MVT::i64));
22542 // The VEC_REDUCE nodes expect an element-sized result.
22543 if (ResVT != ScalarOp.getValueType())
22544 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
22546 return Res;
22549 SDValue
22550 AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
22551 SelectionDAG &DAG) const {
22552 EVT VT = Op.getValueType();
22553 SDLoc DL(Op);
22555 EVT InVT = Op.getOperand(1).getValueType();
22556 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
22557 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
22558 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
22560 // Convert the mask to a predicate (NOTE: We don't need to worry about
22561 // inactive lanes since VSELECT is safe when given undefined elements).
22562 EVT MaskVT = Op.getOperand(0).getValueType();
22563 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
22564 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
22565 Mask = DAG.getNode(ISD::TRUNCATE, DL,
22566 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
22568 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
22569 Mask, Op1, Op2);
22571 return convertFromScalableVector(DAG, VT, ScalableRes);
22574 SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
22575 SDValue Op, SelectionDAG &DAG) const {
22576 SDLoc DL(Op);
22577 EVT InVT = Op.getOperand(0).getValueType();
22578 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
22580 assert(useSVEForFixedLengthVectorVT(InVT) &&
22581 "Only expected to lower fixed length vector operation!");
22582 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
22583 "Expected integer result of the same bit length as the inputs!");
22585 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
22586 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
22587 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
22589 EVT CmpVT = Pg.getValueType();
22590 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
22591 {Pg, Op1, Op2, Op.getOperand(2)});
22593 EVT PromoteVT = ContainerVT.changeTypeToInteger();
22594 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
22595 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
22598 SDValue
22599 AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
22600 SelectionDAG &DAG) const {
22601 SDLoc DL(Op);
22602 auto SrcOp = Op.getOperand(0);
22603 EVT VT = Op.getValueType();
22604 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
22605 EVT ContainerSrcVT =
22606 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
22608 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
22609 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
22610 return convertFromScalableVector(DAG, VT, Op);
22613 SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
22614 SDValue Op, SelectionDAG &DAG) const {
22615 SDLoc DL(Op);
22616 unsigned NumOperands = Op->getNumOperands();
22618 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
22619 "Unexpected number of operands in CONCAT_VECTORS");
22621 auto SrcOp1 = Op.getOperand(0);
22622 auto SrcOp2 = Op.getOperand(1);
22623 EVT VT = Op.getValueType();
22624 EVT SrcVT = SrcOp1.getValueType();
22626 if (NumOperands > 2) {
22627 SmallVector<SDValue, 4> Ops;
22628 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
22629 for (unsigned I = 0; I < NumOperands; I += 2)
22630 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
22631 Op->getOperand(I), Op->getOperand(I + 1)));
22633 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
22636 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22638 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
22639 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
22640 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
22642 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
22644 return convertFromScalableVector(DAG, VT, Op);
22647 SDValue
22648 AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
22649 SelectionDAG &DAG) const {
22650 EVT VT = Op.getValueType();
22651 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22653 SDLoc DL(Op);
22654 SDValue Val = Op.getOperand(0);
22655 SDValue Pg = getPredicateForVector(DAG, DL, VT);
22656 EVT SrcVT = Val.getValueType();
22657 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22658 EVT ExtendVT = ContainerVT.changeVectorElementType(
22659 SrcVT.getVectorElementType());
22661 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
22662 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
22664 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
22665 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
22666 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
22667 Pg, Val, DAG.getUNDEF(ContainerVT));
22669 return convertFromScalableVector(DAG, VT, Val);
22672 SDValue
22673 AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
22674 SelectionDAG &DAG) const {
22675 EVT VT = Op.getValueType();
22676 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22678 SDLoc DL(Op);
22679 SDValue Val = Op.getOperand(0);
22680 EVT SrcVT = Val.getValueType();
22681 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
22682 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
22683 VT.getVectorElementType());
22684 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
22686 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
22687 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
22688 Op.getOperand(1), DAG.getUNDEF(RoundVT));
22689 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
22690 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
22692 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
22693 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
22696 SDValue
22697 AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
22698 SelectionDAG &DAG) const {
22699 EVT VT = Op.getValueType();
22700 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22702 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
22703 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
22704 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
22706 SDLoc DL(Op);
22707 SDValue Val = Op.getOperand(0);
22708 EVT SrcVT = Val.getValueType();
22709 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
22710 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
22712 if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
22713 ContainerDstVT.getVectorElementType().getSizeInBits()) {
22714 SDValue Pg = getPredicateForVector(DAG, DL, VT);
22716 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
22717 VT.changeTypeToInteger(), Val);
22719 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
22720 Val = getSVESafeBitCast(ContainerDstVT.changeTypeToInteger(), Val, DAG);
22721 // Safe to use a larger than specified operand since we just unpacked the
22722 // data, hence the upper bits are zero.
22723 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
22724 DAG.getUNDEF(ContainerDstVT));
22725 return convertFromScalableVector(DAG, VT, Val);
22726 } else {
22727 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
22728 ContainerDstVT.getVectorElementType());
22729 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
22731 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
22732 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
22733 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
22734 Val = convertFromScalableVector(DAG, SrcVT, Val);
22736 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
22737 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
22741 SDValue
22742 AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
22743 SelectionDAG &DAG) const {
22744 EVT VT = Op.getValueType();
22745 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22747 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
22748 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
22749 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
22751 SDLoc DL(Op);
22752 SDValue Val = Op.getOperand(0);
22753 EVT SrcVT = Val.getValueType();
22754 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
22755 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
22757 if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
22758 ContainerDstVT.getVectorElementType().getSizeInBits()) {
22759 EVT CvtVT = ContainerDstVT.changeVectorElementType(
22760 ContainerSrcVT.getVectorElementType());
22761 SDValue Pg = getPredicateForVector(DAG, DL, VT);
22763 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
22764 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
22766 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
22767 Val = getSVESafeBitCast(CvtVT, Val, DAG);
22768 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
22769 DAG.getUNDEF(ContainerDstVT));
22770 return convertFromScalableVector(DAG, VT, Val);
22771 } else {
22772 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
22773 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
22775 // Safe to use a larger than specified result since an fp_to_int where the
22776 // result doesn't fit into the destination is undefined.
22777 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
22778 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
22779 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
22781 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
22785 SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
22786 SDValue Op, SelectionDAG &DAG) const {
22787 EVT VT = Op.getValueType();
22788 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22790 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
22791 auto ShuffleMask = SVN->getMask();
22793 SDLoc DL(Op);
22794 SDValue Op1 = Op.getOperand(0);
22795 SDValue Op2 = Op.getOperand(1);
22797 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22798 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
22799 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
22801 bool ReverseEXT = false;
22802 unsigned Imm;
22803 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
22804 Imm == VT.getVectorNumElements() - 1) {
22805 if (ReverseEXT)
22806 std::swap(Op1, Op2);
22808 EVT ScalarTy = VT.getVectorElementType();
22809 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
22810 ScalarTy = MVT::i32;
22811 SDValue Scalar = DAG.getNode(
22812 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
22813 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
22814 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
22815 return convertFromScalableVector(DAG, VT, Op);
22818 for (unsigned LaneSize : {64U, 32U, 16U}) {
22819 if (isREVMask(ShuffleMask, VT, LaneSize)) {
22820 EVT NewVT =
22821 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
22822 unsigned RevOp;
22823 unsigned EltSz = VT.getScalarSizeInBits();
22824 if (EltSz == 8)
22825 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
22826 else if (EltSz == 16)
22827 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
22828 else
22829 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
22831 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
22832 Op = LowerToPredicatedOp(Op, DAG, RevOp);
22833 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
22834 return convertFromScalableVector(DAG, VT, Op);
22838 unsigned WhichResult;
22839 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
22840 return convertFromScalableVector(
22841 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
22843 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
22844 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
22845 return convertFromScalableVector(
22846 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
22849 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
22850 return convertFromScalableVector(
22851 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
22853 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
22854 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
22855 return convertFromScalableVector(
22856 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
22859 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
22860 // represents the same logical operation as performed by a ZIP instruction. In
22861 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
22862 // equivalent to an AArch64 instruction. There's the extra component of
22863 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
22864 // only operated on 64/128bit vector types that have a direct mapping to a
22865 // target register and so an exact mapping is implied.
22866 // However, when using SVE for fixed length vectors, most legal vector types
22867 // are actually sub-vectors of a larger SVE register. When mapping
22868 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
22869 // how the mask's indices translate. Specifically, when the mapping requires
22870 // an exact meaning for a specific vector index (e.g. Index X is the last
22871 // vector element in the register) then such mappings are often only safe when
22872 // the exact SVE register size is known. The main exception to this is when
22873 // indices are logically relative to the first element of either
22874 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
22875 // when converting from fixed-length to scalable vector types (i.e. the start
22876 // of a fixed length vector is always the start of a scalable vector).
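// For example, the ZIP1/TRN1/TRN2 cases above only rely on positions relative
// to the start of each operand and so remain correct after widening, whereas
// ZIP2 and UZP need to know where the fixed length vector ends, which is why
// they are only handled under the exact-register-size check below.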
22877 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22878 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
22879 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
22880 if (ShuffleVectorInst::isReverseMask(ShuffleMask) && Op2.isUndef()) {
22881 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
22882 return convertFromScalableVector(DAG, VT, Op);
22885 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
22886 return convertFromScalableVector(
22887 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
22889 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
22890 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
22891 return convertFromScalableVector(
22892 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
22895 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
22896 return convertFromScalableVector(
22897 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
22899 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
22900 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
22901 return convertFromScalableVector(
22902 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
22906 return SDValue();
22909 SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
22910 SelectionDAG &DAG) const {
22911 SDLoc DL(Op);
22912 EVT InVT = Op.getValueType();
22914 assert(VT.isScalableVector() && isTypeLegal(VT) &&
22915 InVT.isScalableVector() && isTypeLegal(InVT) &&
22916 "Only expect to cast between legal scalable vector types!");
22917 assert(VT.getVectorElementType() != MVT::i1 &&
22918 InVT.getVectorElementType() != MVT::i1 &&
22919 "For predicate bitcasts, use getSVEPredicateBitCast");
22921 if (InVT == VT)
22922 return Op;
22924 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
22925 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
22927 // Safe bitcasting between unpacked vector types of different element counts
22928 // is currently unsupported because the lowering below does not do the work
22929 // needed to ensure the result's elements end up where they are supposed to
22930 // within an SVE register.
22931 //                01234567
22932 // e.g. nxv2i32 = XX??XX??
22933 //      nxv4f16 = X?X?X?X?
22934 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
22935 VT == PackedVT || InVT == PackedInVT) &&
22936 "Unexpected bitcast!");
22938 // Pack input if required.
22939 if (InVT != PackedInVT)
22940 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
22942 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
22944 // Unpack result if required.
22945 if (VT != PackedVT)
22946 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
22948 return Op;
22951 bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
22952 SDValue N) const {
22953 return ::isAllActivePredicate(DAG, N);
22956 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
22957 return ::getPromotedVTForPredicate(VT);
22960 bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
22961 SDValue Op, const APInt &OriginalDemandedBits,
22962 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
22963 unsigned Depth) const {
22965 unsigned Opc = Op.getOpcode();
22966 switch (Opc) {
22967 case AArch64ISD::VSHL: {
22968 // Match (VSHL (VLSHR Val X) X)
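// e.g. if only the top 24 bits of each 32-bit element are demanded, then
// (VSHL (VLSHR Val 8) 8) only clears bits the caller never reads, so it can
// be simplified to just Val.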
22969 SDValue ShiftL = Op;
22970 SDValue ShiftR = Op->getOperand(0);
22971 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
22972 return false;
22974 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
22975 return false;
22977 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
22978 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
22980 // Other cases can be handled as well, but this is not
22981 // implemented.
22982 if (ShiftRBits != ShiftLBits)
22983 return false;
22985 unsigned ScalarSize = Op.getScalarValueSizeInBits();
22986 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
22988 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
22989 APInt UnusedBits = ~OriginalDemandedBits;
22991 if ((ZeroBits & UnusedBits) != ZeroBits)
22992 return false;
22994 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
22995 // used - simplify to just Val.
22996 return TLO.CombineTo(Op, ShiftR->getOperand(0));
23000 return TargetLowering::SimplifyDemandedBitsForTargetNode(
23001 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
23004 bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
23005 return Op.getOpcode() == AArch64ISD::DUP ||
23006 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23007 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
23008 TargetLowering::isTargetCanonicalConstantNode(Op);
23011 bool AArch64TargetLowering::isConstantUnsignedBitfieldExtractLegal(
23012 unsigned Opc, LLT Ty1, LLT Ty2) const {
23013 return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));