llvm/lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file defines the interfaces that X86 uses to lower LLVM code into a
  10 // selection DAG.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "X86ISelLowering.h"
  15 #include "MCTargetDesc/X86ShuffleDecode.h"
  16 #include "X86.h"
  17 #include "X86CallingConv.h"
  18 #include "X86FrameLowering.h"
  19 #include "X86InstrBuilder.h"
  20 #include "X86IntrinsicsInfo.h"
  21 #include "X86MachineFunctionInfo.h"
  22 #include "X86TargetMachine.h"
  23 #include "X86TargetObjectFile.h"
  24 #include "llvm/ADT/SmallBitVector.h"
  25 #include "llvm/ADT/SmallSet.h"
  26 #include "llvm/ADT/Statistic.h"
  27 #include "llvm/ADT/StringExtras.h"
  28 #include "llvm/ADT/StringSwitch.h"
  29 #include "llvm/Analysis/BlockFrequencyInfo.h"
  30 #include "llvm/Analysis/ObjCARCUtil.h"
  31 #include "llvm/Analysis/ProfileSummaryInfo.h"
  32 #include "llvm/Analysis/VectorUtils.h"
  33 #include "llvm/CodeGen/IntrinsicLowering.h"
  34 #include "llvm/CodeGen/MachineFrameInfo.h"
  35 #include "llvm/CodeGen/MachineFunction.h"
  36 #include "llvm/CodeGen/MachineInstrBuilder.h"
  37 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  38 #include "llvm/CodeGen/MachineLoopInfo.h"
  39 #include "llvm/CodeGen/MachineModuleInfo.h"
  40 #include "llvm/CodeGen/MachineRegisterInfo.h"
  41 #include "llvm/CodeGen/TargetLowering.h"
  42 #include "llvm/CodeGen/WinEHFuncInfo.h"
  43 #include "llvm/IR/CallingConv.h"
  44 #include "llvm/IR/Constants.h"
  45 #include "llvm/IR/DerivedTypes.h"
  46 #include "llvm/IR/DiagnosticInfo.h"
  47 #include "llvm/IR/EHPersonalities.h"
  48 #include "llvm/IR/Function.h"
  49 #include "llvm/IR/GlobalAlias.h"
  50 #include "llvm/IR/GlobalVariable.h"
  51 #include "llvm/IR/IRBuilder.h"
  52 #include "llvm/IR/Instructions.h"
  53 #include "llvm/IR/Intrinsics.h"
  54 #include "llvm/IR/PatternMatch.h"
  55 #include "llvm/MC/MCAsmInfo.h"
  56 #include "llvm/MC/MCContext.h"
  57 #include "llvm/MC/MCExpr.h"
  58 #include "llvm/MC/MCSymbol.h"
  59 #include "llvm/Support/CommandLine.h"
  60 #include "llvm/Support/Debug.h"
  61 #include "llvm/Support/ErrorHandling.h"
  62 #include "llvm/Support/KnownBits.h"
  63 #include "llvm/Support/MathExtras.h"
  64 #include "llvm/Target/TargetOptions.h"
  65 #include <algorithm>
  66 #include <bitset>
  67 #include <cctype>
  68 #include <numeric>
  69 using namespace llvm;
  70
  71 #define DEBUG_TYPE "x86-isel"
  72
  73 STATISTIC(NumTailCalls, "Number of tail calls");
  74
  75 static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
  76     "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
  77     cl::desc(
  78         "Sets the preferable loop alignment for experiments (as log2 bytes) "
  79         "for innermost loops only. If specified, this option overrides "
  80         "alignment set by x86-experimental-pref-loop-alignment."),
  81     cl::Hidden);
  82
  83 static cl::opt<bool> MulConstantOptimization(
  84     "mul-constant-optimization", cl::init(true),
  85     cl::desc("Replace 'mul x, Const' with more effective instructions like "
  86              "SHIFT, LEA, etc."),
  87     cl::Hidden);
  88
  89 static cl::opt<bool> ExperimentalUnorderedISEL(
  90     "x86-experimental-unordered-atomic-isel", cl::init(false),
  91     cl::desc("Use LoadSDNode and StoreSDNode instead of "
  92              "AtomicSDNode for unordered atomic loads and "
  93              "stores respectively."),
  94     cl::Hidden);
  95
  96 /// Call this when the user attempts to do something unsupported, like
  97 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
  98 /// report_fatal_error, so calling code should attempt to recover without
  99 /// crashing.
 100 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
 101                              const char *Msg) {
 102   MachineFunction &MF = DAG.getMachineFunction();
 103   DAG.getContext()->diagnose(
 104       DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
 105 }
 106
 107 /// Returns true if a CC can dynamically exclude a register from the list of
 108 /// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
 109 /// the return registers.
 110 static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
 111   switch (CC) {
 112   default:
 113     return false;
 114   case CallingConv::X86_RegCall:
 115   case CallingConv::PreserveMost:
 116   case CallingConv::PreserveAll:
 117     return true;
 118   }
 119 }
 120
 121 /// Returns true if a CC can dynamically exclude a register from the list of
 122 /// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on
 123 /// the parameters.
 124 static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
 125   return CC == CallingConv::X86_RegCall;
 126 }
 127
 128 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 129                                      const X86Subtarget &STI)
 130     : TargetLowering(TM), Subtarget(STI) {
 131   bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
 132   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
 133
 134   // Set up the TargetLowering object.
 135
 136   // X86 is weird. It always uses i8 for shift amounts and setcc results.
 137   setBooleanContents(ZeroOrOneBooleanContent);
 138   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
 139   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 140
 141   // For 64-bit, since we have so many registers, use the ILP scheduler.
 142   // For 32-bit, use the register pressure specific scheduling.
 143   // For Atom, always use ILP scheduling.
 144   if (Subtarget.isAtom())
 145     setSchedulingPreference(Sched::ILP);
 146   else if (Subtarget.is64Bit())
 147     setSchedulingPreference(Sched::ILP);
 148   else
 149     setSchedulingPreference(Sched::RegPressure);
 150   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
 151   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 152
 153   // Bypass expensive divides and use cheaper ones.
 154   if (TM.getOptLevel() >= CodeGenOpt::Default) {
 155     if (Subtarget.hasSlowDivide32())
 156       addBypassSlowDiv(32, 8);
 157     if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
 158       addBypassSlowDiv(64, 32);
 159   }
 160
 161   // Setup Windows compiler runtime calls.
 162   if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
 163     static const struct {
 164       const RTLIB::Libcall Op;
 165       const char * const Name;
 166       const CallingConv::ID CC;
 167     } LibraryCalls[] = {
 168       { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
 169       { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
 170       { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
 171       { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
 172       { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
 173     };
 174
 175     for (const auto &LC : LibraryCalls) {
 176       setLibcallName(LC.Op, LC.Name);
 177       setLibcallCallingConv(LC.Op, LC.CC);
 178     }
 179   }
 180
 181   if (Subtarget.getTargetTriple().isOSMSVCRT()) {
 182     // MSVCRT doesn't have powi; fall back to pow
 183     setLibcallName(RTLIB::POWI_F32, nullptr);
 184     setLibcallName(RTLIB::POWI_F64, nullptr);
 185   }
 186
 187   // If we don't have cmpxchg8b(meaning this is a 386/486), limit atomic size to
 188   // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
 189   // FIXME: Should we be limiting the atomic size on other configs? Default is
 190   // 1024.
 191   if (!Subtarget.canUseCMPXCHG8B())
 192     setMaxAtomicSizeInBitsSupported(32);
 193
 194   setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
 195
 196   setMaxLargeFPConvertBitWidthSupported(128);
 197
 198   // Set up the register classes.
 199   addRegisterClass(MVT::i8, &X86::GR8RegClass);
 200   addRegisterClass(MVT::i16, &X86::GR16RegClass);
 201   addRegisterClass(MVT::i32, &X86::GR32RegClass);
 202   if (Subtarget.is64Bit())
 203     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 204
 205   for (MVT VT : MVT::integer_valuetypes())
 206     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 207
 208   // We don't accept any truncstore of integer registers.
 209   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 210   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 211   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 212   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 213   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 214   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 215
 216   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 217
 218   // SETOEQ and SETUNE require checking two conditions.
 219   for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
 220     setCondCodeAction(ISD::SETOEQ, VT, Expand);
 221     setCondCodeAction(ISD::SETUNE, VT, Expand);
 222   }
 223
 224   // Integer absolute.
 225   if (Subtarget.canUseCMOV()) {
 226     setOperationAction(ISD::ABS            , MVT::i16  , Custom);
 227     setOperationAction(ISD::ABS            , MVT::i32  , Custom);
 228     if (Subtarget.is64Bit())
 229       setOperationAction(ISD::ABS          , MVT::i64  , Custom);
 230   }
 231
 232   // Absolute difference.
 233   for (auto Op : {ISD::ABDS, ISD::ABDU}) {
 234     setOperationAction(Op                  , MVT::i8   , Custom);
 235     setOperationAction(Op                  , MVT::i16  , Custom);
 236     setOperationAction(Op                  , MVT::i32  , Custom);
 237     if (Subtarget.is64Bit())
 238      setOperationAction(Op                 , MVT::i64  , Custom);
 239   }
 240
 241   // Signed saturation subtraction.
 242   setOperationAction(ISD::SSUBSAT          , MVT::i8   , Custom);
 243   setOperationAction(ISD::SSUBSAT          , MVT::i16  , Custom);
 244   setOperationAction(ISD::SSUBSAT          , MVT::i32  , Custom);
 245   if (Subtarget.is64Bit())
 246     setOperationAction(ISD::SSUBSAT        , MVT::i64  , Custom);
 247
 248   // Funnel shifts.
 249   for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
 250     // For slow shld targets we only lower for code size.
 251     LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
 252
 253     setOperationAction(ShiftOp             , MVT::i8   , Custom);
 254     setOperationAction(ShiftOp             , MVT::i16  , Custom);
 255     setOperationAction(ShiftOp             , MVT::i32  , ShiftDoubleAction);
 256     if (Subtarget.is64Bit())
 257       setOperationAction(ShiftOp           , MVT::i64  , ShiftDoubleAction);
 258   }
 259
 260   if (!Subtarget.useSoftFloat()) {
 261     // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 262     // operation.
 263     setOperationAction(ISD::UINT_TO_FP,        MVT::i8, Promote);
 264     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
 265     setOperationAction(ISD::UINT_TO_FP,        MVT::i16, Promote);
 266     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
 267     // We have an algorithm for SSE2, and we turn this into a 64-bit
 268     // FILD or VCVTUSI2SS/SD for other targets.
 269     setOperationAction(ISD::UINT_TO_FP,        MVT::i32, Custom);
 270     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
 271     // We have an algorithm for SSE2->double, and we turn this into a
 272     // 64-bit FILD followed by conditional FADD for other targets.
 273     setOperationAction(ISD::UINT_TO_FP,        MVT::i64, Custom);
 274     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
 275
 276     // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 277     // this operation.
 278     setOperationAction(ISD::SINT_TO_FP,        MVT::i8, Promote);
 279     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
 280     // SSE has no i16 to fp conversion, only i32. We promote in the handler
 281     // to allow f80 to use i16 and f64 to use i16 with sse1 only
 282     setOperationAction(ISD::SINT_TO_FP,        MVT::i16, Custom);
 283     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
 284     // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
 285     setOperationAction(ISD::SINT_TO_FP,        MVT::i32, Custom);
 286     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
 287     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 288     // are Legal, f80 is custom lowered.
 289     setOperationAction(ISD::SINT_TO_FP,        MVT::i64, Custom);
 290     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
 291
 292     // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
 293     // this operation.
 294     setOperationAction(ISD::FP_TO_SINT,        MVT::i8,  Promote);
 295     // FIXME: This doesn't generate invalid exception when it should. PR44019.
 296     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8,  Promote);
 297     setOperationAction(ISD::FP_TO_SINT,        MVT::i16, Custom);
 298     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
 299     setOperationAction(ISD::FP_TO_SINT,        MVT::i32, Custom);
 300     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
 301     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 302     // are Legal, f80 is custom lowered.
 303     setOperationAction(ISD::FP_TO_SINT,        MVT::i64, Custom);
 304     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
 305
 306     // Handle FP_TO_UINT by promoting the destination to a larger signed
 307     // conversion.
 308     setOperationAction(ISD::FP_TO_UINT,        MVT::i8,  Promote);
 309     // FIXME: This doesn't generate invalid exception when it should. PR44019.
 310     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8,  Promote);
 311     setOperationAction(ISD::FP_TO_UINT,        MVT::i16, Promote);
 312     // FIXME: This doesn't generate invalid exception when it should. PR44019.
 313     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
 314     setOperationAction(ISD::FP_TO_UINT,        MVT::i32, Custom);
 315     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
 316     setOperationAction(ISD::FP_TO_UINT,        MVT::i64, Custom);
 317     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
 318
 319     setOperationAction(ISD::LRINT,             MVT::f32, Custom);
 320     setOperationAction(ISD::LRINT,             MVT::f64, Custom);
 321     setOperationAction(ISD::LLRINT,            MVT::f32, Custom);
 322     setOperationAction(ISD::LLRINT,            MVT::f64, Custom);
 323
 324     if (!Subtarget.is64Bit()) {
 325       setOperationAction(ISD::LRINT,  MVT::i64, Custom);
 326       setOperationAction(ISD::LLRINT, MVT::i64, Custom);
 327     }
 328   }
 329
 330   if (Subtarget.hasSSE2()) {
 331     // Custom lowering for saturating float to int conversions.
 332     // We handle promotion to larger result types manually.
 333     for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
 334       setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
 335       setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
 336     }
 337     if (Subtarget.is64Bit()) {
 338       setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
 339       setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
 340     }
 341   }
 342
 343   // Handle address space casts between mixed sized pointers.
 344   setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
 345   setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
 346
 347   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 348   if (!Subtarget.hasSSE2()) {
 349     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
 350     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
 351     if (Subtarget.is64Bit()) {
 352       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
 353       // Without SSE, i64->f64 goes through memory.
 354       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
 355     }
 356   } else if (!Subtarget.is64Bit())
 357     setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
 358
 359   // Scalar integer divide and remainder are lowered to use operations that
 360   // produce two results, to match the available instructions. This exposes
 361   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 362   // into a single instruction.
 363   //
 364   // Scalar integer multiply-high is also lowered to use two-result
 365   // operations, to match the available instructions. However, plain multiply
 366   // (low) operations are left as Legal, as there are single-result
 367   // instructions for this in x86. Using the two-result multiply instructions
 368   // when both high and low results are needed must be arranged by dagcombine.
 369   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
 370     setOperationAction(ISD::MULHS, VT, Expand);
 371     setOperationAction(ISD::MULHU, VT, Expand);
 372     setOperationAction(ISD::SDIV, VT, Expand);
 373     setOperationAction(ISD::UDIV, VT, Expand);
 374     setOperationAction(ISD::SREM, VT, Expand);
 375     setOperationAction(ISD::UREM, VT, Expand);
 376   }
 377
 378   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 379   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 380   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
 381                    MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
 382     setOperationAction(ISD::BR_CC,     VT, Expand);
 383     setOperationAction(ISD::SELECT_CC, VT, Expand);
 384   }
 385   if (Subtarget.is64Bit())
 386     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 387   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 388   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 389   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 390
 391   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 392   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 393   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 394   setOperationAction(ISD::FREM             , MVT::f128 , Expand);
 395
 396   if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
 397     setOperationAction(ISD::GET_ROUNDING   , MVT::i32  , Custom);
 398     setOperationAction(ISD::SET_ROUNDING   , MVT::Other, Custom);
 399   }
 400
 401   // Promote the i8 variants and force them on up to i32 which has a shorter
 402   // encoding.
 403   setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
 404   setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 405   // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
 406   // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
 407   // promote that too.
 408   setOperationPromotedToType(ISD::CTTZ           , MVT::i16  , MVT::i32);
 409   setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , MVT::i32);
 410
 411   if (!Subtarget.hasBMI()) {
 412     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
 413     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
 414     if (Subtarget.is64Bit()) {
 415       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
 416       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
 417     }
 418   }
 419
 420   if (Subtarget.hasLZCNT()) {
 421     // When promoting the i8 variants, force them to i32 for a shorter
 422     // encoding.
 423     setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
 424     setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 425   } else {
 426     for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
 427       if (VT == MVT::i64 && !Subtarget.is64Bit())
 428         continue;
 429       setOperationAction(ISD::CTLZ           , VT, Custom);
 430       setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
 431     }
 432   }
 433
 434   for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
 435                   ISD::STRICT_FP_TO_FP16}) {
 436     // Special handling for half-precision floating point conversions.
 437     // If we don't have F16C support, then lower half float conversions
 438     // into library calls.
 439     setOperationAction(
 440         Op, MVT::f32,
 441         (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
 442     // There's never any support for operations beyond MVT::f32.
 443     setOperationAction(Op, MVT::f64, Expand);
 444     setOperationAction(Op, MVT::f80, Expand);
 445     setOperationAction(Op, MVT::f128, Expand);
 446   }
 447
 448   for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
 449     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
 450     setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
 451     setTruncStoreAction(VT, MVT::f16, Expand);
 452     setTruncStoreAction(VT, MVT::bf16, Expand);
 453
 454     setOperationAction(ISD::BF16_TO_FP, VT, Expand);
 455     setOperationAction(ISD::FP_TO_BF16, VT, Custom);
 456   }
 457
 458   setOperationAction(ISD::PARITY, MVT::i8, Custom);
 459   setOperationAction(ISD::PARITY, MVT::i16, Custom);
 460   setOperationAction(ISD::PARITY, MVT::i32, Custom);
 461   if (Subtarget.is64Bit())
 462     setOperationAction(ISD::PARITY, MVT::i64, Custom);
 463   if (Subtarget.hasPOPCNT()) {
 464     setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
 465     // popcntw is longer to encode than popcntl and also has a false dependency
 466     // on the dest that popcntl hasn't had since Cannon Lake.
 467     setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
 468   } else {
 469     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
 470     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
 471     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
 472     if (Subtarget.is64Bit())
 473       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
 474     else
 475       setOperationAction(ISD::CTPOP        , MVT::i64  , Custom);
 476   }
 477
 478   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 479
 480   if (!Subtarget.hasMOVBE())
 481     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
 482
 483   // X86 wants to expand cmov itself.
 484   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
 485     setOperationAction(ISD::SELECT, VT, Custom);
 486     setOperationAction(ISD::SETCC, VT, Custom);
 487     setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
 488     setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
 489   }
 490   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
 491     if (VT == MVT::i64 && !Subtarget.is64Bit())
 492       continue;
 493     setOperationAction(ISD::SELECT, VT, Custom);
 494     setOperationAction(ISD::SETCC,  VT, Custom);
 495   }
 496
 497   // Custom action for SELECT MMX and expand action for SELECT_CC MMX
 498   setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
 499   setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
 500
 501   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 502   // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
 503   // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
 504   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 505   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 506   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
 507   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
 508     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
 509
 510   // Darwin ABI issue.
 511   for (auto VT : { MVT::i32, MVT::i64 }) {
 512     if (VT == MVT::i64 && !Subtarget.is64Bit())
 513       continue;
 514     setOperationAction(ISD::ConstantPool    , VT, Custom);
 515     setOperationAction(ISD::JumpTable       , VT, Custom);
 516     setOperationAction(ISD::GlobalAddress   , VT, Custom);
 517     setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
 518     setOperationAction(ISD::ExternalSymbol  , VT, Custom);
 519     setOperationAction(ISD::BlockAddress    , VT, Custom);
 520   }
 521
 522   // 64-bit shl, sra, srl (iff 32-bit x86)
 523   for (auto VT : { MVT::i32, MVT::i64 }) {
 524     if (VT == MVT::i64 && !Subtarget.is64Bit())
 525       continue;
 526     setOperationAction(ISD::SHL_PARTS, VT, Custom);
 527     setOperationAction(ISD::SRA_PARTS, VT, Custom);
 528     setOperationAction(ISD::SRL_PARTS, VT, Custom);
 529   }
 530
 531   if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
 532     setOperationAction(ISD::PREFETCH      , MVT::Other, Custom);
 533
 534   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 535
 536   // Expand certain atomics
 537   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
 538     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
 539     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 540     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
 541     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
 542     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
 543     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
 544     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 545   }
 546
 547   if (!Subtarget.is64Bit())
 548     setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
 549
 550   if (Subtarget.canUseCMPXCHG16B())
 551     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 552
 553   // FIXME - use subtarget debug flags
 554   if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
 555       !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
 556       TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
 557     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 558   }
 559
 560   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 561   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 562
 563   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
 564   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 565
 566   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 567   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 568   if (Subtarget.isTargetPS())
 569     setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
 570   else
 571     setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
 572
 573   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 574   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 575   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 576   bool Is64Bit = Subtarget.is64Bit();
 577   setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
 578   setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
 579
 580   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 581   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 582
 583   setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
 584
 585   // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
 586   setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
 587   setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
 588
 589   setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
 590
 591   auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
 592     setOperationAction(ISD::FABS, VT, Action);
 593     setOperationAction(ISD::FNEG, VT, Action);
 594     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 595     setOperationAction(ISD::FREM, VT, Action);
 596     setOperationAction(ISD::FMA, VT, Action);
 597     setOperationAction(ISD::FMINNUM, VT, Action);
 598     setOperationAction(ISD::FMAXNUM, VT, Action);
 599     setOperationAction(ISD::FMINIMUM, VT, Action);
 600     setOperationAction(ISD::FMAXIMUM, VT, Action);
 601     setOperationAction(ISD::FSIN, VT, Action);
 602     setOperationAction(ISD::FCOS, VT, Action);
 603     setOperationAction(ISD::FSINCOS, VT, Action);
 604     setOperationAction(ISD::FSQRT, VT, Action);
 605     setOperationAction(ISD::FPOW, VT, Action);
 606     setOperationAction(ISD::FLOG, VT, Action);
 607     setOperationAction(ISD::FLOG2, VT, Action);
 608     setOperationAction(ISD::FLOG10, VT, Action);
 609     setOperationAction(ISD::FEXP, VT, Action);
 610     setOperationAction(ISD::FEXP2, VT, Action);
 611     setOperationAction(ISD::FCEIL, VT, Action);
 612     setOperationAction(ISD::FFLOOR, VT, Action);
 613     setOperationAction(ISD::FNEARBYINT, VT, Action);
 614     setOperationAction(ISD::FRINT, VT, Action);
 615     setOperationAction(ISD::BR_CC, VT, Action);
 616     setOperationAction(ISD::SETCC, VT, Action);
 617     setOperationAction(ISD::SELECT, VT, Custom);
 618     setOperationAction(ISD::SELECT_CC, VT, Action);
 619     setOperationAction(ISD::FROUND, VT, Action);
 620     setOperationAction(ISD::FROUNDEVEN, VT, Action);
 621     setOperationAction(ISD::FTRUNC, VT, Action);
 622     setOperationAction(ISD::FLDEXP, VT, Action);
 623   };
 624
 625   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
 626     // f16, f32 and f64 use SSE.
 627     // Set up the FP register classes.
 628     addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
 629                                                      : &X86::FR16RegClass);
 630     addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
 631                                                      : &X86::FR32RegClass);
 632     addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
 633                                                      : &X86::FR64RegClass);
 634
 635     // Disable f32->f64 extload as we can only generate this in one instruction
 636     // under optsize. So its easier to pattern match (fpext (load)) for that
 637     // case instead of needing to emit 2 instructions for extload in the
 638     // non-optsize case.
 639     setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
 640
 641     for (auto VT : { MVT::f32, MVT::f64 }) {
 642       // Use ANDPD to simulate FABS.
 643       setOperationAction(ISD::FABS, VT, Custom);
 644
 645       // Use XORP to simulate FNEG.
 646       setOperationAction(ISD::FNEG, VT, Custom);
 647
 648       // Use ANDPD and ORPD to simulate FCOPYSIGN.
 649       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
 650
 651       // These might be better off as horizontal vector ops.
 652       setOperationAction(ISD::FADD, VT, Custom);
 653       setOperationAction(ISD::FSUB, VT, Custom);
 654
 655       // We don't support sin/cos/fmod
 656       setOperationAction(ISD::FSIN   , VT, Expand);
 657       setOperationAction(ISD::FCOS   , VT, Expand);
 658       setOperationAction(ISD::FSINCOS, VT, Expand);
 659     }
 660
 661     // Half type will be promoted by default.
 662     setF16Action(MVT::f16, Promote);
 663     setOperationAction(ISD::FADD, MVT::f16, Promote);
 664     setOperationAction(ISD::FSUB, MVT::f16, Promote);
 665     setOperationAction(ISD::FMUL, MVT::f16, Promote);
 666     setOperationAction(ISD::FDIV, MVT::f16, Promote);
 667     setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
 668     setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
 669     setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
 670
 671     setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
 672     setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
 673     setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
 674     setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
 675     setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
 676     setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
 677     setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
 678     setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
 679     setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
 680     setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
 681     setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
 682     setOperationAction(ISD::STRICT_FLDEXP, MVT::f16, Promote);
 683     setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
 684     setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
 685     setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
 686     setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
 687     setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
 688     setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
 689     setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
 690     setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
 691     setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
 692     setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
 693     setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
 694     setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
 695     setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
 696     setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
 697     setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
 698     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
 699     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
 700
 701     setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
 702     setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
 703
 704     // Lower this to MOVMSK plus an AND.
 705     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
 706     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
 707
 708   } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
 709              (UseX87 || Is64Bit)) {
 710     // Use SSE for f32, x87 for f64.
 711     // Set up the FP register classes.
 712     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 713     if (UseX87)
 714       addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 715
 716     // Use ANDPS to simulate FABS.
 717     setOperationAction(ISD::FABS , MVT::f32, Custom);
 718
 719     // Use XORP to simulate FNEG.
 720     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 721
 722     if (UseX87)
 723       setOperationAction(ISD::UNDEF, MVT::f64, Expand);
 724
 725     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 726     if (UseX87)
 727       setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 728     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 729
 730     // We don't support sin/cos/fmod
 731     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 732     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 733     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 734
 735     if (UseX87) {
 736       // Always expand sin/cos functions even though x87 has an instruction.
 737       setOperationAction(ISD::FSIN, MVT::f64, Expand);
 738       setOperationAction(ISD::FCOS, MVT::f64, Expand);
 739       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 740     }
 741   } else if (UseX87) {
 742     // f32 and f64 in x87.
 743     // Set up the FP register classes.
 744     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 745     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 746
 747     for (auto VT : { MVT::f32, MVT::f64 }) {
 748       setOperationAction(ISD::UNDEF,     VT, Expand);
 749       setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 750
 751       // Always expand sin/cos functions even though x87 has an instruction.
 752       setOperationAction(ISD::FSIN   , VT, Expand);
 753       setOperationAction(ISD::FCOS   , VT, Expand);
 754       setOperationAction(ISD::FSINCOS, VT, Expand);
 755     }
 756   }
 757
 758   // Expand FP32 immediates into loads, except for the special cases that are
       // registered as legal below: +/-0.0 and +/-1.0 when f32 lives in the x87
       // RFP32 register class, otherwise only +0.0.
 759   if (isTypeLegal(MVT::f32)) {
 760     if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
 761       addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 762       addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 763       addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 764       addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 765     } else // SSE immediates.
 766       addLegalFPImmediate(APFloat(+0.0f)); // xorps
 767   }
 768   // Expand FP64 immediates into loads, with the same special cases as for
       // FP32 (keyed on the x87 RFP64 register class instead of RFP32).
 769   if (isTypeLegal(MVT::f64)) {
 770     if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
 771       addLegalFPImmediate(APFloat(+0.0)); // FLD0
 772       addLegalFPImmediate(APFloat(+1.0)); // FLD1
 773       addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 774       addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 775     } else // SSE immediates.
 776       addLegalFPImmediate(APFloat(+0.0)); // xorpd
 777   }
 778   // Support the fp16 zero immediate whenever f16 is a legal type.
 779   if (isTypeLegal(MVT::f16))
 780     addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
 781
 782   // Handle constrained floating-point operations on scalars: the STRICT_
       // forms of add/sub/mul/div, FP_ROUND and sqrt are marked Legal for both
       // f32 and f64.
 783   setOperationAction(ISD::STRICT_FADD,      MVT::f32, Legal);
 784   setOperationAction(ISD::STRICT_FADD,      MVT::f64, Legal);
 785   setOperationAction(ISD::STRICT_FSUB,      MVT::f32, Legal);
 786   setOperationAction(ISD::STRICT_FSUB,      MVT::f64, Legal);
 787   setOperationAction(ISD::STRICT_FMUL,      MVT::f32, Legal);
 788   setOperationAction(ISD::STRICT_FMUL,      MVT::f64, Legal);
 789   setOperationAction(ISD::STRICT_FDIV,      MVT::f32, Legal);
 790   setOperationAction(ISD::STRICT_FDIV,      MVT::f64, Legal);
 791   setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f32, Legal);
 792   setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f64, Legal);
 793   setOperationAction(ISD::STRICT_FSQRT,     MVT::f32, Legal);
 794   setOperationAction(ISD::STRICT_FSQRT,     MVT::f64, Legal);
 795
 796   // We don't support FMA; mark the scalar f64/f32 forms as Expand.
 797   setOperationAction(ISD::FMA, MVT::f64, Expand);
 798   setOperationAction(ISD::FMA, MVT::f32, Expand);
 799
 800   // f80 always uses X87: the type only gets a register class (RFP80) when
       // UseX87 is set.
 801   if (UseX87) {
 802     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
 803     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 804     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
         // Register +0.0, -0.0, +1.0 and -1.0 as the legal f80 immediates.
 805     {
 806       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
 807       addLegalFPImmediate(TmpFlt);  // FLD0
 808       TmpFlt.changeSign();
 809       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 810
           // Start from the double 1.0 and convert it to the x87 extended format.
           // 'ignored' is an out-parameter of convert(); it is not read here.
 811       bool ignored;
 812       APFloat TmpFlt2(+1.0);
 813       TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
 814                       &ignored);
 815       addLegalFPImmediate(TmpFlt2);  // FLD1
 816       TmpFlt2.changeSign();
 817       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 818     }
 819
 820     // Always expand sin/cos functions even though x87 has an instruction.
 821     setOperationAction(ISD::FSIN   , MVT::f80, Expand);
 822     setOperationAction(ISD::FCOS   , MVT::f80, Expand);
 823     setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
 824
         // The floor/ceil/trunc/rint/nearbyint rounding ops, FMA and
         // LROUND/LLROUND are expanded for f80; LRINT/LLRINT are custom lowered.
 825     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
 826     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
 827     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
 828     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
 829     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
 830     setOperationAction(ISD::FMA, MVT::f80, Expand);
 831     setOperationAction(ISD::LROUND, MVT::f80, Expand);
 832     setOperationAction(ISD::LLROUND, MVT::f80, Expand);
 833     setOperationAction(ISD::LRINT, MVT::f80, Custom);
 834     setOperationAction(ISD::LLRINT, MVT::f80, Custom);
 835
 836     // Handle constrained floating-point operations on scalar f80.
 837     setOperationAction(ISD::STRICT_FADD     , MVT::f80, Legal);
 838     setOperationAction(ISD::STRICT_FSUB     , MVT::f80, Legal);
 839     setOperationAction(ISD::STRICT_FMUL     , MVT::f80, Legal);
 840     setOperationAction(ISD::STRICT_FDIV     , MVT::f80, Legal);
 841     setOperationAction(ISD::STRICT_FSQRT    , MVT::f80, Legal);
         // When f16 is a legal type, both the plain and the strict FP_EXTEND to
         // f80 are custom lowered; otherwise only STRICT_FP_EXTEND is set, to
         // Legal.
 842     if (isTypeLegal(MVT::f16)) {
 843       setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
 844       setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
 845     } else {
 846       setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
 847     }
 848     // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
 849     // as Custom.
 850     setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
 851   }
 852
 853   // f128 uses xmm registers, but most operations require libcalls. It is only
       // set up for 64-bit, hard-float subtargets that have SSE1.
 854   if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
 855     addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
 856                                                    : &X86::VR128RegClass);
 857
 858     addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
 859
         // Arithmetic and FMA, including the strict variants, become libcalls.
 860     setOperationAction(ISD::FADD,        MVT::f128, LibCall);
 861     setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
 862     setOperationAction(ISD::FSUB,        MVT::f128, LibCall);
 863     setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
 864     setOperationAction(ISD::FDIV,        MVT::f128, LibCall);
 865     setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
 866     setOperationAction(ISD::FMUL,        MVT::f128, LibCall);
 867     setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
 868     setOperationAction(ISD::FMA,         MVT::f128, LibCall);
 869     setOperationAction(ISD::STRICT_FMA,  MVT::f128, LibCall);
 870
         // FABS, FNEG and FCOPYSIGN are custom lowered rather than libcalls.
 871     setOperationAction(ISD::FABS, MVT::f128, Custom);
 872     setOperationAction(ISD::FNEG, MVT::f128, Custom);
 873     setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
 874
         // sin/cos/sincos and sqrt are libcalls as well.
 875     setOperationAction(ISD::FSIN,         MVT::f128, LibCall);
 876     setOperationAction(ISD::STRICT_FSIN,  MVT::f128, LibCall);
 877     setOperationAction(ISD::FCOS,         MVT::f128, LibCall);
 878     setOperationAction(ISD::STRICT_FCOS,  MVT::f128, LibCall);
 879     setOperationAction(ISD::FSINCOS,      MVT::f128, LibCall);
 880     // No STRICT_FSINCOS
 881     setOperationAction(ISD::FSQRT,        MVT::f128, LibCall);
 882     setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
 883
 884     setOperationAction(ISD::FP_EXTEND,        MVT::f128, Custom);
 885     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
 886     // We need to custom handle any FP_ROUND with an f128 input, but
 887     // LegalizeDAG uses the result type to know when to run a custom handler.
 888     // So we have to list all legal floating point result types here.
 889     if (isTypeLegal(MVT::f32)) {
 890       setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
 891       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
 892     }
 893     if (isTypeLegal(MVT::f64)) {
 894       setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
 895       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
 896     }
 897     if (isTypeLegal(MVT::f80)) {
 898       setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
 899       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
 900     }
 901
 902     setOperationAction(ISD::SETCC, MVT::f128, Custom);
 903
         // Expand extending loads that produce f128 from f32/f64/f80, and the
         // matching truncating stores of f128.
 904     setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
 905     setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
 906     setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
 907     setTruncStoreAction(MVT::f128, MVT::f32, Expand);
 908     setTruncStoreAction(MVT::f128, MVT::f64, Expand);
 909     setTruncStoreAction(MVT::f128, MVT::f80, Expand);
 910   }
 911
 912   // Always use a library call for pow (all scalar FP types are set to Expand).
 913   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 914   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 915   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 916   setOperationAction(ISD::FPOW             , MVT::f128 , Expand);
 917
       // log/log2/log10, exp/exp2 and minnum/maxnum are expanded for f80 too.
 918   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 919   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 920   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 921   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 922   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 923   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
 924   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 925
 926   // Some FP actions default to Expand for the f16/f32/f64 vector types listed
       // here. Individual entries are re-registered further down: FCOPYSIGN is set
       // to Custom for v4f32 and v2f64 in the SSE1/SSE2 sections.
 927   for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
 928                    MVT::v4f32, MVT::v8f32,  MVT::v16f32,
 929                    MVT::v2f64, MVT::v4f64,  MVT::v8f64 }) {
 930     setOperationAction(ISD::FSIN,      VT, Expand);
 931     setOperationAction(ISD::FSINCOS,   VT, Expand);
 932     setOperationAction(ISD::FCOS,      VT, Expand);
 933     setOperationAction(ISD::FREM,      VT, Expand);
 934     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 935     setOperationAction(ISD::FPOW,      VT, Expand);
 936     setOperationAction(ISD::FLOG,      VT, Expand);
 937     setOperationAction(ISD::FLOG2,     VT, Expand);
 938     setOperationAction(ISD::FLOG10,    VT, Expand);
 939     setOperationAction(ISD::FEXP,      VT, Expand);
 940     setOperationAction(ISD::FEXP2,     VT, Expand);
 941   }
 942
 943   // First default the operations below to Expand for every fixed-length
 944   // vector type (this loop itself never sets Promote). Then we will selectively
 945   // turn on ones that can be effectively codegen'd.
 946   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
 947     setOperationAction(ISD::SDIV, VT, Expand);
 948     setOperationAction(ISD::UDIV, VT, Expand);
 949     setOperationAction(ISD::SREM, VT, Expand);
 950     setOperationAction(ISD::UREM, VT, Expand);
 951     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
 952     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
 953     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
 954     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
 955     setOperationAction(ISD::FMA,  VT, Expand);
 956     setOperationAction(ISD::FFLOOR, VT, Expand);
 957     setOperationAction(ISD::FCEIL, VT, Expand);
 958     setOperationAction(ISD::FTRUNC, VT, Expand);
 959     setOperationAction(ISD::FRINT, VT, Expand);
 960     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 961     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 962     setOperationAction(ISD::MULHS, VT, Expand);
 963     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 964     setOperationAction(ISD::MULHU, VT, Expand);
 965     setOperationAction(ISD::SDIVREM, VT, Expand);
 966     setOperationAction(ISD::UDIVREM, VT, Expand);
 967     setOperationAction(ISD::CTPOP, VT, Expand);
 968     setOperationAction(ISD::CTTZ, VT, Expand);
 969     setOperationAction(ISD::CTLZ, VT, Expand);
 970     setOperationAction(ISD::ROTL, VT, Expand);
 971     setOperationAction(ISD::ROTR, VT, Expand);
 972     setOperationAction(ISD::BSWAP, VT, Expand);
 973     setOperationAction(ISD::SETCC, VT, Expand);
 974     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 975     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 976     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 977     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 978     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
 979     setOperationAction(ISD::TRUNCATE, VT, Expand);
 980     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
 981     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
 982     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
 983     setOperationAction(ISD::SELECT_CC, VT, Expand);
         // For every pairing of fixed-length vector types, expand truncating
         // stores and sign/zero-extending loads.
 984     for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
 985       setTruncStoreAction(InnerVT, VT, Expand);
 986
 987       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
 988       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 989
 990       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 991       // types, we have to deal with them whether we ask for Expansion or not.
 992       // Setting Expand causes its own optimisation problems though, so leave
 993       // them legal.
 994       if (VT.getVectorElementType() == MVT::i1)
 995         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 996
 997       // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
 998       // split/scalarized right now. bf16 vectors are treated the same way.
 999       if (VT.getVectorElementType() == MVT::f16 ||
1000           VT.getVectorElementType() == MVT::bf16)
1001         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1002     }
1003   }
1004
1005   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1006   // with -msoft-float, disable use of MMX as well.
1007   if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1008     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1009     // No operations on x86mmx are supported; everything uses intrinsics.
1010   }
1011
1012   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
         // With SSE1 and hard float, v4f32 gets a 128-bit vector register class
         // (VR128X when VLX is available, VR128 otherwise).
1013     addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1014                                                     : &X86::VR128RegClass);
1015
1016     setOperationAction(ISD::FMAXIMUM,           MVT::f32, Custom);
1017     setOperationAction(ISD::FMINIMUM,           MVT::f32, Custom);
1018
         // v4f32 operations that are custom lowered.
1019     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
1020     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
1021     setOperationAction(ISD::FCOPYSIGN,          MVT::v4f32, Custom);
1022     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
1023     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
1024     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
1025     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1026     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
1027
         // Custom lower v2f32 loads and stores.
1028     setOperationAction(ISD::LOAD,               MVT::v2f32, Custom);
1029     setOperationAction(ISD::STORE,              MVT::v2f32, Custom);
1030
         // Constrained v4f32 arithmetic and sqrt are Legal.
1031     setOperationAction(ISD::STRICT_FADD,        MVT::v4f32, Legal);
1032     setOperationAction(ISD::STRICT_FSUB,        MVT::v4f32, Legal);
1033     setOperationAction(ISD::STRICT_FMUL,        MVT::v4f32, Legal);
1034     setOperationAction(ISD::STRICT_FDIV,        MVT::v4f32, Legal);
1035     setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f32, Legal);
1036   }
1037
1038   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
         // With SSE2 and hard float, the remaining 128-bit vector types get a
         // register class (VR128X when VLX is available, VR128 otherwise).
1039     addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1040                                                     : &X86::VR128RegClass);
1041
1042     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1043     // registers cannot be used even for integer operations.
1044     addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1045                                                     : &X86::VR128RegClass);
1046     addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1047                                                     : &X86::VR128RegClass);
1048     addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1049                                                     : &X86::VR128RegClass);
1050     addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1051                                                     : &X86::VR128RegClass);
1052     addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1053                                                     : &X86::VR128RegClass);
1054
1055     for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1056       setOperationAction(ISD::FMAXIMUM, VT, Custom);
1057       setOperationAction(ISD::FMINIMUM, VT, Custom);
1058     }
1059
         // Custom handle division/remainder on the narrow (16- to 64-bit) integer
         // vector types.
1060     for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1061                      MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1062       setOperationAction(ISD::SDIV, VT, Custom);
1063       setOperationAction(ISD::SREM, VT, Custom);
1064       setOperationAction(ISD::UDIV, VT, Custom);
1065       setOperationAction(ISD::UREM, VT, Custom);
1066     }
1067
1068     setOperationAction(ISD::MUL,                MVT::v2i8,  Custom);
1069     setOperationAction(ISD::MUL,                MVT::v4i8,  Custom);
1070     setOperationAction(ISD::MUL,                MVT::v8i8,  Custom);
1071
         // Of the multiplies below, only the v8i16 forms are Legal; the others
         // are custom lowered.
1072     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
1073     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
1074     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
1075     setOperationAction(ISD::MULHU,              MVT::v4i32, Custom);
1076     setOperationAction(ISD::MULHS,              MVT::v4i32, Custom);
1077     setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
1078     setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
1079     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
1080     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
1081     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
1082     setOperationAction(ISD::AVGCEILU,           MVT::v16i8, Legal);
1083     setOperationAction(ISD::AVGCEILU,           MVT::v8i16, Legal);
1084
1085     setOperationAction(ISD::SMULO,              MVT::v16i8, Custom);
1086     setOperationAction(ISD::UMULO,              MVT::v16i8, Custom);
1087     setOperationAction(ISD::UMULO,              MVT::v2i32, Custom);
1088
1089     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
1090     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
1091     setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);
1092
         // Signed min/max is Legal only for v8i16 and unsigned min/max only for
         // v16i8; every other combination is custom lowered.
1093     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1094       setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1095       setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1096       setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1097       setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1098     }
1099
1100     setOperationAction(ISD::ABDU,               MVT::v16i8, Custom);
1101     setOperationAction(ISD::ABDS,               MVT::v16i8, Custom);
1102     setOperationAction(ISD::ABDU,               MVT::v8i16, Custom);
1103     setOperationAction(ISD::ABDS,               MVT::v8i16, Custom);
1104     setOperationAction(ISD::ABDU,               MVT::v4i32, Custom);
1105     setOperationAction(ISD::ABDS,               MVT::v4i32, Custom);
1106
         // Saturating add/sub is Legal for v16i8 and v8i16; USUBSAT is custom
         // lowered for v4i32 and v2i64.
1107     setOperationAction(ISD::UADDSAT,            MVT::v16i8, Legal);
1108     setOperationAction(ISD::SADDSAT,            MVT::v16i8, Legal);
1109     setOperationAction(ISD::USUBSAT,            MVT::v16i8, Legal);
1110     setOperationAction(ISD::SSUBSAT,            MVT::v16i8, Legal);
1111     setOperationAction(ISD::UADDSAT,            MVT::v8i16, Legal);
1112     setOperationAction(ISD::SADDSAT,            MVT::v8i16, Legal);
1113     setOperationAction(ISD::USUBSAT,            MVT::v8i16, Legal);
1114     setOperationAction(ISD::SSUBSAT,            MVT::v8i16, Legal);
1115     setOperationAction(ISD::USUBSAT,            MVT::v4i32, Custom);
1116     setOperationAction(ISD::USUBSAT,            MVT::v2i64, Custom);
1117
1118     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1119     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1120     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1121     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1122
1123     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1124       setOperationAction(ISD::SETCC,              VT, Custom);
1125       setOperationAction(ISD::CTPOP,              VT, Custom);
1126       setOperationAction(ISD::ABS,                VT, Custom);
1127
1128       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1129       // setcc all the way to isel and prefer SETGT in some isel patterns.
1130       setCondCodeAction(ISD::SETLT, VT, Custom);
1131       setCondCodeAction(ISD::SETLE, VT, Custom);
1132     }
1133
1134     setOperationAction(ISD::SETCC,          MVT::v2f64, Custom);
1135     setOperationAction(ISD::SETCC,          MVT::v4f32, Custom);
1136     setOperationAction(ISD::STRICT_FSETCC,  MVT::v2f64, Custom);
1137     setOperationAction(ISD::STRICT_FSETCC,  MVT::v4f32, Custom);
1138     setOperationAction(ISD::STRICT_FSETCCS, MVT::v2f64, Custom);
1139     setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f32, Custom);
1140
1141     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1142       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1143       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1144       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1145       setOperationAction(ISD::VSELECT,            VT, Custom);
1146       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1147     }
1148
1149     for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1150       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1151       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1152       setOperationAction(ISD::VSELECT,            VT, Custom);
1153
           // On 32-bit targets, v2i64 does not get the custom element
           // insert/extract actions set below.
1154       if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1155         continue;
1156
1157       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1158       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1159     }
         // v8f16 has a register class, but its arithmetic is expanded.
1160     setF16Action(MVT::v8f16, Expand);
1161     setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1162     setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1163     setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1164     setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1165
1166     // Custom lower selects for all of the 128-bit vector types registered above.
1167     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1168     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1169     setOperationAction(ISD::SELECT,             MVT::v4i32, Custom);
1170     setOperationAction(ISD::SELECT,             MVT::v8i16, Custom);
1171     setOperationAction(ISD::SELECT,             MVT::v8f16, Custom);
1172     setOperationAction(ISD::SELECT,             MVT::v16i8, Custom);
1173
1174     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Custom);
1175     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Custom);
1176     setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
1177     setOperationAction(ISD::FP_TO_UINT,         MVT::v2i32, Custom);
1178     setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v4i32, Custom);
1179     setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v2i32, Custom);
1180
1181     // Custom legalize these to avoid over promotion or custom promotion.
1182     for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1183       setOperationAction(ISD::FP_TO_SINT,        VT, Custom);
1184       setOperationAction(ISD::FP_TO_UINT,        VT, Custom);
1185       setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1186       setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1187     }
1188
1189     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Custom);
1190     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v4i32, Custom);
1191     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
1192     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2i32, Custom);
1193
1194     setOperationAction(ISD::UINT_TO_FP,         MVT::v2i32, Custom);
1195     setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2i32, Custom);
1196
1197     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
1198     setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v4i32, Custom);
1199
1200     // Fast v2f32 SINT_TO_FP/UINT_TO_FP( v2i32 ) custom conversion.
1201     setOperationAction(ISD::SINT_TO_FP,         MVT::v2f32, Custom);
1202     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2f32, Custom);
1203     setOperationAction(ISD::UINT_TO_FP,         MVT::v2f32, Custom);
1204     setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2f32, Custom);
1205
1206     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1207     setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v2f32, Custom);
1208     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1209     setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v2f32, Custom);
1210
1211     // We want to legalize this to an f64 load rather than an i64 load on
1212     // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1213     // store.
1214     setOperationAction(ISD::LOAD,               MVT::v2i32, Custom);
1215     setOperationAction(ISD::LOAD,               MVT::v4i16, Custom);
1216     setOperationAction(ISD::LOAD,               MVT::v8i8,  Custom);
1217     setOperationAction(ISD::STORE,              MVT::v2i32, Custom);
1218     setOperationAction(ISD::STORE,              MVT::v4i16, Custom);
1219     setOperationAction(ISD::STORE,              MVT::v8i8,  Custom);
1220
1221     // Add 32-bit vector stores to help vectorization opportunities.
1222     setOperationAction(ISD::STORE,              MVT::v2i16, Custom);
1223     setOperationAction(ISD::STORE,              MVT::v4i8,  Custom);
1224
1225     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1226     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1227     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
         // Without AVX512, v16i1 bitcasts are custom lowered as well.
1228     if (!Subtarget.hasAVX512())
1229       setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1230
1231     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1232     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1233     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1234
1235     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1236
1237     setOperationAction(ISD::TRUNCATE,    MVT::v2i8,  Custom);
1238     setOperationAction(ISD::TRUNCATE,    MVT::v2i16, Custom);
1239     setOperationAction(ISD::TRUNCATE,    MVT::v2i32, Custom);
1240     setOperationAction(ISD::TRUNCATE,    MVT::v4i8,  Custom);
1241     setOperationAction(ISD::TRUNCATE,    MVT::v4i16, Custom);
1242     setOperationAction(ISD::TRUNCATE,    MVT::v8i8,  Custom);
1243
1244     // In the customized shift lowering, the legal v4i32/v2i64 cases
1245     // in AVX2 will be recognized.
1246     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1247       setOperationAction(ISD::SRL,              VT, Custom);
1248       setOperationAction(ISD::SHL,              VT, Custom);
1249       setOperationAction(ISD::SRA,              VT, Custom);
           // v2i64 gets custom shifts only; rotates and funnel shifts are not
           // registered for it here.
1250       if (VT == MVT::v2i64) continue;
1251       setOperationAction(ISD::ROTL,             VT, Custom);
1252       setOperationAction(ISD::ROTR,             VT, Custom);
1253       setOperationAction(ISD::FSHL,             VT, Custom);
1254       setOperationAction(ISD::FSHR,             VT, Custom);
1255     }
1256
         // Constrained v2f64 sqrt and arithmetic are Legal.
1257     setOperationAction(ISD::STRICT_FSQRT,       MVT::v2f64, Legal);
1258     setOperationAction(ISD::STRICT_FADD,        MVT::v2f64, Legal);
1259     setOperationAction(ISD::STRICT_FSUB,        MVT::v2f64, Legal);
1260     setOperationAction(ISD::STRICT_FMUL,        MVT::v2f64, Legal);
1261     setOperationAction(ISD::STRICT_FDIV,        MVT::v2f64, Legal);
1262   }
1263
1264   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1265     setOperationAction(ISD::ABS,                MVT::v16i8, Legal);
1266     setOperationAction(ISD::ABS,                MVT::v8i16, Legal);
1267     setOperationAction(ISD::ABS,                MVT::v4i32, Legal);
1268     setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
1269     setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
1270     setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
1271     setOperationAction(ISD::CTLZ,               MVT::v4i32, Custom);
1272     setOperationAction(ISD::CTLZ,               MVT::v2i64, Custom);
1273
1274     // These might be better off as horizontal vector ops.
1275     setOperationAction(ISD::ADD,                MVT::i16, Custom);
1276     setOperationAction(ISD::ADD,                MVT::i32, Custom);
1277     setOperationAction(ISD::SUB,                MVT::i16, Custom);
1278     setOperationAction(ISD::SUB,                MVT::i32, Custom);
1279   }
1280
1281   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1282     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1283       setOperationAction(ISD::FFLOOR,            RoundedTy,  Legal);
1284       setOperationAction(ISD::STRICT_FFLOOR,     RoundedTy,  Legal);
1285       setOperationAction(ISD::FCEIL,             RoundedTy,  Legal);
1286       setOperationAction(ISD::STRICT_FCEIL,      RoundedTy,  Legal);
1287       setOperationAction(ISD::FTRUNC,            RoundedTy,  Legal);
1288       setOperationAction(ISD::STRICT_FTRUNC,     RoundedTy,  Legal);
1289       setOperationAction(ISD::FRINT,             RoundedTy,  Legal);
1290       setOperationAction(ISD::STRICT_FRINT,      RoundedTy,  Legal);
1291       setOperationAction(ISD::FNEARBYINT,        RoundedTy,  Legal);
1292       setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy,  Legal);
1293       setOperationAction(ISD::FROUNDEVEN,        RoundedTy,  Legal);
1294       setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy,  Legal);
1295
1296       setOperationAction(ISD::FROUND,            RoundedTy,  Custom);
1297     }
1298
1299     setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
1300     setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
1301     setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
1302     setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
1303     setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
1304     setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
1305     setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
1306     setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
1307
1308     for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1309       setOperationAction(ISD::ABDS,             VT, Custom);
1310       setOperationAction(ISD::ABDU,             VT, Custom);
1311     }
1312
1313     setOperationAction(ISD::UADDSAT,            MVT::v4i32, Custom);
1314     setOperationAction(ISD::SADDSAT,            MVT::v2i64, Custom);
1315     setOperationAction(ISD::SSUBSAT,            MVT::v2i64, Custom);
1316
1317     // FIXME: Do we need to handle scalar-to-vector here?
1318     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1319     setOperationAction(ISD::SMULO,              MVT::v2i32, Custom);
1320
1321     // We directly match byte blends in the backend as they match the VSELECT
1322     // condition form.
1323     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1324
1325     // SSE41 brings specific instructions for doing vector sign extend even in
1326     // cases where we don't have SRA.
1327     for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1328       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1329       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1330     }
1331
1332     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1333     for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1334       setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8,  Legal);
1335       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8,  Legal);
1336       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8,  Legal);
1337       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1338       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1339       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1340     }
1341
1342     if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1343       // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1344       // do the pre and post work in the vector domain.
1345       setOperationAction(ISD::UINT_TO_FP,        MVT::v4i64, Custom);
1346       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1347       // We need to mark SINT_TO_FP as Custom even though we want to expand it
1348       // so that DAG combine doesn't try to turn it into uint_to_fp.
1349       setOperationAction(ISD::SINT_TO_FP,        MVT::v4i64, Custom);
1350       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1351     }
1352   }
1353
1354   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1355     setOperationAction(ISD::UADDSAT,            MVT::v2i64, Custom);
1356   }
1357
1358   if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1359     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
1360                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1361       setOperationAction(ISD::ROTL, VT, Custom);
1362       setOperationAction(ISD::ROTR, VT, Custom);
1363     }
1364
1365     // XOP can efficiently perform BITREVERSE with VPPERM.
1366     for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1367       setOperationAction(ISD::BITREVERSE, VT, Custom);
1368
1369     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
1370                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1371       setOperationAction(ISD::BITREVERSE, VT, Custom);
1372   }
1373
1374   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1375     bool HasInt256 = Subtarget.hasInt256();
1376
1377     addRegisterClass(MVT::v32i8,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1378                                                      : &X86::VR256RegClass);
1379     addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1380                                                      : &X86::VR256RegClass);
1381     addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1382                                                      : &X86::VR256RegClass);
1383     addRegisterClass(MVT::v8i32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1384                                                      : &X86::VR256RegClass);
1385     addRegisterClass(MVT::v8f32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1386                                                      : &X86::VR256RegClass);
1387     addRegisterClass(MVT::v4i64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1388                                                      : &X86::VR256RegClass);
1389     addRegisterClass(MVT::v4f64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
1390                                                      : &X86::VR256RegClass);
1391
1392     for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1393       setOperationAction(ISD::FFLOOR,            VT, Legal);
1394       setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
1395       setOperationAction(ISD::FCEIL,             VT, Legal);
1396       setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
1397       setOperationAction(ISD::FTRUNC,            VT, Legal);
1398       setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
1399       setOperationAction(ISD::FRINT,             VT, Legal);
1400       setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
1401       setOperationAction(ISD::FNEARBYINT,        VT, Legal);
1402       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1403       setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
1404       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1405
1406       setOperationAction(ISD::FROUND,            VT, Custom);
1407
1408       setOperationAction(ISD::FNEG,              VT, Custom);
1409       setOperationAction(ISD::FABS,              VT, Custom);
1410       setOperationAction(ISD::FCOPYSIGN,         VT, Custom);
1411
1412       setOperationAction(ISD::FMAXIMUM,          VT, Custom);
1413       setOperationAction(ISD::FMINIMUM,          VT, Custom);
1414     }
1415
1416     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1417     // even though v8i16 is a legal type.
1418     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i16, MVT::v8i32);
1419     setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i16, MVT::v8i32);
1420     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1421     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1422     setOperationAction(ISD::FP_TO_SINT,                MVT::v8i32, Custom);
1423     setOperationAction(ISD::FP_TO_UINT,                MVT::v8i32, Custom);
1424     setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v8i32, Custom);
1425
1426     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Custom);
1427     setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v8i32, Custom);
1428     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Expand);
1429     setOperationAction(ISD::FP_ROUND,           MVT::v8f16, Expand);
1430     setOperationAction(ISD::FP_EXTEND,          MVT::v4f64, Custom);
1431     setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v4f64, Custom);
1432
1433     setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v4f32, Legal);
1434     setOperationAction(ISD::STRICT_FADD,        MVT::v8f32, Legal);
1435     setOperationAction(ISD::STRICT_FADD,        MVT::v4f64, Legal);
1436     setOperationAction(ISD::STRICT_FSUB,        MVT::v8f32, Legal);
1437     setOperationAction(ISD::STRICT_FSUB,        MVT::v4f64, Legal);
1438     setOperationAction(ISD::STRICT_FMUL,        MVT::v8f32, Legal);
1439     setOperationAction(ISD::STRICT_FMUL,        MVT::v4f64, Legal);
1440     setOperationAction(ISD::STRICT_FDIV,        MVT::v8f32, Legal);
1441     setOperationAction(ISD::STRICT_FDIV,        MVT::v4f64, Legal);
1442     setOperationAction(ISD::STRICT_FSQRT,       MVT::v8f32, Legal);
1443     setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f64, Legal);
1444
1445     if (!Subtarget.hasAVX512())
1446       setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1447
1448     // In the customized shift lowering, the legal v8i32/v4i64 cases
1449     // in AVX2 will be recognized.
1450     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1451       setOperationAction(ISD::SRL,             VT, Custom);
1452       setOperationAction(ISD::SHL,             VT, Custom);
1453       setOperationAction(ISD::SRA,             VT, Custom);
1454       setOperationAction(ISD::ABDS,            VT, Custom);
1455       setOperationAction(ISD::ABDU,            VT, Custom);
1456       if (VT == MVT::v4i64) continue;
1457       setOperationAction(ISD::ROTL,            VT, Custom);
1458       setOperationAction(ISD::ROTR,            VT, Custom);
1459       setOperationAction(ISD::FSHL,            VT, Custom);
1460       setOperationAction(ISD::FSHR,            VT, Custom);
1461     }
1462
1463     // These types need custom splitting if their input is a 128-bit vector.
1464     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i64,  Custom);
1465     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i32, Custom);
1466     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i64,  Custom);
1467     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i32, Custom);
1468
1469     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1470     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1471     setOperationAction(ISD::SELECT,            MVT::v8i32, Custom);
1472     setOperationAction(ISD::SELECT,            MVT::v16i16, Custom);
1473     setOperationAction(ISD::SELECT,            MVT::v16f16, Custom);
1474     setOperationAction(ISD::SELECT,            MVT::v32i8, Custom);
1475     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1476
1477     for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1478       setOperationAction(ISD::SIGN_EXTEND,     VT, Custom);
1479       setOperationAction(ISD::ZERO_EXTEND,     VT, Custom);
1480       setOperationAction(ISD::ANY_EXTEND,      VT, Custom);
1481     }
1482
1483     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1484     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1485     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1486     setOperationAction(ISD::BITREVERSE,        MVT::v32i8, Custom);
1487
1488     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1489       setOperationAction(ISD::SETCC,           VT, Custom);
1490       setOperationAction(ISD::CTPOP,           VT, Custom);
1491       setOperationAction(ISD::CTLZ,            VT, Custom);
1492
1493       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1494       // setcc all the way to isel and prefer SETGT in some isel patterns.
1495       setCondCodeAction(ISD::SETLT, VT, Custom);
1496       setCondCodeAction(ISD::SETLE, VT, Custom);
1497     }
1498
1499     setOperationAction(ISD::SETCC,          MVT::v4f64, Custom);
1500     setOperationAction(ISD::SETCC,          MVT::v8f32, Custom);
1501     setOperationAction(ISD::STRICT_FSETCC,  MVT::v4f64, Custom);
1502     setOperationAction(ISD::STRICT_FSETCC,  MVT::v8f32, Custom);
1503     setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f64, Custom);
1504     setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f32, Custom);
1505
1506     if (Subtarget.hasAnyFMA()) {
1507       for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1508                        MVT::v2f64, MVT::v4f64 }) {
1509         setOperationAction(ISD::FMA, VT, Legal);
1510         setOperationAction(ISD::STRICT_FMA, VT, Legal);
1511       }
1512     }
1513
1514     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1515       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1516       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1517     }
1518
1519     setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
1520     setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
1521     setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
1522     setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
1523
1524     setOperationAction(ISD::MULHU,     MVT::v8i32,  Custom);
1525     setOperationAction(ISD::MULHS,     MVT::v8i32,  Custom);
1526     setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
1527     setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
1528     setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
1529     setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
1530     setOperationAction(ISD::AVGCEILU,  MVT::v16i16, HasInt256 ? Legal : Custom);
1531     setOperationAction(ISD::AVGCEILU,  MVT::v32i8,  HasInt256 ? Legal : Custom);
1532
1533     setOperationAction(ISD::SMULO,     MVT::v32i8, Custom);
1534     setOperationAction(ISD::UMULO,     MVT::v32i8, Custom);
1535
1536     setOperationAction(ISD::ABS,       MVT::v4i64,  Custom);
1537     setOperationAction(ISD::SMAX,      MVT::v4i64,  Custom);
1538     setOperationAction(ISD::UMAX,      MVT::v4i64,  Custom);
1539     setOperationAction(ISD::SMIN,      MVT::v4i64,  Custom);
1540     setOperationAction(ISD::UMIN,      MVT::v4i64,  Custom);
1541
1542     setOperationAction(ISD::UADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1543     setOperationAction(ISD::SADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1544     setOperationAction(ISD::USUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1545     setOperationAction(ISD::SSUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
1546     setOperationAction(ISD::UADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1547     setOperationAction(ISD::SADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1548     setOperationAction(ISD::USUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1549     setOperationAction(ISD::SSUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
1550     setOperationAction(ISD::UADDSAT,   MVT::v8i32, Custom);
1551     setOperationAction(ISD::USUBSAT,   MVT::v8i32, Custom);
1552     setOperationAction(ISD::UADDSAT,   MVT::v4i64, Custom);
1553     setOperationAction(ISD::USUBSAT,   MVT::v4i64, Custom);
1554
1555     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1556       setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
1557       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1558       setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1559       setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1560       setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1561     }
1562
1563     for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1564       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1565       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1566     }
1567
1568     if (HasInt256) {
1569       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1570       // when we have a 256bit-wide blend with immediate.
1571       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1572       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1573
1574       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1575       for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1576         setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1577         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i8,  Legal);
1578         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i8,  Legal);
1579         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i16, Legal);
1580         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i16, Legal);
1581         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i32, Legal);
1582       }
1583     }
1584
1585     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1586                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1587       setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
1588       setOperationAction(ISD::MSTORE, VT, Legal);
1589     }
1590
1591     // Extract subvector is special because the value type
1592     // (result) is 128-bit but the source is 256-bit wide.
1593     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1594                      MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1595       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1596     }
1597
1598     // Custom lower several nodes for 256-bit types.
1599     for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1600                     MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1601       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1602       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1603       setOperationAction(ISD::VSELECT,            VT, Custom);
1604       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1605       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1606       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1607       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
1608       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1609       setOperationAction(ISD::STORE,              VT, Custom);
1610     }
1611     setF16Action(MVT::v16f16, Expand);
1612     setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1613     setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1614     setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1615     setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1616
1617     if (HasInt256) {
1618       setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1619
1620       // Custom legalize 2x32 to get a little better code.
1621       setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1622       setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1623
1624       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1625                        MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1626         setOperationAction(ISD::MGATHER,  VT, Custom);
1627     }
1628   }
1629
1630   if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1631       Subtarget.hasF16C()) {
1632     for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1633       setOperationAction(ISD::FP_ROUND,           VT, Custom);
1634       setOperationAction(ISD::STRICT_FP_ROUND,    VT, Custom);
1635     }
1636     for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
1637       setOperationAction(ISD::FP_EXTEND,          VT, Custom);
1638       setOperationAction(ISD::STRICT_FP_EXTEND,   VT, Custom);
1639     }
1640     for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1641       setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1642       setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1643     }
1644
1645     setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1646     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
1647   }
1648
1649   // This block controls legalization of the mask vector sizes that are
1650   // available with AVX512. 512-bit vectors are in a separate block controlled
1651   // by useAVX512Regs.
1652   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1653     addRegisterClass(MVT::v1i1,   &X86::VK1RegClass);
1654     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1655     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1656     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1657     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1658
1659     setOperationAction(ISD::SELECT,             MVT::v1i1, Custom);
1660     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1661     setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1, Custom);
1662
1663     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i1,  MVT::v8i32);
1664     setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i1,  MVT::v8i32);
1665     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v4i1,  MVT::v4i32);
1666     setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v4i1,  MVT::v4i32);
1667     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1,  MVT::v8i32);
1668     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1,  MVT::v8i32);
1669     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1,  MVT::v4i32);
1670     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1,  MVT::v4i32);
1671     setOperationAction(ISD::FP_TO_SINT,                MVT::v2i1,  Custom);
1672     setOperationAction(ISD::FP_TO_UINT,                MVT::v2i1,  Custom);
1673     setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v2i1,  Custom);
1674     setOperationAction(ISD::STRICT_FP_TO_UINT,         MVT::v2i1,  Custom);
1675
1676     // There is no byte sized k-register load or store without AVX512DQ.
1677     if (!Subtarget.hasDQI()) {
1678       setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1679       setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1680       setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1681       setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1682
1683       setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1684       setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1685       setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1686       setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1687     }
1688
1689     // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1690     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1691       setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1692       setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1693       setOperationAction(ISD::ANY_EXTEND,  VT, Custom);
1694     }
1695
1696     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1697       setOperationAction(ISD::VSELECT,          VT, Expand);
1698
1699     for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1700       setOperationAction(ISD::SETCC,            VT, Custom);
1701       setOperationAction(ISD::SELECT,           VT, Custom);
1702       setOperationAction(ISD::TRUNCATE,         VT, Custom);
1703
1704       setOperationAction(ISD::BUILD_VECTOR,     VT, Custom);
1705       setOperationAction(ISD::CONCAT_VECTORS,   VT, Custom);
1706       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1707       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1708       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1709       setOperationAction(ISD::VECTOR_SHUFFLE,   VT,  Custom);
1710     }
1711
1712     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1713       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1714   }
1715
1716   // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1717   // elements. 512-bits can be disabled based on prefer-vector-width and
1718   // required-vector-width function attributes.
1719   if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1720     bool HasBWI = Subtarget.hasBWI();
1721
1722     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1723     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1724     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1725     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1726     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1727     addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1728     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1729
1730     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1731       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
1732       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1733       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
1734       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
1735       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
1736       if (HasBWI)
1737         setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1738     }
1739
1740     for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1741       setOperationAction(ISD::FMAXIMUM, VT, Custom);
1742       setOperationAction(ISD::FMINIMUM, VT, Custom);
1743       setOperationAction(ISD::FNEG,  VT, Custom);
1744       setOperationAction(ISD::FABS,  VT, Custom);
1745       setOperationAction(ISD::FMA,   VT, Legal);
1746       setOperationAction(ISD::STRICT_FMA, VT, Legal);
1747       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1748     }
1749
1750     for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1751       setOperationPromotedToType(ISD::FP_TO_SINT       , VT, MVT::v16i32);
1752       setOperationPromotedToType(ISD::FP_TO_UINT       , VT, MVT::v16i32);
1753       setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1754       setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1755     }
1756
1757     for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1758       setOperationAction(ISD::FP_TO_SINT,        VT, Custom);
1759       setOperationAction(ISD::FP_TO_UINT,        VT, Custom);
1760       setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1761       setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1762     }
1763
1764     setOperationAction(ISD::SINT_TO_FP,        MVT::v16i32, Custom);
1765     setOperationAction(ISD::UINT_TO_FP,        MVT::v16i32, Custom);
1766     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
1767     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
1768     setOperationAction(ISD::FP_EXTEND,         MVT::v8f64,  Custom);
1769     setOperationAction(ISD::STRICT_FP_EXTEND,  MVT::v8f64,  Custom);
1770
1771     setOperationAction(ISD::STRICT_FADD,      MVT::v16f32, Legal);
1772     setOperationAction(ISD::STRICT_FADD,      MVT::v8f64,  Legal);
1773     setOperationAction(ISD::STRICT_FSUB,      MVT::v16f32, Legal);
1774     setOperationAction(ISD::STRICT_FSUB,      MVT::v8f64,  Legal);
1775     setOperationAction(ISD::STRICT_FMUL,      MVT::v16f32, Legal);
1776     setOperationAction(ISD::STRICT_FMUL,      MVT::v8f64,  Legal);
1777     setOperationAction(ISD::STRICT_FDIV,      MVT::v16f32, Legal);
1778     setOperationAction(ISD::STRICT_FDIV,      MVT::v8f64,  Legal);
1779     setOperationAction(ISD::STRICT_FSQRT,     MVT::v16f32, Legal);
1780     setOperationAction(ISD::STRICT_FSQRT,     MVT::v8f64,  Legal);
1781     setOperationAction(ISD::STRICT_FP_ROUND,  MVT::v8f32,  Legal);
1782
1783     setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
1784     setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
1785     setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
1786     setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
1787     setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
1788     if (HasBWI)
1789       setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
1790
1791     // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1792     // to 512-bit rather than use the AVX2 instructions so that we can use
1793     // k-masks.
1794     if (!Subtarget.hasVLX()) {
1795       for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1796            MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1797         setOperationAction(ISD::MLOAD,  VT, Custom);
1798         setOperationAction(ISD::MSTORE, VT, Custom);
1799       }
1800     }
1801
1802     setOperationAction(ISD::TRUNCATE,    MVT::v8i32,  Legal);
1803     setOperationAction(ISD::TRUNCATE,    MVT::v16i16, Legal);
1804     setOperationAction(ISD::TRUNCATE,    MVT::v32i8,  HasBWI ? Legal : Custom);
1805     setOperationAction(ISD::TRUNCATE,    MVT::v16i64, Custom);
1806     setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1807     setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1808     setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
1809     setOperationAction(ISD::ANY_EXTEND,  MVT::v32i16, Custom);
1810     setOperationAction(ISD::ANY_EXTEND,  MVT::v16i32, Custom);
1811     setOperationAction(ISD::ANY_EXTEND,  MVT::v8i64,  Custom);
1812     setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1813     setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1814     setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
1815
1816     if (HasBWI) {
1817       // Extends from v64i1 masks to 512-bit vectors.
1818       setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
1819       setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
1820       setOperationAction(ISD::ANY_EXTEND,         MVT::v64i8, Custom);
1821     }
1822
1823     for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1824       setOperationAction(ISD::FFLOOR,            VT, Legal);
1825       setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
1826       setOperationAction(ISD::FCEIL,             VT, Legal);
1827       setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
1828       setOperationAction(ISD::FTRUNC,            VT, Legal);
1829       setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
1830       setOperationAction(ISD::FRINT,             VT, Legal);
1831       setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
1832       setOperationAction(ISD::FNEARBYINT,        VT, Legal);
1833       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1834       setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
1835       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1836
1837       setOperationAction(ISD::FROUND,            VT, Custom);
1838     }
1839
1840     for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1841       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1842       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1843     }
1844
1845     setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1846     setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1847     setOperationAction(ISD::ADD, MVT::v64i8,  HasBWI ? Legal : Custom);
1848     setOperationAction(ISD::SUB, MVT::v64i8,  HasBWI ? Legal : Custom);
1849
1850     setOperationAction(ISD::MUL, MVT::v8i64,  Custom);
1851     setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1852     setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1853     setOperationAction(ISD::MUL, MVT::v64i8,  Custom);
1854
1855     setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1856     setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1857     setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1858     setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1859     setOperationAction(ISD::MULHS, MVT::v64i8,  Custom);
1860     setOperationAction(ISD::MULHU, MVT::v64i8,  Custom);
1861     setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1862     setOperationAction(ISD::AVGCEILU, MVT::v64i8,  HasBWI ? Legal : Custom);
1863
1864     setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1865     setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1866
1867     setOperationAction(ISD::BITREVERSE, MVT::v64i8,  Custom);
1868
1869     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1870       setOperationAction(ISD::SRL,              VT, Custom);
1871       setOperationAction(ISD::SHL,              VT, Custom);
1872       setOperationAction(ISD::SRA,              VT, Custom);
1873       setOperationAction(ISD::ROTL,             VT, Custom);
1874       setOperationAction(ISD::ROTR,             VT, Custom);
1875       setOperationAction(ISD::SETCC,            VT, Custom);
1876       setOperationAction(ISD::ABDS,             VT, Custom);
1877       setOperationAction(ISD::ABDU,             VT, Custom);
1878
1879       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1880       // setcc all the way to isel and prefer SETGT in some isel patterns.
1881       setCondCodeAction(ISD::SETLT, VT, Custom);
1882       setCondCodeAction(ISD::SETLE, VT, Custom);
1883     }
1884
1885     setOperationAction(ISD::SETCC,          MVT::v8f64, Custom);
1886     setOperationAction(ISD::SETCC,          MVT::v16f32, Custom);
1887     setOperationAction(ISD::STRICT_FSETCC,  MVT::v8f64, Custom);
1888     setOperationAction(ISD::STRICT_FSETCC,  MVT::v16f32, Custom);
1889     setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f64, Custom);
1890     setOperationAction(ISD::STRICT_FSETCCS, MVT::v16f32, Custom);
1891
1892     for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1893       setOperationAction(ISD::SMAX,             VT, Legal);
1894       setOperationAction(ISD::UMAX,             VT, Legal);
1895       setOperationAction(ISD::SMIN,             VT, Legal);
1896       setOperationAction(ISD::UMIN,             VT, Legal);
1897       setOperationAction(ISD::ABS,              VT, Legal);
1898       setOperationAction(ISD::CTPOP,            VT, Custom);
1899     }
1900
1901     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1902       setOperationAction(ISD::ABS,     VT, HasBWI ? Legal : Custom);
1903       setOperationAction(ISD::CTPOP,   VT, Subtarget.hasBITALG() ? Legal : Custom);
1904       setOperationAction(ISD::CTLZ,    VT, Custom);
1905       setOperationAction(ISD::SMAX,    VT, HasBWI ? Legal : Custom);
1906       setOperationAction(ISD::UMAX,    VT, HasBWI ? Legal : Custom);
1907       setOperationAction(ISD::SMIN,    VT, HasBWI ? Legal : Custom);
1908       setOperationAction(ISD::UMIN,    VT, HasBWI ? Legal : Custom);
1909       setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1910       setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1911       setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1912       setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1913     }
1914
1915     setOperationAction(ISD::FSHL,       MVT::v64i8, Custom);
1916     setOperationAction(ISD::FSHR,       MVT::v64i8, Custom);
1917     setOperationAction(ISD::FSHL,      MVT::v32i16, Custom);
1918     setOperationAction(ISD::FSHR,      MVT::v32i16, Custom);
1919     setOperationAction(ISD::FSHL,      MVT::v16i32, Custom);
1920     setOperationAction(ISD::FSHR,      MVT::v16i32, Custom);
1921
1922     if (Subtarget.hasDQI()) {
1923       for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1924                        ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
1925                        ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1926         setOperationAction(Opc,           MVT::v8i64, Custom);
1927       setOperationAction(ISD::MUL,        MVT::v8i64, Legal);
1928     }
1929
1930     if (Subtarget.hasCDI()) {
1931       // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1932       for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1933         setOperationAction(ISD::CTLZ,            VT, Legal);
1934       }
1935     } // Subtarget.hasCDI()
1936
1937     if (Subtarget.hasVPOPCNTDQ()) {
1938       for (auto VT : { MVT::v16i32, MVT::v8i64 })
1939         setOperationAction(ISD::CTPOP, VT, Legal);
1940     }
1941
1942     // Extract subvector is special because the value type
1943     // (result) is 256-bit but the source is 512-bit wide.
1944     // 128-bit was made Legal under AVX1.
1945     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1946                      MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1947       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1948
1949     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1950                      MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1951       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1952       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
1953       setOperationAction(ISD::SELECT,             VT, Custom);
1954       setOperationAction(ISD::VSELECT,            VT, Custom);
1955       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1956       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1957       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1958       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1959       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1960     }
1961     setF16Action(MVT::v32f16, Expand);
1962     setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
1963     setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
1964     setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
1965     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1966     for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1967       setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1968       setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
1969     }
1970
1971     for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1972       setOperationAction(ISD::MLOAD,               VT, Legal);
1973       setOperationAction(ISD::MSTORE,              VT, Legal);
1974       setOperationAction(ISD::MGATHER,             VT, Custom);
1975       setOperationAction(ISD::MSCATTER,            VT, Custom);
1976     }
1977     if (HasBWI) {
1978       for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1979         setOperationAction(ISD::MLOAD,        VT, Legal);
1980         setOperationAction(ISD::MSTORE,       VT, Legal);
1981       }
1982     } else {
1983       setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1984       setOperationAction(ISD::STORE, MVT::v64i8,  Custom);
1985     }
1986
1987     if (Subtarget.hasVBMI2()) {
1988       for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1989                        MVT::v16i16, MVT::v8i32, MVT::v4i64,
1990                        MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1991         setOperationAction(ISD::FSHL, VT, Custom);
1992         setOperationAction(ISD::FSHR, VT, Custom);
1993       }
1994
1995       setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1996       setOperationAction(ISD::ROTR, MVT::v8i16,  Custom);
1997       setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1998       setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1999     }
2000   }// useAVX512Regs
2001
2002   // This block controls legalization for operations that don't have
2003   // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2004   // narrower widths.
2005   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2006     // These operations are handled on non-VLX by artificially widening in
2007     // isel patterns.
2008
2009     setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v8i32, Custom);
2010     setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v4i32, Custom);
2011     setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v2i32, Custom);
2012
2013     if (Subtarget.hasDQI()) {
2014       // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2015       // v2f32 UINT_TO_FP is already custom under SSE2.
2016       assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
2017              isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
2018              "Unexpected operation action!");
2019       // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2020       setOperationAction(ISD::FP_TO_SINT,        MVT::v2f32, Custom);
2021       setOperationAction(ISD::FP_TO_UINT,        MVT::v2f32, Custom);
2022       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
2023       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
2024     }
2025
2026     for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2027       setOperationAction(ISD::SMAX, VT, Legal);
2028       setOperationAction(ISD::UMAX, VT, Legal);
2029       setOperationAction(ISD::SMIN, VT, Legal);
2030       setOperationAction(ISD::UMIN, VT, Legal);
2031       setOperationAction(ISD::ABS,  VT, Legal);
2032     }
2033
2034     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2035       setOperationAction(ISD::ROTL,     VT, Custom);
2036       setOperationAction(ISD::ROTR,     VT, Custom);
2037     }
2038
2039     // Custom legalize 2x32 to get a little better code.
2040     setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2041     setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2042
2043     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2044                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2045       setOperationAction(ISD::MSCATTER, VT, Custom);
2046
2047     if (Subtarget.hasDQI()) {
2048       for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
2049                        ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
2050                        ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
2051         setOperationAction(Opc, MVT::v2i64, Custom);
2052         setOperationAction(Opc, MVT::v4i64, Custom);
2053       }
2054       setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2055       setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2056     }
2057
2058     if (Subtarget.hasCDI()) {
2059       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2060         setOperationAction(ISD::CTLZ,            VT, Legal);
2061       }
2062     } // Subtarget.hasCDI()
2063
2064     if (Subtarget.hasVPOPCNTDQ()) {
2065       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2066         setOperationAction(ISD::CTPOP, VT, Legal);
2067     }
2068   }
2069
2070   // This block controls legalization of v32i1/v64i1 which are available with
2071   // AVX512BW.
2072   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2073     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
2074     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
2075
2076     for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2077       setOperationAction(ISD::VSELECT,            VT, Expand);
2078       setOperationAction(ISD::TRUNCATE,           VT, Custom);
2079       setOperationAction(ISD::SETCC,              VT, Custom);
2080       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2081       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
2082       setOperationAction(ISD::SELECT,             VT, Custom);
2083       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
2084       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
2085       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
2086       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
2087     }
2088
2089     for (auto VT : { MVT::v16i1, MVT::v32i1 })
2090       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
2091
2092     // Extends from v32i1 masks to 256-bit vectors.
2093     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
2094     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
2095     setOperationAction(ISD::ANY_EXTEND,         MVT::v32i8, Custom);
2096
2097     for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2098       setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
2099       setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2100     }
2101
2102     // These operations are handled on non-VLX by artificially widening in
2103     // isel patterns.
2104     // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2105
2106     if (Subtarget.hasBITALG()) {
2107       for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2108         setOperationAction(ISD::CTPOP, VT, Legal);
2109     }
2110   }
2111
2112   if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2113     auto setGroup = [&] (MVT VT) { // Sets the FP16 operation actions common to each type it is called with.
2114       setOperationAction(ISD::FADD,               VT, Legal);
2115       setOperationAction(ISD::STRICT_FADD,        VT, Legal);
2116       setOperationAction(ISD::FSUB,               VT, Legal);
2117       setOperationAction(ISD::STRICT_FSUB,        VT, Legal);
2118       setOperationAction(ISD::FMUL,               VT, Legal);
2119       setOperationAction(ISD::STRICT_FMUL,        VT, Legal);
2120       setOperationAction(ISD::FDIV,               VT, Legal);
2121       setOperationAction(ISD::STRICT_FDIV,        VT, Legal);
2122       setOperationAction(ISD::FSQRT,              VT, Legal);
2123       setOperationAction(ISD::STRICT_FSQRT,       VT, Legal);
2124       // Floor/ceil/trunc/rint/nearbyint, plain and STRICT_, are Legal too.
2125       setOperationAction(ISD::FFLOOR,             VT, Legal);
2126       setOperationAction(ISD::STRICT_FFLOOR,      VT, Legal);
2127       setOperationAction(ISD::FCEIL,              VT, Legal);
2128       setOperationAction(ISD::STRICT_FCEIL,       VT, Legal);
2129       setOperationAction(ISD::FTRUNC,             VT, Legal);
2130       setOperationAction(ISD::STRICT_FTRUNC,      VT, Legal);
2131       setOperationAction(ISD::FRINT,              VT, Legal);
2132       setOperationAction(ISD::STRICT_FRINT,       VT, Legal);
2133       setOperationAction(ISD::FNEARBYINT,         VT, Legal);
2134       setOperationAction(ISD::STRICT_FNEARBYINT,  VT, Legal);
2135       // FROUND is the one rounding op in this group marked Custom, not Legal.
2136       setOperationAction(ISD::FROUND,             VT, Custom);
2137       // Plain loads and stores of VT are Legal.
2138       setOperationAction(ISD::LOAD,               VT, Legal);
2139       setOperationAction(ISD::STORE,              VT, Legal);
2140       // FMA and VSELECT are Legal; BUILD_VECTOR and SELECT are Custom.
2141       setOperationAction(ISD::FMA,                VT, Legal);
2142       setOperationAction(ISD::STRICT_FMA,         VT, Legal);
2143       setOperationAction(ISD::VSELECT,            VT, Legal);
2144       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
2145       setOperationAction(ISD::SELECT,             VT, Custom);
2146       // Sign manipulation, element extraction and shuffles are Custom.
2147       setOperationAction(ISD::FNEG,               VT, Custom);
2148       setOperationAction(ISD::FABS,               VT, Custom);
2149       setOperationAction(ISD::FCOPYSIGN,          VT, Custom);
2150       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2151       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
2152       // Comparisons, including both strict variants, are Custom.
2153       setOperationAction(ISD::SETCC,              VT, Custom);
2154       setOperationAction(ISD::STRICT_FSETCC,      VT, Custom);
2155       setOperationAction(ISD::STRICT_FSETCCS,     VT, Custom);
2156     };
2157
2158     // AVX512_FP16 scalar operations
2159     setGroup(MVT::f16);
2160     setOperationAction(ISD::FREM,                 MVT::f16, Promote);
2161     setOperationAction(ISD::STRICT_FREM,          MVT::f16, Promote);
2162     setOperationAction(ISD::SELECT_CC,            MVT::f16, Expand);
2163     setOperationAction(ISD::BR_CC,                MVT::f16, Expand);
2164     setOperationAction(ISD::STRICT_FROUND,        MVT::f16, Promote);
2165     setOperationAction(ISD::FROUNDEVEN,           MVT::f16, Legal);
2166     setOperationAction(ISD::STRICT_FROUNDEVEN,    MVT::f16, Legal);
2167     setOperationAction(ISD::FP_ROUND,             MVT::f16, Custom);
2168     setOperationAction(ISD::STRICT_FP_ROUND,      MVT::f16, Custom);
2169     setOperationAction(ISD::FMAXIMUM,             MVT::f16, Custom);
2170     setOperationAction(ISD::FMINIMUM,             MVT::f16, Custom);
2171     setOperationAction(ISD::FP_EXTEND,            MVT::f32, Legal);
2172     setOperationAction(ISD::STRICT_FP_EXTEND,     MVT::f32, Legal);
2173
2174     setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
2175     setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
2176
2177     if (Subtarget.useAVX512Regs()) {
2178       setGroup(MVT::v32f16);
2179       setOperationAction(ISD::SCALAR_TO_VECTOR,       MVT::v32f16, Custom);
2180       setOperationAction(ISD::SINT_TO_FP,             MVT::v32i16, Legal);
2181       setOperationAction(ISD::STRICT_SINT_TO_FP,      MVT::v32i16, Legal);
2182       setOperationAction(ISD::UINT_TO_FP,             MVT::v32i16, Legal);
2183       setOperationAction(ISD::STRICT_UINT_TO_FP,      MVT::v32i16, Legal);
2184       setOperationAction(ISD::FP_ROUND,               MVT::v16f16, Legal);
2185       setOperationAction(ISD::STRICT_FP_ROUND,        MVT::v16f16, Legal);
2186       setOperationAction(ISD::FP_EXTEND,              MVT::v16f32, Legal);
2187       setOperationAction(ISD::STRICT_FP_EXTEND,       MVT::v16f32, Legal);
2188       setOperationAction(ISD::FP_EXTEND,              MVT::v8f64,  Legal);
2189       setOperationAction(ISD::STRICT_FP_EXTEND,       MVT::v8f64,  Legal);
2190       setOperationAction(ISD::INSERT_VECTOR_ELT,      MVT::v32f16, Custom);
2191
2192       setOperationAction(ISD::FP_TO_SINT,             MVT::v32i16, Custom);
2193       setOperationAction(ISD::STRICT_FP_TO_SINT,      MVT::v32i16, Custom);
2194       setOperationAction(ISD::FP_TO_UINT,             MVT::v32i16, Custom);
2195       setOperationAction(ISD::STRICT_FP_TO_UINT,      MVT::v32i16, Custom);
2196       setOperationPromotedToType(ISD::FP_TO_SINT,     MVT::v32i8,  MVT::v32i16);
2197       setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2198                                  MVT::v32i16);
2199       setOperationPromotedToType(ISD::FP_TO_UINT,     MVT::v32i8,  MVT::v32i16);
2200       setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2201                                  MVT::v32i16);
2202       setOperationPromotedToType(ISD::FP_TO_SINT,     MVT::v32i1,  MVT::v32i16);
2203       setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2204                                  MVT::v32i16);
2205       setOperationPromotedToType(ISD::FP_TO_UINT,     MVT::v32i1,  MVT::v32i16);
2206       setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2207                                  MVT::v32i16);
2208
2209       setOperationAction(ISD::EXTRACT_SUBVECTOR,      MVT::v16f16, Legal);
2210       setOperationAction(ISD::INSERT_SUBVECTOR,       MVT::v32f16, Legal);
2211       setOperationAction(ISD::CONCAT_VECTORS,         MVT::v32f16, Custom);
2212
2213       setLoadExtAction(ISD::EXTLOAD, MVT::v8f64,  MVT::v8f16,  Legal);
2214       setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2215     }
2216
2217     if (Subtarget.hasVLX()) {
2218       setGroup(MVT::v8f16);
2219       setGroup(MVT::v16f16);
2220
2221       setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8f16,  Legal);
2222       setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16f16, Custom);
2223       setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Legal);
2224       setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v16i16, Legal);
2225       setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16,  Legal);
2226       setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v8i16,  Legal);
2227       setOperationAction(ISD::UINT_TO_FP,         MVT::v16i16, Legal);
2228       setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v16i16, Legal);
2229       setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16,  Legal);
2230       setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v8i16,  Legal);
2231
2232       setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Custom);
2233       setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v8i16, Custom);
2234       setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Custom);
2235       setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v8i16, Custom);
2236       setOperationAction(ISD::FP_ROUND,           MVT::v8f16, Legal);
2237       setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v8f16, Legal);
2238       setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
2239       setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v8f32, Legal);
2240       setOperationAction(ISD::FP_EXTEND,          MVT::v4f64, Legal);
2241       setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v4f64, Legal);
2242
2243       // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2244       setOperationAction(ISD::INSERT_VECTOR_ELT,    MVT::v8f16,  Custom);
2245       setOperationAction(ISD::INSERT_VECTOR_ELT,    MVT::v16f16, Custom);
2246
2247       setOperationAction(ISD::EXTRACT_SUBVECTOR,    MVT::v8f16, Legal);
2248       setOperationAction(ISD::INSERT_SUBVECTOR,     MVT::v16f16, Legal);
2249       setOperationAction(ISD::CONCAT_VECTORS,       MVT::v16f16, Custom);
2250
2251       setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2252       setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2253       setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2254       setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2255
2256       // Need to custom widen these to prevent scalarization.
2257       setOperationAction(ISD::LOAD,  MVT::v4f16, Custom);
2258       setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2259     }
2260   }
2261
2262   if (!Subtarget.useSoftFloat() &&
2263       (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2264     addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass);
2265     addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass);
2266     // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2267     // provide the method to promote BUILD_VECTOR. Set the operation action
2268     // Custom to do the customization later.
2269     setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom);
2270     for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2271       setF16Action(VT, Expand);
2272       setOperationAction(ISD::FADD, VT, Expand);
2273       setOperationAction(ISD::FSUB, VT, Expand);
2274       setOperationAction(ISD::FMUL, VT, Expand);
2275       setOperationAction(ISD::FDIV, VT, Expand);
2276       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2277       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2278     }
2279     addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2280   }
2281
2282   if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2283     addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2284     setF16Action(MVT::v32bf16, Expand);
2285     setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
2286     setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
2287     setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
2288     setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
2289     setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
2290     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
2291   }
2292
2293   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2294     setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
2295     setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2296     setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2297     setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
2298     setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2299
2300     setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
2301     setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2302     setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2303     setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
2304     setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2305
2306     if (Subtarget.hasBWI()) {
2307       setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
2308       setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
2309     }
2310
2311     if (Subtarget.hasFP16()) {
2312       // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2313       setOperationAction(ISD::FP_TO_SINT,        MVT::v2f16, Custom);
2314       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2315       setOperationAction(ISD::FP_TO_UINT,        MVT::v2f16, Custom);
2316       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2317       setOperationAction(ISD::FP_TO_SINT,        MVT::v4f16, Custom);
2318       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2319       setOperationAction(ISD::FP_TO_UINT,        MVT::v4f16, Custom);
2320       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2321       // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2322       setOperationAction(ISD::SINT_TO_FP,        MVT::v2f16, Custom);
2323       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2324       setOperationAction(ISD::UINT_TO_FP,        MVT::v2f16, Custom);
2325       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2326       setOperationAction(ISD::SINT_TO_FP,        MVT::v4f16, Custom);
2327       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2328       setOperationAction(ISD::UINT_TO_FP,        MVT::v4f16, Custom);
2329       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2330       // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2331       setOperationAction(ISD::FP_ROUND,          MVT::v2f16, Custom);
2332       setOperationAction(ISD::STRICT_FP_ROUND,   MVT::v2f16, Custom);
2333       setOperationAction(ISD::FP_ROUND,          MVT::v4f16, Custom);
2334       setOperationAction(ISD::STRICT_FP_ROUND,   MVT::v4f16, Custom);
2335       // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2336       setOperationAction(ISD::FP_EXTEND,         MVT::v2f16, Custom);
2337       setOperationAction(ISD::STRICT_FP_EXTEND,  MVT::v2f16, Custom);
2338       setOperationAction(ISD::FP_EXTEND,         MVT::v4f16, Custom);
2339       setOperationAction(ISD::STRICT_FP_EXTEND,  MVT::v4f16, Custom);
2340     }
2341
2342     setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2343     setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2344     setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2345   }
2346
2347   if (Subtarget.hasAMXTILE()) {
2348     addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2349   }
2350
2351   // We want to custom lower some of our intrinsics.
2352   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2353   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2354   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2355   if (!Subtarget.is64Bit()) {
2356     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2357   }
2358
2359   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2360   // handle type legalization for these operations here.
2361   //
2362   // FIXME: We really should do custom legalization for addition and
2363   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
2364   // than generic legalization for 64-bit multiplication-with-overflow, though.
2365   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2366     if (VT == MVT::i64 && !Subtarget.is64Bit())
2367       continue;
2368     // Add/Sub/Mul with overflow operations are custom lowered.
2369     setOperationAction(ISD::SADDO, VT, Custom);
2370     setOperationAction(ISD::UADDO, VT, Custom);
2371     setOperationAction(ISD::SSUBO, VT, Custom);
2372     setOperationAction(ISD::USUBO, VT, Custom);
2373     setOperationAction(ISD::SMULO, VT, Custom);
2374     setOperationAction(ISD::UMULO, VT, Custom);
2375
2376     // Support carry in as value rather than glue.
2377     setOperationAction(ISD::UADDO_CARRY, VT, Custom);
2378     setOperationAction(ISD::USUBO_CARRY, VT, Custom);
2379     setOperationAction(ISD::SETCCCARRY, VT, Custom);
2380     setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2381     setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2382   }
2383
2384   if (!Subtarget.is64Bit()) {
2385     // These libcalls are not available in 32-bit.
2386     setLibcallName(RTLIB::SHL_I128, nullptr);
2387     setLibcallName(RTLIB::SRL_I128, nullptr);
2388     setLibcallName(RTLIB::SRA_I128, nullptr);
2389     setLibcallName(RTLIB::MUL_I128, nullptr);
2390     // The MULO libcall is not part of libgcc, only compiler-rt.
2391     setLibcallName(RTLIB::MULO_I64, nullptr);
2392   }
2393   // The MULO libcall is not part of libgcc, only compiler-rt.
2394   setLibcallName(RTLIB::MULO_I128, nullptr);
2395
2396   // Combine sin / cos into _sincos_stret if it is available.
2397   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2398       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2399     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2400     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2401   }
2402
2403   if (Subtarget.isTargetWin64()) {
2404     setOperationAction(ISD::SDIV, MVT::i128, Custom);
2405     setOperationAction(ISD::UDIV, MVT::i128, Custom);
2406     setOperationAction(ISD::SREM, MVT::i128, Custom);
2407     setOperationAction(ISD::UREM, MVT::i128, Custom);
2408     setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2409     setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2410     setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2411     setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2412     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2413     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2414     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2415     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2416   }
2417
2418   // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2419   // is. We should promote the value to 64-bits to solve this.
2420   // This is what the CRT headers do - `fmodf` is an inline header
2421   // function casting to f64 and calling `fmod`.
2422   if (Subtarget.is32Bit() &&
2423       (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2424     for (ISD::NodeType Op :
2425          {ISD::FCEIL,  ISD::STRICT_FCEIL,
2426           ISD::FCOS,   ISD::STRICT_FCOS,
2427           ISD::FEXP,   ISD::STRICT_FEXP,
2428           ISD::FFLOOR, ISD::STRICT_FFLOOR,
2429           ISD::FREM,   ISD::STRICT_FREM,
2430           ISD::FLOG,   ISD::STRICT_FLOG,
2431           ISD::FLOG10, ISD::STRICT_FLOG10,
2432           ISD::FPOW,   ISD::STRICT_FPOW,
2433           ISD::FSIN,   ISD::STRICT_FSIN})
2434       if (isOperationExpand(Op, MVT::f32))
2435         setOperationAction(Op, MVT::f32, Promote);
2436
2437   // We have target-specific dag combine patterns for the following nodes:
2438   setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2439                        ISD::SCALAR_TO_VECTOR,
2440                        ISD::INSERT_VECTOR_ELT,
2441                        ISD::EXTRACT_VECTOR_ELT,
2442                        ISD::CONCAT_VECTORS,
2443                        ISD::INSERT_SUBVECTOR,
2444                        ISD::EXTRACT_SUBVECTOR,
2445                        ISD::BITCAST,
2446                        ISD::VSELECT,
2447                        ISD::SELECT,
2448                        ISD::SHL,
2449                        ISD::SRA,
2450                        ISD::SRL,
2451                        ISD::OR,
2452                        ISD::AND,
2453                        ISD::ADD,
2454                        ISD::FADD,
2455                        ISD::FSUB,
2456                        ISD::FNEG,
2457                        ISD::FMA,
2458                        ISD::STRICT_FMA,
2459                        ISD::FMINNUM,
2460                        ISD::FMAXNUM,
2461                        ISD::SUB,
2462                        ISD::LOAD,
2463                        ISD::MLOAD,
2464                        ISD::STORE,
2465                        ISD::MSTORE,
2466                        ISD::TRUNCATE,
2467                        ISD::ZERO_EXTEND,
2468                        ISD::ANY_EXTEND,
2469                        ISD::SIGN_EXTEND,
2470                        ISD::SIGN_EXTEND_INREG,
2471                        ISD::ANY_EXTEND_VECTOR_INREG,
2472                        ISD::SIGN_EXTEND_VECTOR_INREG,
2473                        ISD::ZERO_EXTEND_VECTOR_INREG,
2474                        ISD::SINT_TO_FP,
2475                        ISD::UINT_TO_FP,
2476                        ISD::STRICT_SINT_TO_FP,
2477                        ISD::STRICT_UINT_TO_FP,
2478                        ISD::SETCC,
2479                        ISD::MUL,
2480                        ISD::XOR,
2481                        ISD::MSCATTER,
2482                        ISD::MGATHER,
2483                        ISD::FP16_TO_FP,
2484                        ISD::FP_EXTEND,
2485                        ISD::STRICT_FP_EXTEND,
2486                        ISD::FP_ROUND,
2487                        ISD::STRICT_FP_ROUND});
2488
2489   computeRegisterProperties(Subtarget.getRegisterInfo());
2490
2491   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2492   MaxStoresPerMemsetOptSize = 8;
2493   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2494   MaxStoresPerMemcpyOptSize = 4;
2495   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2496   MaxStoresPerMemmoveOptSize = 4;
2497
2498   // TODO: These control memcmp expansion in CGP and could be raised higher, but
2499   // that needs to benchmarked and balanced with the potential use of vector
2500   // load/store types (PR33329, PR33914).
2501   MaxLoadsPerMemcmp = 2;
2502   MaxLoadsPerMemcmpOptSize = 2;
2503
2504   // Default loop alignment, which can be overridden by -align-loops.
2505   setPrefLoopAlignment(Align(16));
2506
2507   // An out-of-order CPU can speculatively execute past a predictable branch,
2508   // but a conditional move could be stalled by an expensive earlier operation.
2509   PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2510   EnableExtLdPromotion = true;
2511   setPrefFunctionAlignment(Align(16));
2512
2513   verifyIntrinsicTables();
2514
2515   // Default to having -disable-strictnode-mutation on
2516   IsStrictFPEnabled = true;
2517 }
2518
2519 // This has so far only been implemented for 64-bit MachO.
2520 bool X86TargetLowering::useLoadStackGuardNode() const {
2521   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2522 }
2523
2524 bool X86TargetLowering::useStackGuardXorFP() const {
2525   // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2526   return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2527 }
2528
2529 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2530                                                const SDLoc &DL) const {
2531   EVT PtrTy = getPointerTy(DAG.getDataLayout());
2532   unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2533   MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2534   return SDValue(Node, 0);
2535 }
2536
2537 TargetLoweringBase::LegalizeTypeAction
2538 X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2539   if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2540       !Subtarget.hasBWI())
2541     return TypeSplitVector;
2542
2543   if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2544       !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2545     return TypeSplitVector;
2546
2547   if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2548       VT.getVectorElementType() != MVT::i1)
2549     return TypeWidenVector;
2550
2551   return TargetLoweringBase::getPreferredVectorAction(VT);
2552 }
2553
2554 static std::pair<MVT, unsigned>
2555 handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2556                                  const X86Subtarget &Subtarget) {
2557   // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2558   // convention is one that uses k registers.
2559   if (NumElts == 2)
2560     return {MVT::v2i64, 1};
2561   if (NumElts == 4)
2562     return {MVT::v4i32, 1};
2563   if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2564       CC != CallingConv::Intel_OCL_BI)
2565     return {MVT::v8i16, 1};
2566   if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2567       CC != CallingConv::Intel_OCL_BI)
2568     return {MVT::v16i8, 1};
2569   // v32i1 passes in ymm unless we have BWI and the calling convention is
2570   // regcall.
2571   if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2572     return {MVT::v32i8, 1};
2573   // Split v64i1 vectors if we don't have v64i8 available.
2574   if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2575     if (Subtarget.useAVX512Regs())
2576       return {MVT::v64i8, 1};
2577     return {MVT::v32i8, 2};
2578   }
2579
2580   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2581   if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2582       NumElts > 64)
2583     return {MVT::i8, NumElts};
2584
2585   return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2586 }
2587
2588 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2589                                                      CallingConv::ID CC,
2590                                                      EVT VT) const {
2591   if (VT.isVector()) {
2592     if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2593       unsigned NumElts = VT.getVectorNumElements();
2594
2595       MVT RegisterVT;
2596       unsigned NumRegisters;
2597       std::tie(RegisterVT, NumRegisters) =
2598           handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2599       if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2600         return RegisterVT;
2601     }
2602
2603     if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2604       return MVT::v8f16;
2605   }
2606
2607   // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
2608   if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
2609       !Subtarget.hasX87())
2610     return MVT::i32;
2611
2612   if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2613     return getRegisterTypeForCallingConv(Context, CC,
2614                                          VT.changeVectorElementType(MVT::f16));
2615
2616   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2617 }
2618
2619 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2620                                                           CallingConv::ID CC,
2621                                                           EVT VT) const {
2622   if (VT.isVector()) {
2623     if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2624       unsigned NumElts = VT.getVectorNumElements();
2625
2626       MVT RegisterVT;
2627       unsigned NumRegisters;
2628       std::tie(RegisterVT, NumRegisters) =
2629           handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2630       if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2631         return NumRegisters;
2632     }
2633
2634     if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2635       return 1;
2636   }
2637
2638   // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
2639   // x87 is disabled.
2640   if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
2641     if (VT == MVT::f64)
2642       return 2;
2643     if (VT == MVT::f80)
2644       return 3;
2645   }
2646
2647   if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2648     return getNumRegistersForCallingConv(Context, CC,
2649                                          VT.changeVectorElementType(MVT::f16));
2650
2651   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2652 }
2653
2654 unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2655     LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2656     unsigned &NumIntermediates, MVT &RegisterVT) const {
2657   // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2658   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2659       Subtarget.hasAVX512() &&
2660       (!isPowerOf2_32(VT.getVectorNumElements()) ||
2661        (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2662        VT.getVectorNumElements() > 64)) {
2663     RegisterVT = MVT::i8;
2664     IntermediateVT = MVT::i1;
2665     NumIntermediates = VT.getVectorNumElements();
2666     return NumIntermediates;
2667   }
2668
2669   // Split v64i1 vectors if we don't have v64i8 available.
2670   if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2671       CC != CallingConv::X86_RegCall) {
2672     RegisterVT = MVT::v32i8;
2673     IntermediateVT = MVT::v32i1;
2674     NumIntermediates = 2;
2675     return 2;
2676   }
2677
2678   // Split vNbf16 vectors according to vNf16.
2679   if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2680     VT = VT.changeVectorElementType(MVT::f16);
2681
2682   return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2683                                               NumIntermediates, RegisterVT);
2684 }
2685
2686 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2687                                           LLVMContext& Context,
2688                                           EVT VT) const {
2689   if (!VT.isVector())
2690     return MVT::i8;
2691
2692   if (Subtarget.hasAVX512()) {
2693     // Figure out what this type will be legalized to.
2694     EVT LegalVT = VT;
2695     while (getTypeAction(Context, LegalVT) != TypeLegal)
2696       LegalVT = getTypeToTransformTo(Context, LegalVT);
2697
2698     // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2699     if (LegalVT.getSimpleVT().is512BitVector())
2700       return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2701
2702     if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2703       // If we legalized to less than a 512-bit vector, then we will use a vXi1
2704       // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2705       // vXi16/vXi8.
2706       MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2707       if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2708         return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2709     }
2710   }
2711
2712   return VT.changeVectorElementTypeToInteger();
2713 }
2714
2715 /// Helper for getByValTypeAlignment to determine
2716 /// the desired ByVal argument alignment.
2717 static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2718   if (MaxAlign == 16)
2719     return;
2720   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2721     if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
2722       MaxAlign = Align(16);
2723   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2724     Align EltAlign;
2725     getMaxByValAlign(ATy->getElementType(), EltAlign);
2726     if (EltAlign > MaxAlign)
2727       MaxAlign = EltAlign;
2728   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2729     for (auto *EltTy : STy->elements()) {
2730       Align EltAlign;
2731       getMaxByValAlign(EltTy, EltAlign);
2732       if (EltAlign > MaxAlign)
2733         MaxAlign = EltAlign;
2734       if (MaxAlign == 16)
2735         break;
2736     }
2737   }
2738 }
2739
2740 /// Return the desired alignment for ByVal aggregate
2741 /// function arguments in the caller parameter area. For X86, aggregates
2742 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
2743 /// are at 4-byte boundaries.
2744 uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
2745                                                   const DataLayout &DL) const {
2746   if (Subtarget.is64Bit()) {
2747     // Max of 8 and alignment of type.
2748     Align TyAlign = DL.getABITypeAlign(Ty);
2749     if (TyAlign > 8)
2750       return TyAlign.value();
2751     return 8;
2752   }
2753
2754   Align Alignment(4);
2755   if (Subtarget.hasSSE1())
2756     getMaxByValAlign(Ty, Alignment);
2757   return Alignment.value();
2758 }
2759
2760 /// It returns EVT::Other if the type should be determined using generic
2761 /// target-independent logic.
2762 /// For vector ops we check that the overall size isn't larger than our
2763 /// preferred vector width.
2764 EVT X86TargetLowering::getOptimalMemOpType(
2765     const MemOp &Op, const AttributeList &FuncAttributes) const {
2766   if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2767     if (Op.size() >= 16 &&
2768         (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2769       // FIXME: Check if unaligned 64-byte accesses are slow.
2770       if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2771           (Subtarget.getPreferVectorWidth() >= 512)) {
2772         return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2773       }
2774       // FIXME: Check if unaligned 32-byte accesses are slow.
2775       if (Op.size() >= 32 && Subtarget.hasAVX() &&
2776           Subtarget.useLight256BitInstructions()) {
2777         // Although this isn't a well-supported type for AVX1, we'll let
2778         // legalization and shuffle lowering produce the optimal codegen. If we
2779         // choose an optimal type with a vector element larger than a byte,
2780         // getMemsetStores() may create an intermediate splat (using an integer
2781         // multiply) before we splat as a vector.
2782         return MVT::v32i8;
2783       }
2784       if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2785         return MVT::v16i8;
2786       // TODO: Can SSE1 handle a byte vector?
2787       // If we have SSE1 registers we should be able to use them.
2788       if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2789           (Subtarget.getPreferVectorWidth() >= 128))
2790         return MVT::v4f32;
2791     } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2792                Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2793       // Do not use f64 to lower memcpy if source is string constant. It's
2794       // better to use i32 to avoid the loads.
2795       // Also, do not use f64 to lower memset unless this is a memset of zeros.
2796       // The gymnastics of splatting a byte value into an XMM register and then
2797       // only using 8-byte stores (because this is a CPU with slow unaligned
2798       // 16-byte accesses) makes that a loser.
2799       return MVT::f64;
2800     }
2801   }
2802   // This is a compromise. If we reach here, unaligned accesses may be slow on
2803   // this target. However, creating smaller, aligned accesses could be even
2804   // slower and would certainly be a lot more code.
2805   if (Subtarget.is64Bit() && Op.size() >= 8)
2806     return MVT::i64;
2807   return MVT::i32;
2808 }
2809
2810 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2811   if (VT == MVT::f32)
2812     return Subtarget.hasSSE1();
2813   if (VT == MVT::f64)
2814     return Subtarget.hasSSE2();
2815   return true;
2816 }
2817
2818 static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
2819   return (8 * Alignment.value()) % SizeInBits == 0;
2820 }
2821
2822 bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
2823   if (isBitAligned(Alignment, VT.getSizeInBits()))
2824     return true;
2825   switch (VT.getSizeInBits()) {
2826   default:
2827     // 8-byte and under are always assumed to be fast.
2828     return true;
2829   case 128:
2830     return !Subtarget.isUnalignedMem16Slow();
2831   case 256:
2832     return !Subtarget.isUnalignedMem32Slow();
2833     // TODO: What about AVX-512 (512-bit) accesses?
2834   }
2835 }
2836
2837 bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2838     EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2839     unsigned *Fast) const {
2840   if (Fast)
2841     *Fast = isMemoryAccessFast(VT, Alignment);
2842   // NonTemporal vector memory ops must be aligned.
2843   if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2844     // NT loads can only be vector aligned, so if its less aligned than the
2845     // minimum vector size (which we can split the vector down to), we might as
2846     // well use a regular unaligned vector load.
2847     // We don't have any NT loads pre-SSE41.
2848     if (!!(Flags & MachineMemOperand::MOLoad))
2849       return (Alignment < 16 || !Subtarget.hasSSE41());
2850     return false;
2851   }
2852   // Misaligned accesses of any size are always allowed.
2853   return true;
2854 }
2855
2856 bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
2857                                            const DataLayout &DL, EVT VT,
2858                                            unsigned AddrSpace, Align Alignment,
2859                                            MachineMemOperand::Flags Flags,
2860                                            unsigned *Fast) const {
2861   if (Fast)
2862     *Fast = isMemoryAccessFast(VT, Alignment);
2863   if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2864     if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
2865                                        /*Fast=*/nullptr))
2866       return true;
2867     // NonTemporal vector memory ops are special, and must be aligned.
2868     if (!isBitAligned(Alignment, VT.getSizeInBits()))
2869       return false;
2870     switch (VT.getSizeInBits()) {
2871     case 128:
2872       if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
2873         return true;
2874       if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
2875         return true;
2876       return false;
2877     case 256:
2878       if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
2879         return true;
2880       if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
2881         return true;
2882       return false;
2883     case 512:
2884       if (Subtarget.hasAVX512())
2885         return true;
2886       return false;
2887     default:
2888       return false; // Don't have NonTemporal vector memory ops of this size.
2889     }
2890   }
2891   return true;
2892 }
2893
2894 /// Return the entry encoding for a jump table in the
2895 /// current function.  The returned value is a member of the
2896 /// MachineJumpTableInfo::JTEntryKind enum.
2897 unsigned X86TargetLowering::getJumpTableEncoding() const {
2898   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2899   // symbol.
2900   if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2901     return MachineJumpTableInfo::EK_Custom32;
2902
2903   // Otherwise, use the normal jump table encoding heuristics.
2904   return TargetLowering::getJumpTableEncoding();
2905 }
2906
2907 bool X86TargetLowering::splitValueIntoRegisterParts(
2908     SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
2909     unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
2910   bool IsABIRegCopy = CC.has_value();
2911   EVT ValueVT = Val.getValueType();
2912   if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2913     unsigned ValueBits = ValueVT.getSizeInBits();
2914     unsigned PartBits = PartVT.getSizeInBits();
2915     Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
2916     Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
2917     Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
2918     Parts[0] = Val;
2919     return true;
2920   }
2921   return false;
2922 }
2923
2924 SDValue X86TargetLowering::joinRegisterPartsIntoValue(
2925     SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
2926     MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
2927   bool IsABIRegCopy = CC.has_value();
2928   if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2929     unsigned ValueBits = ValueVT.getSizeInBits();
2930     unsigned PartBits = PartVT.getSizeInBits();
2931     SDValue Val = Parts[0];
2932
2933     Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
2934     Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
2935     Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
2936     return Val;
2937   }
2938   return SDValue();
2939 }
2940
2941 bool X86TargetLowering::useSoftFloat() const {
2942   return Subtarget.useSoftFloat();
2943 }
2944
2945 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2946                                               ArgListTy &Args) const {
2947
2948   // Only relabel X86-32 for C / Stdcall CCs.
2949   if (Subtarget.is64Bit())
2950     return;
2951   if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2952     return;
2953   unsigned ParamRegs = 0;
2954   if (auto *M = MF->getFunction().getParent())
2955     ParamRegs = M->getNumberRegisterParameters();
2956
2957   // Mark the first N int arguments as having reg
2958   for (auto &Arg : Args) {
2959     Type *T = Arg.Ty;
2960     if (T->isIntOrPtrTy())
2961       if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2962         unsigned numRegs = 1;
2963         if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2964           numRegs = 2;
2965         if (ParamRegs < numRegs)
2966           return;
2967         ParamRegs -= numRegs;
2968         Arg.IsInReg = true;
2969       }
2970   }
2971 }
2972
2973 const MCExpr *
2974 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2975                                              const MachineBasicBlock *MBB,
2976                                              unsigned uid,MCContext &Ctx) const{
2977   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2978   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2979   // entries.
2980   return MCSymbolRefExpr::create(MBB->getSymbol(),
2981                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
2982 }
2983
2984 /// Returns relocation base for the given PIC jumptable.
2985 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2986                                                     SelectionDAG &DAG) const {
2987   if (!Subtarget.is64Bit())
2988     // This doesn't have SDLoc associated with it, but is not really the
2989     // same as a Register.
2990     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2991                        getPointerTy(DAG.getDataLayout()));
2992   return Table;
2993 }
2994
2995 /// This returns the relocation base for the given PIC jumptable,
2996 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
2997 const MCExpr *X86TargetLowering::
2998 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2999                              MCContext &Ctx) const {
3000   // X86-64 uses RIP relative addressing based on the jump table label.
3001   if (Subtarget.isPICStyleRIPRel())
3002     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
3003
3004   // Otherwise, the reference is relative to the PIC base.
3005   return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
3006 }
3007
3008 std::pair<const TargetRegisterClass *, uint8_t>
3009 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
3010                                            MVT VT) const {
3011   const TargetRegisterClass *RRC = nullptr;
3012   uint8_t Cost = 1;
3013   switch (VT.SimpleTy) {
3014   default:
3015     return TargetLowering::findRepresentativeClass(TRI, VT);
3016   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
3017     RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
3018     break;
3019   case MVT::x86mmx:
3020     RRC = &X86::VR64RegClass;
3021     break;
3022   case MVT::f32: case MVT::f64:
3023   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
3024   case MVT::v4f32: case MVT::v2f64:
3025   case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
3026   case MVT::v8f32: case MVT::v4f64:
3027   case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
3028   case MVT::v16f32: case MVT::v8f64:
3029     RRC = &X86::VR128XRegClass;
3030     break;
3031   }
3032   return std::make_pair(RRC, Cost);
3033 }
3034
3035 unsigned X86TargetLowering::getAddressSpace() const {
3036   if (Subtarget.is64Bit())
3037     return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
3038   return 256;
3039 }
3040
3041 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
3042   return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
3043          (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
3044 }
3045
3046 static Constant* SegmentOffset(IRBuilderBase &IRB,
3047                                int Offset, unsigned AddressSpace) {
3048   return ConstantExpr::getIntToPtr(
3049       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
3050       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
3051 }
3052
3053 Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
3054   // glibc, bionic, and Fuchsia have a special slot for the stack guard in
3055   // tcbhead_t; use it instead of the usual global variable (see
3056   // sysdeps/{i386,x86_64}/nptl/tls.h)
3057   if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
3058     if (Subtarget.isTargetFuchsia()) {
3059       // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
3060       return SegmentOffset(IRB, 0x10, getAddressSpace());
3061     } else {
3062       unsigned AddressSpace = getAddressSpace();
3063       Module *M = IRB.GetInsertBlock()->getParent()->getParent();
3064       // Specially, some users may customize the base reg and offset.
3065       int Offset = M->getStackProtectorGuardOffset();
3066       // If we don't set -stack-protector-guard-offset value:
3067       // %fs:0x28, unless we're using a Kernel code model, in which case
3068       // it's %gs:0x28.  gs:0x14 on i386.
3069       if (Offset == INT_MAX)
3070         Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
3071
3072       StringRef GuardReg = M->getStackProtectorGuardReg();
3073       if (GuardReg == "fs")
3074         AddressSpace = X86AS::FS;
3075       else if (GuardReg == "gs")
3076         AddressSpace = X86AS::GS;
3077
3078       // Use symbol guard if user specify.
3079       StringRef GuardSymb = M->getStackProtectorGuardSymbol();
3080       if (!GuardSymb.empty()) {
3081         GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
3082         if (!GV) {
3083           Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
3084                                          : Type::getInt32Ty(M->getContext());
3085           GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
3086                                   nullptr, GuardSymb, nullptr,
3087                                   GlobalValue::NotThreadLocal, AddressSpace);
3088           if (!Subtarget.isTargetDarwin())
3089             GV->setDSOLocal(M->getDirectAccessExternalData());
3090         }
3091         return GV;
3092       }
3093
3094       return SegmentOffset(IRB, Offset, AddressSpace);
3095     }
3096   }
3097   return TargetLowering::getIRStackGuard(IRB);
3098 }
3099
3100 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
3101   // MSVC CRT provides functionalities for stack protection.
3102   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3103       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3104     // MSVC CRT has a global variable holding security cookie.
3105     M.getOrInsertGlobal("__security_cookie",
3106                         Type::getInt8PtrTy(M.getContext()));
3107
3108     // MSVC CRT has a function to validate security cookie.
3109     FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
3110         "__security_check_cookie", Type::getVoidTy(M.getContext()),
3111         Type::getInt8PtrTy(M.getContext()));
3112     if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
3113       F->setCallingConv(CallingConv::X86_FastCall);
3114       F->addParamAttr(0, Attribute::AttrKind::InReg);
3115     }
3116     return;
3117   }
3118
3119   StringRef GuardMode = M.getStackProtectorGuard();
3120
3121   // glibc, bionic, and Fuchsia have a special slot for the stack guard.
3122   if ((GuardMode == "tls" || GuardMode.empty()) &&
3123       hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
3124     return;
3125   TargetLowering::insertSSPDeclarations(M);
3126 }
3127
3128 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
3129   // MSVC CRT has a global variable holding security cookie.
3130   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3131       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3132     return M.getGlobalVariable("__security_cookie");
3133   }
3134   return TargetLowering::getSDagStackGuard(M);
3135 }
3136
3137 Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
3138   // MSVC CRT has a function to validate security cookie.
3139   if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3140       Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3141     return M.getFunction("__security_check_cookie");
3142   }
3143   return TargetLowering::getSSPStackGuardCheck(M);
3144 }
3145
3146 Value *
3147 X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
3148   if (Subtarget.getTargetTriple().isOSContiki())
3149     return getDefaultSafeStackPointerLocation(IRB, false);
3150
3151   // Android provides a fixed TLS slot for the SafeStack pointer. See the
3152   // definition of TLS_SLOT_SAFESTACK in
3153   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
3154   if (Subtarget.isTargetAndroid()) {
3155     // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
3156     // %gs:0x24 on i386
3157     int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
3158     return SegmentOffset(IRB, Offset, getAddressSpace());
3159   }
3160
3161   // Fuchsia is similar.
3162   if (Subtarget.isTargetFuchsia()) {
3163     // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
3164     return SegmentOffset(IRB, 0x18, getAddressSpace());
3165   }
3166
3167   return TargetLowering::getSafeStackPointerLocation(IRB);
3168 }
3169
3170 //===----------------------------------------------------------------------===//
3171 //               Return Value Calling Convention Implementation
3172 //===----------------------------------------------------------------------===//
3173
3174 bool X86TargetLowering::CanLowerReturn(
3175     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3176     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3177   SmallVector<CCValAssign, 16> RVLocs;
3178   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3179   return CCInfo.CheckReturn(Outs, RetCC_X86);
3180 }
3181
3182 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
3183   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
3184   return ScratchRegs;
3185 }
3186
3187 ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
3188   // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit
3189   // tests at the moment, which is not what we expected.
3190   static const MCPhysReg RCRegs[] = {X86::MXCSR};
3191   return RCRegs;
3192 }
3193
3194 /// Lowers masks values (v*i1) to the local register values
3195 /// \returns DAG node after lowering to register type
3196 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
3197                                const SDLoc &Dl, SelectionDAG &DAG) {
3198   EVT ValVT = ValArg.getValueType();
3199
3200   if (ValVT == MVT::v1i1)
3201     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
3202                        DAG.getIntPtrConstant(0, Dl));
3203
3204   if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
3205       (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
3206     // Two stage lowering might be required
3207     // bitcast:   v8i1 -> i8 / v16i1 -> i16
3208     // anyextend: i8   -> i32 / i16   -> i32
3209     EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
3210     SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
3211     if (ValLoc == MVT::i32)
3212       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
3213     return ValToCopy;
3214   }
3215
3216   if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
3217       (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
3218     // One stage lowering is required
3219     // bitcast:   v32i1 -> i32 / v64i1 -> i64
3220     return DAG.getBitcast(ValLoc, ValArg);
3221   }
3222
3223   return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
3224 }
3225
3226 /// Breaks v64i1 value into two registers and adds the new node to the DAG
3227 static void Passv64i1ArgInRegs(
3228     const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
3229     SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
3230     CCValAssign &NextVA, const X86Subtarget &Subtarget) {
3231   assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
3232   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3233   assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
3234   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3235          "The value should reside in two registers");
3236
3237   // Before splitting the value we cast it to i64
3238   Arg = DAG.getBitcast(MVT::i64, Arg);
3239
3240   // Splitting the value into two i32 types
3241   SDValue Lo, Hi;
3242   std::tie(Lo, Hi) = DAG.SplitScalar(Arg, Dl, MVT::i32, MVT::i32);
3243
3244   // Attach the two i32 types into corresponding registers
3245   RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
3246   RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
3247 }
3248
3249 SDValue
3250 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3251                                bool isVarArg,
3252                                const SmallVectorImpl<ISD::OutputArg> &Outs,
3253                                const SmallVectorImpl<SDValue> &OutVals,
3254                                const SDLoc &dl, SelectionDAG &DAG) const {
3255   MachineFunction &MF = DAG.getMachineFunction();
3256   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3257
3258   // In some cases we need to disable registers from the default CSR list.
3259   // For example, when they are used as return registers (preserve_* and X86's
3260   // regcall) or for argument passing (X86's regcall).
3261   bool ShouldDisableCalleeSavedRegister =
3262       shouldDisableRetRegFromCSR(CallConv) ||
3263       MF.getFunction().hasFnAttribute("no_caller_saved_registers");
3264
3265   if (CallConv == CallingConv::X86_INTR && !Outs.empty())
3266     report_fatal_error("X86 interrupts may not return any value");
3267
3268   SmallVector<CCValAssign, 16> RVLocs;
3269   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
3270   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
3271
3272   SmallVector<std::pair<Register, SDValue>, 4> RetVals;
3273   for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
3274        ++I, ++OutsIndex) {
3275     CCValAssign &VA = RVLocs[I];
3276     assert(VA.isRegLoc() && "Can only return in registers!");
3277
3278     // Add the register to the CalleeSaveDisableRegs list.
3279     if (ShouldDisableCalleeSavedRegister)
3280       MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
3281
3282     SDValue ValToCopy = OutVals[OutsIndex];
3283     EVT ValVT = ValToCopy.getValueType();
3284
3285     // Promote values to the appropriate types.
3286     if (VA.getLocInfo() == CCValAssign::SExt)
3287       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
3288     else if (VA.getLocInfo() == CCValAssign::ZExt)
3289       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
3290     else if (VA.getLocInfo() == CCValAssign::AExt) {
3291       if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
3292         ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
3293       else
3294         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
3295     }
3296     else if (VA.getLocInfo() == CCValAssign::BCvt)
3297       ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
3298
3299     assert(VA.getLocInfo() != CCValAssign::FPExt &&
3300            "Unexpected FP-extend for return value.");
3301
3302     // Report an error if we have attempted to return a value via an XMM
3303     // register and SSE was disabled.
3304     if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3305       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3306       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3307     } else if (!Subtarget.hasSSE2() &&
3308                X86::FR64XRegClass.contains(VA.getLocReg()) &&
3309                ValVT == MVT::f64) {
3310       // When returning a double via an XMM register, report an error if SSE2 is
3311       // not enabled.
3312       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3313       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3314     }
3315
3316     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
3317     // the RET instruction and handled by the FP Stackifier.
3318     if (VA.getLocReg() == X86::FP0 ||
3319         VA.getLocReg() == X86::FP1) {
3320       // If this is a copy from an xmm register to ST(0), use an FPExtend to
3321       // change the value to the FP stack register class.
3322       if (isScalarFPTypeInSSEReg(VA.getValVT()))
3323         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
3324       RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3325       // Don't emit a copytoreg.
3326       continue;
3327     }
3328
3329     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
3330     // which is returned in RAX / RDX.
3331     if (Subtarget.is64Bit()) {
3332       if (ValVT == MVT::x86mmx) {
3333         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
3334           ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
3335           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
3336                                   ValToCopy);
3337           // If we don't have SSE2 available, convert to v4f32 so the generated
3338           // register is legal.
3339           if (!Subtarget.hasSSE2())
3340             ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
3341         }
3342       }
3343     }
3344
3345     if (VA.needsCustom()) {
3346       assert(VA.getValVT() == MVT::v64i1 &&
3347              "Currently the only custom case is when we split v64i1 to 2 regs");
3348
3349       Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
3350                          Subtarget);
3351
3352       // Add the second register to the CalleeSaveDisableRegs list.
3353       if (ShouldDisableCalleeSavedRegister)
3354         MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
3355     } else {
3356       RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3357     }
3358   }
3359
3360   SDValue Glue;
3361   SmallVector<SDValue, 6> RetOps;
3362   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3363   // Operand #1 = Bytes To Pop
3364   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
3365                    MVT::i32));
3366
3367   // Copy the result values into the output registers.
3368   for (auto &RetVal : RetVals) {
3369     if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
3370       RetOps.push_back(RetVal.second);
3371       continue; // Don't emit a copytoreg.
3372     }
3373
3374     Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
3375     Glue = Chain.getValue(1);
3376     RetOps.push_back(
3377         DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3378   }
3379
3380   // Swift calling convention does not require we copy the sret argument
3381   // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3382
3383   // All x86 ABIs require that for returning structs by value we copy
3384   // the sret argument into %rax/%eax (depending on ABI) for the return.
3385   // We saved the argument into a virtual register in the entry block,
3386   // so now we copy the value out and into %rax/%eax.
3387   //
3388   // Checking Function.hasStructRetAttr() here is insufficient because the IR
3389   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3390   // false, then an sret argument may be implicitly inserted in the SelDAG. In
3391   // either case FuncInfo->setSRetReturnReg() will have been called.
3392   if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3393     // When we have both sret and another return value, we should use the
3394     // original Chain stored in RetOps[0], instead of the current Chain updated
3395     // in the above loop. If we only have sret, RetOps[0] equals to Chain.
3396
3397     // For the case of sret and another return value, we have
3398     //   Chain_0 at the function entry
3399     //   Chain_1 = getCopyToReg(Chain_0) in the above loop
3400     // If we use Chain_1 in getCopyFromReg, we will have
3401     //   Val = getCopyFromReg(Chain_1)
3402     //   Chain_2 = getCopyToReg(Chain_1, Val) from below
3403
3404     // getCopyToReg(Chain_0) will be glued together with
3405     // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3406     // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3407     //   Data dependency from Unit B to Unit A due to usage of Val in
3408     //     getCopyToReg(Chain_1, Val)
3409     //   Chain dependency from Unit A to Unit B
3410
3411     // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
3412     SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3413                                      getPointerTy(MF.getDataLayout()));
3414
3415     Register RetValReg
3416         = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3417           X86::RAX : X86::EAX;
3418     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
3419     Glue = Chain.getValue(1);
3420
3421     // RAX/EAX now acts like a return value.
3422     RetOps.push_back(
3423         DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3424
3425     // Add the returned register to the CalleeSaveDisableRegs list. Don't do
3426     // this however for preserve_most/preserve_all to minimize the number of
3427     // callee-saved registers for these CCs.
3428     if (ShouldDisableCalleeSavedRegister &&
3429         CallConv != CallingConv::PreserveAll &&
3430         CallConv != CallingConv::PreserveMost)
3431       MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3432   }
3433
3434   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3435   const MCPhysReg *I =
3436       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3437   if (I) {
3438     for (; *I; ++I) {
3439       if (X86::GR64RegClass.contains(*I))
3440         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3441       else
3442         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3443     }
3444   }
3445
3446   RetOps[0] = Chain;  // Update chain.
3447
3448   // Add the glue if we have it.
3449   if (Glue.getNode())
3450     RetOps.push_back(Glue);
3451
3452   X86ISD::NodeType opcode = X86ISD::RET_GLUE;
3453   if (CallConv == CallingConv::X86_INTR)
3454     opcode = X86ISD::IRET;
3455   return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3456 }
3457
3458 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3459   if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3460     return false;
3461
3462   SDValue TCChain = Chain;
3463   SDNode *Copy = *N->use_begin();
3464   if (Copy->getOpcode() == ISD::CopyToReg) {
3465     // If the copy has a glue operand, we conservatively assume it isn't safe to
3466     // perform a tail call.
3467     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3468       return false;
3469     TCChain = Copy->getOperand(0);
3470   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3471     return false;
3472
3473   bool HasRet = false;
3474   for (const SDNode *U : Copy->uses()) {
3475     if (U->getOpcode() != X86ISD::RET_GLUE)
3476       return false;
3477     // If we are returning more than one value, we can definitely
3478     // not make a tail call see PR19530
3479     if (U->getNumOperands() > 4)
3480       return false;
3481     if (U->getNumOperands() == 4 &&
3482         U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
3483       return false;
3484     HasRet = true;
3485   }
3486
3487   if (!HasRet)
3488     return false;
3489
3490   Chain = TCChain;
3491   return true;
3492 }
3493
3494 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3495                                            ISD::NodeType ExtendKind) const {
3496   MVT ReturnMVT = MVT::i32;
3497
3498   bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3499   if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3500     // The ABI does not require i1, i8 or i16 to be extended.
3501     //
3502     // On Darwin, there is code in the wild relying on Clang's old behaviour of
3503     // always extending i8/i16 return values, so keep doing that for now.
3504     // (PR26665).
3505     ReturnMVT = MVT::i8;
3506   }
3507
3508   EVT MinVT = getRegisterType(Context, ReturnMVT);
3509   return VT.bitsLT(MinVT) ? MinVT : VT;
3510 }
3511
3512 /// Reads two 32 bit registers and creates a 64 bit mask value.
3513 /// \param VA The current 32 bit value that need to be assigned.
3514 /// \param NextVA The next 32 bit value that need to be assigned.
3515 /// \param Root The parent DAG node.
3516 /// \param [in,out] InGlue Represents SDvalue in the parent DAG node for
3517 ///                        glue purposes. In the case the DAG is already using
3518 ///                        physical register instead of virtual, we should glue
3519 ///                        our new SDValue to InGlue SDvalue.
3520 /// \return a new SDvalue of size 64bit.
3521 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3522                                 SDValue &Root, SelectionDAG &DAG,
3523                                 const SDLoc &Dl, const X86Subtarget &Subtarget,
3524                                 SDValue *InGlue = nullptr) {
3525   assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3526   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3527   assert(VA.getValVT() == MVT::v64i1 &&
3528          "Expecting first location of 64 bit width type");
3529   assert(NextVA.getValVT() == VA.getValVT() &&
3530          "The locations should have the same type");
3531   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3532          "The values should reside in two registers");
3533
3534   SDValue Lo, Hi;
3535   SDValue ArgValueLo, ArgValueHi;
3536
3537   MachineFunction &MF = DAG.getMachineFunction();
3538   const TargetRegisterClass *RC = &X86::GR32RegClass;
3539
3540   // Read a 32 bit value from the registers.
3541   if (nullptr == InGlue) {
3542     // When no physical register is present,
3543     // create an intermediate virtual register.
3544     Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3545     ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3546     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3547     ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3548   } else {
3549     // When a physical register is available read the value from it and glue
3550     // the reads together.
3551     ArgValueLo =
3552       DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InGlue);
3553     *InGlue = ArgValueLo.getValue(2);
3554     ArgValueHi =
3555       DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InGlue);
3556     *InGlue = ArgValueHi.getValue(2);
3557   }
3558
3559   // Convert the i32 type into v32i1 type.
3560   Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3561
3562   // Convert the i32 type into v32i1 type.
3563   Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3564
3565   // Concatenate the two values together.
3566   return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3567 }
3568
3569 /// The function will lower a register of various sizes (8/16/32/64)
3570 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
3571 /// \returns a DAG node contains the operand after lowering to mask type.
3572 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3573                                const EVT &ValLoc, const SDLoc &Dl,
3574                                SelectionDAG &DAG) {
3575   SDValue ValReturned = ValArg;
3576
3577   if (ValVT == MVT::v1i1)
3578     return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3579
3580   if (ValVT == MVT::v64i1) {
3581     // In 32 bit machine, this case is handled by getv64i1Argument
3582     assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3583     // In 64 bit machine, There is no need to truncate the value only bitcast
3584   } else {
3585     MVT maskLen;
3586     switch (ValVT.getSimpleVT().SimpleTy) {
3587     case MVT::v8i1:
3588       maskLen = MVT::i8;
3589       break;
3590     case MVT::v16i1:
3591       maskLen = MVT::i16;
3592       break;
3593     case MVT::v32i1:
3594       maskLen = MVT::i32;
3595       break;
3596     default:
3597       llvm_unreachable("Expecting a vector of i1 types");
3598     }
3599
3600     ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3601   }
3602   return DAG.getBitcast(ValVT, ValReturned);
3603 }
3604
3605 /// Lower the result values of a call into the
3606 /// appropriate copies out of appropriate physical registers.
3607 ///
3608 SDValue X86TargetLowering::LowerCallResult(
3609     SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
3610     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3611     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3612     uint32_t *RegMask) const {
3613
3614   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3615   // Assign locations to each value returned by this call.
3616   SmallVector<CCValAssign, 16> RVLocs;
3617   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3618                  *DAG.getContext());
3619   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3620
3621   // Copy all of the result registers out of their specified physreg.
3622   for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3623        ++I, ++InsIndex) {
3624     CCValAssign &VA = RVLocs[I];
3625     EVT CopyVT = VA.getLocVT();
3626
3627     // In some calling conventions we need to remove the used registers
3628     // from the register mask.
3629     if (RegMask) {
3630       for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg()))
3631         RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
3632     }
3633
3634     // Report an error if there was an attempt to return FP values via XMM
3635     // registers.
3636     if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3637       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3638       if (VA.getLocReg() == X86::XMM1)
3639         VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3640       else
3641         VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3642     } else if (!Subtarget.hasSSE2() &&
3643                X86::FR64XRegClass.contains(VA.getLocReg()) &&
3644                CopyVT == MVT::f64) {
3645       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3646       if (VA.getLocReg() == X86::XMM1)
3647         VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3648       else
3649         VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3650     }
3651
3652     // If we prefer to use the value in xmm registers, copy it out as f80 and
3653     // use a truncate to move it from fp stack reg to xmm reg.
3654     bool RoundAfterCopy = false;
3655     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3656         isScalarFPTypeInSSEReg(VA.getValVT())) {
3657       if (!Subtarget.hasX87())
3658         report_fatal_error("X87 register return with X87 disabled");
3659       CopyVT = MVT::f80;
3660       RoundAfterCopy = (CopyVT != VA.getLocVT());
3661     }
3662
3663     SDValue Val;
3664     if (VA.needsCustom()) {
3665       assert(VA.getValVT() == MVT::v64i1 &&
3666              "Currently the only custom case is when we split v64i1 to 2 regs");
3667       Val =
3668           getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
3669     } else {
3670       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
3671                   .getValue(1);
3672       Val = Chain.getValue(0);
3673       InGlue = Chain.getValue(2);
3674     }
3675
3676     if (RoundAfterCopy)
3677       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3678                         // This truncation won't change the value.
3679                         DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
3680
3681     if (VA.isExtInLoc()) {
3682       if (VA.getValVT().isVector() &&
3683           VA.getValVT().getScalarType() == MVT::i1 &&
3684           ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3685            (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3686         // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3687         Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3688       } else
3689         Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3690     }
3691
3692     if (VA.getLocInfo() == CCValAssign::BCvt)
3693       Val = DAG.getBitcast(VA.getValVT(), Val);
3694
3695     InVals.push_back(Val);
3696   }
3697
3698   return Chain;
3699 }
3700
3701 //===----------------------------------------------------------------------===//
3702 //                C & StdCall & Fast Calling Convention implementation
3703 //===----------------------------------------------------------------------===//
3704 //  StdCall calling convention seems to be standard for many Windows' API
3705 //  routines and around. It differs from C calling convention just a little:
3706 //  callee should clean up the stack, not caller. Symbols should be also
3707 //  decorated in some fancy way :) It doesn't support any vector arguments.
3708 //  For info on fast calling convention see Fast Calling Convention (tail call)
3709 //  implementation LowerX86_32FastCCCallTo.
3710
3711 /// Determines whether Args, either a set of outgoing arguments to a call, or a
3712 /// set of incoming args of a call, contains an sret pointer that the callee
3713 /// pops
3714 template <typename T>
3715 static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
3716                              const X86Subtarget &Subtarget) {
3717   // Not C++20 (yet), so no concepts available.
3718   static_assert(std::is_same_v<T, ISD::OutputArg> ||
3719                     std::is_same_v<T, ISD::InputArg>,
3720                 "requires ISD::OutputArg or ISD::InputArg");
3721
3722   // Only 32-bit pops the sret.  It's a 64-bit world these days, so early-out
3723   // for most compilations.
3724   if (!Subtarget.is32Bit())
3725     return false;
3726
3727   if (Args.empty())
3728     return false;
3729
3730   // Most calls do not have an sret argument, check the arg next.
3731   const ISD::ArgFlagsTy &Flags = Args[0].Flags;
3732   if (!Flags.isSRet() || Flags.isInReg())
3733     return false;
3734
3735   // The MSVCabi does not pop the sret.
3736   if (Subtarget.getTargetTriple().isOSMSVCRT())
3737     return false;
3738
3739   // MCUs don't pop the sret
3740   if (Subtarget.isTargetMCU())
3741     return false;
3742
3743   // Callee pops argument
3744   return true;
3745 }
3746
3747 /// Make a copy of an aggregate at address specified by "Src" to address
3748 /// "Dst" with size and alignment information specified by the specific
3749 /// parameter attribute. The copy will be passed as a byval function parameter.
3750 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3751                                          SDValue Chain, ISD::ArgFlagsTy Flags,
3752                                          SelectionDAG &DAG, const SDLoc &dl) {
3753   SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3754
3755   return DAG.getMemcpy(
3756       Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3757       /*isVolatile*/ false, /*AlwaysInline=*/true,
3758       /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3759 }
3760
3761 /// Return true if the calling convention is one that we can guarantee TCO for.
3762 static bool canGuaranteeTCO(CallingConv::ID CC) {
3763   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3764           CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3765           CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
3766 }
3767
3768 /// Return true if we might ever do TCO for calls with this calling convention.
3769 static bool mayTailCallThisCC(CallingConv::ID CC) {
3770   switch (CC) {
3771   // C calling conventions:
3772   case CallingConv::C:
3773   case CallingConv::Win64:
3774   case CallingConv::X86_64_SysV:
3775   // Callee pop conventions:
3776   case CallingConv::X86_ThisCall:
3777   case CallingConv::X86_StdCall:
3778   case CallingConv::X86_VectorCall:
3779   case CallingConv::X86_FastCall:
3780   // Swift:
3781   case CallingConv::Swift:
3782     return true;
3783   default:
3784     return canGuaranteeTCO(CC);
3785   }
3786 }
3787
3788 /// Return true if the function is being made into a tailcall target by
3789 /// changing its ABI.
3790 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3791   return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3792          CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3793 }
3794
3795 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3796   if (!CI->isTailCall())
3797     return false;
3798
3799   CallingConv::ID CalleeCC = CI->getCallingConv();
3800   if (!mayTailCallThisCC(CalleeCC))
3801     return false;
3802
3803   return true;
3804 }
3805
3806 SDValue
3807 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3808                                     const SmallVectorImpl<ISD::InputArg> &Ins,
3809                                     const SDLoc &dl, SelectionDAG &DAG,
3810                                     const CCValAssign &VA,
3811                                     MachineFrameInfo &MFI, unsigned i) const {
3812   // Create the nodes corresponding to a load from this parameter slot.
3813   ISD::ArgFlagsTy Flags = Ins[i].Flags;
3814   bool AlwaysUseMutable = shouldGuaranteeTCO(
3815       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3816   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3817   EVT ValVT;
3818   MVT PtrVT = getPointerTy(DAG.getDataLayout());
3819
3820   // If value is passed by pointer we have address passed instead of the value
3821   // itself. No need to extend if the mask value and location share the same
3822   // absolute size.
3823   bool ExtendedInMem =
3824       VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3825       VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3826
3827   if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3828     ValVT = VA.getLocVT();
3829   else
3830     ValVT = VA.getValVT();
3831
3832   // FIXME: For now, all byval parameter objects are marked mutable. This can be
3833   // changed with more analysis.
3834   // In case of tail call optimization mark all arguments mutable. Since they
3835   // could be overwritten by lowering of arguments in case of a tail call.
3836   if (Flags.isByVal()) {
3837     unsigned Bytes = Flags.getByValSize();
3838     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3839
3840     // FIXME: For now, all byval parameter objects are marked as aliasing. This
3841     // can be improved with deeper analysis.
3842     int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3843                                    /*isAliased=*/true);
3844     return DAG.getFrameIndex(FI, PtrVT);
3845   }
3846
3847   EVT ArgVT = Ins[i].ArgVT;
3848
3849   // If this is a vector that has been split into multiple parts, and the
3850   // scalar size of the parts don't match the vector element size, then we can't
3851   // elide the copy. The parts will have padding between them instead of being
3852   // packed like a vector.
3853   bool ScalarizedAndExtendedVector =
3854       ArgVT.isVector() && !VA.getLocVT().isVector() &&
3855       VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3856
3857   // This is an argument in memory. We might be able to perform copy elision.
3858   // If the argument is passed directly in memory without any extension, then we
3859   // can perform copy elision. Large vector types, for example, may be passed
3860   // indirectly by pointer.
3861   if (Flags.isCopyElisionCandidate() &&
3862       VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3863       !ScalarizedAndExtendedVector) {
3864     SDValue PartAddr;
3865     if (Ins[i].PartOffset == 0) {
3866       // If this is a one-part value or the first part of a multi-part value,
3867       // create a stack object for the entire argument value type and return a
3868       // load from our portion of it. This assumes that if the first part of an
3869       // argument is in memory, the rest will also be in memory.
3870       int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3871                                      /*IsImmutable=*/false);
3872       PartAddr = DAG.getFrameIndex(FI, PtrVT);
3873       return DAG.getLoad(
3874           ValVT, dl, Chain, PartAddr,
3875           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3876     } else {
3877       // This is not the first piece of an argument in memory. See if there is
3878       // already a fixed stack object including this offset. If so, assume it
3879       // was created by the PartOffset == 0 branch above and create a load from
3880       // the appropriate offset into it.
3881       int64_t PartBegin = VA.getLocMemOffset();
3882       int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3883       int FI = MFI.getObjectIndexBegin();
3884       for (; MFI.isFixedObjectIndex(FI); ++FI) {
3885         int64_t ObjBegin = MFI.getObjectOffset(FI);
3886         int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3887         if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3888           break;
3889       }
3890       if (MFI.isFixedObjectIndex(FI)) {
3891         SDValue Addr =
3892             DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3893                         DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3894         return DAG.getLoad(
3895             ValVT, dl, Chain, Addr,
3896             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3897                                               Ins[i].PartOffset));
3898       }
3899     }
3900   }
3901
3902   int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3903                                  VA.getLocMemOffset(), isImmutable);
3904
3905   // Set SExt or ZExt flag.
3906   if (VA.getLocInfo() == CCValAssign::ZExt) {
3907     MFI.setObjectZExt(FI, true);
3908   } else if (VA.getLocInfo() == CCValAssign::SExt) {
3909     MFI.setObjectSExt(FI, true);
3910   }
3911
3912   MaybeAlign Alignment;
3913   if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
3914       ValVT != MVT::f80)
3915     Alignment = MaybeAlign(4);
3916   SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3917   SDValue Val = DAG.getLoad(
3918       ValVT, dl, Chain, FIN,
3919       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3920       Alignment);
3921   return ExtendedInMem
3922              ? (VA.getValVT().isVector()
3923                     ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3924                     : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3925              : Val;
3926 }
3927
3928 // FIXME: Get this from tablegen.
3929 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3930                                                 const X86Subtarget &Subtarget) {
3931   assert(Subtarget.is64Bit());
3932
3933   if (Subtarget.isCallingConvWin64(CallConv)) {
3934     static const MCPhysReg GPR64ArgRegsWin64[] = {
3935       X86::RCX, X86::RDX, X86::R8,  X86::R9
3936     };
3937     return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3938   }
3939
3940   static const MCPhysReg GPR64ArgRegs64Bit[] = {
3941     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3942   };
3943   return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3944 }
3945
3946 // FIXME: Get this from tablegen.
3947 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3948                                                 CallingConv::ID CallConv,
3949                                                 const X86Subtarget &Subtarget) {
3950   assert(Subtarget.is64Bit());
3951   if (Subtarget.isCallingConvWin64(CallConv)) {
3952     // The XMM registers which might contain var arg parameters are shadowed
3953     // in their paired GPR.  So we only need to save the GPR to their home
3954     // slots.
3955     // TODO: __vectorcall will change this.
3956     return std::nullopt;
3957   }
3958
3959   bool isSoftFloat = Subtarget.useSoftFloat();
3960   if (isSoftFloat || !Subtarget.hasSSE1())
3961     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3962     // registers.
3963     return std::nullopt;
3964
3965   static const MCPhysReg XMMArgRegs64Bit[] = {
3966     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3967     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3968   };
3969   return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3970 }
3971
3972 #ifndef NDEBUG
3973 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3974   return llvm::is_sorted(
3975       ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3976         return A.getValNo() < B.getValNo();
3977       });
3978 }
3979 #endif
3980
3981 namespace {
3982 /// This is a helper class for lowering variable arguments parameters.
3983 class VarArgsLoweringHelper {
3984 public:
3985   VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3986                         SelectionDAG &DAG, const X86Subtarget &Subtarget,
3987                         CallingConv::ID CallConv, CCState &CCInfo)
3988       : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3989         TheMachineFunction(DAG.getMachineFunction()),
3990         TheFunction(TheMachineFunction.getFunction()),
3991         FrameInfo(TheMachineFunction.getFrameInfo()),
3992         FrameLowering(*Subtarget.getFrameLowering()),
3993         TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3994         CCInfo(CCInfo) {}
3995
3996   // Lower variable arguments parameters.
3997   void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3998
3999 private:
4000   void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
4001
4002   void forwardMustTailParameters(SDValue &Chain);
4003
4004   bool is64Bit() const { return Subtarget.is64Bit(); }
4005   bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
4006
4007   X86MachineFunctionInfo *FuncInfo;
4008   const SDLoc &DL;
4009   SelectionDAG &DAG;
4010   const X86Subtarget &Subtarget;
4011   MachineFunction &TheMachineFunction;
4012   const Function &TheFunction;
4013   MachineFrameInfo &FrameInfo;
4014   const TargetFrameLowering &FrameLowering;
4015   const TargetLowering &TargLowering;
4016   CallingConv::ID CallConv;
4017   CCState &CCInfo;
4018 };
4019 } // namespace
4020
4021 void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
4022     SDValue &Chain, unsigned StackSize) {
4023   // If the function takes variable number of arguments, make a frame index for
4024   // the start of the first vararg value... for expansion of llvm.va_start. We
4025   // can skip this if there are no va_start calls.
4026   if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
4027                     CallConv != CallingConv::X86_ThisCall)) {
4028     FuncInfo->setVarArgsFrameIndex(
4029         FrameInfo.CreateFixedObject(1, StackSize, true));
4030   }
4031
4032   // 64-bit calling conventions support varargs and register parameters, so we
4033   // have to do extra work to spill them in the prologue.
4034   if (is64Bit()) {
4035     // Find the first unallocated argument registers.
4036     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
4037     ArrayRef<MCPhysReg> ArgXMMs =
4038         get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
4039     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
4040     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
4041
4042     assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
4043            "SSE register cannot be used when SSE is disabled!");
4044
4045     if (isWin64()) {
4046       // Get to the caller-allocated home save location.  Add 8 to account
4047       // for the return address.
4048       int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
4049       FuncInfo->setRegSaveFrameIndex(
4050           FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
4051       // Fixup to set vararg frame on shadow area (4 x i64).
4052       if (NumIntRegs < 4)
4053         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
4054     } else {
4055       // For X86-64, if there are vararg parameters that are passed via
4056       // registers, then we must store them to their spots on the stack so
4057       // they may be loaded by dereferencing the result of va_next.
4058       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
4059       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
4060       FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
4061           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
4062     }
4063
4064     SmallVector<SDValue, 6>
4065         LiveGPRs; // list of SDValue for GPR registers keeping live input value
4066     SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
4067                                          // keeping live input value
4068     SDValue ALVal; // if applicable keeps SDValue for %al register
4069
4070     // Gather all the live in physical registers.
4071     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
4072       Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
4073       LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
4074     }
4075     const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
4076     if (!AvailableXmms.empty()) {
4077       Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4078       ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
4079       for (MCPhysReg Reg : AvailableXmms) {
4080         // FastRegisterAllocator spills virtual registers at basic
4081         // block boundary. That leads to usages of xmm registers
4082         // outside of check for %al. Pass physical registers to
4083         // VASTART_SAVE_XMM_REGS to avoid unneccessary spilling.
4084         TheMachineFunction.getRegInfo().addLiveIn(Reg);
4085         LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
4086       }
4087     }
4088
4089     // Store the integer parameter registers.
4090     SmallVector<SDValue, 8> MemOps;
4091     SDValue RSFIN =
4092         DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
4093                           TargLowering.getPointerTy(DAG.getDataLayout()));
4094     unsigned Offset = FuncInfo->getVarArgsGPOffset();
4095     for (SDValue Val : LiveGPRs) {
4096       SDValue FIN = DAG.getNode(ISD::ADD, DL,
4097                                 TargLowering.getPointerTy(DAG.getDataLayout()),
4098                                 RSFIN, DAG.getIntPtrConstant(Offset, DL));
4099       SDValue Store =
4100           DAG.getStore(Val.getValue(1), DL, Val, FIN,
4101                        MachinePointerInfo::getFixedStack(
4102                            DAG.getMachineFunction(),
4103                            FuncInfo->getRegSaveFrameIndex(), Offset));
4104       MemOps.push_back(Store);
4105       Offset += 8;
4106     }
4107
4108     // Now store the XMM (fp + vector) parameter registers.
4109     if (!LiveXMMRegs.empty()) {
4110       SmallVector<SDValue, 12> SaveXMMOps;
4111       SaveXMMOps.push_back(Chain);
4112       SaveXMMOps.push_back(ALVal);
4113       SaveXMMOps.push_back(RSFIN);
4114       SaveXMMOps.push_back(
4115           DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
4116       llvm::append_range(SaveXMMOps, LiveXMMRegs);
4117       MachineMemOperand *StoreMMO =
4118           DAG.getMachineFunction().getMachineMemOperand(
4119               MachinePointerInfo::getFixedStack(
4120                   DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
4121                   Offset),
4122               MachineMemOperand::MOStore, 128, Align(16));
4123       MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
4124                                                DL, DAG.getVTList(MVT::Other),
4125                                                SaveXMMOps, MVT::i8, StoreMMO));
4126     }
4127
4128     if (!MemOps.empty())
4129       Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
4130   }
4131 }
4132
4133 void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
4134   // Find the largest legal vector type.
4135   MVT VecVT = MVT::Other;
4136   // FIXME: Only some x86_32 calling conventions support AVX512.
4137   if (Subtarget.useAVX512Regs() &&
4138       (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
4139                      CallConv == CallingConv::Intel_OCL_BI)))
4140     VecVT = MVT::v16f32;
4141   else if (Subtarget.hasAVX())
4142     VecVT = MVT::v8f32;
4143   else if (Subtarget.hasSSE2())
4144     VecVT = MVT::v4f32;
4145
4146   // We forward some GPRs and some vector types.
4147   SmallVector<MVT, 2> RegParmTypes;
4148   MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
4149   RegParmTypes.push_back(IntVT);
4150   if (VecVT != MVT::Other)
4151     RegParmTypes.push_back(VecVT);
4152
4153   // Compute the set of forwarded registers. The rest are scratch.
4154   SmallVectorImpl<ForwardedRegister> &Forwards =
4155       FuncInfo->getForwardedMustTailRegParms();
4156   CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
4157
4158   // Forward AL for SysV x86_64 targets, since it is used for varargs.
4159   if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
4160     Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4161     Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
4162   }
4163
4164   // Copy all forwards from physical to virtual registers.
4165   for (ForwardedRegister &FR : Forwards) {
4166     // FIXME: Can we use a less constrained schedule?
4167     SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
4168     FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
4169         TargLowering.getRegClassFor(FR.VT));
4170     Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
4171   }
4172 }
4173
4174 void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
4175                                                    unsigned StackSize) {
4176   // Set FrameIndex to the 0xAAAAAAA value to mark unset state.
4177   // If necessary, it would be set into the correct value later.
4178   FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
4179   FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4180
4181   if (FrameInfo.hasVAStart())
4182     createVarArgAreaAndStoreRegisters(Chain, StackSize);
4183
4184   if (FrameInfo.hasMustTailInVarArgFunc())
4185     forwardMustTailParameters(Chain);
4186 }
4187
4188 SDValue X86TargetLowering::LowerFormalArguments(
4189     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
4190     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4191     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4192   MachineFunction &MF = DAG.getMachineFunction();
4193   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4194
4195   const Function &F = MF.getFunction();
4196   if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
4197       F.getName() == "main")
4198     FuncInfo->setForceFramePointer(true);
4199
4200   MachineFrameInfo &MFI = MF.getFrameInfo();
4201   bool Is64Bit = Subtarget.is64Bit();
4202   bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4203
4204   assert(
4205       !(IsVarArg && canGuaranteeTCO(CallConv)) &&
4206       "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
4207
4208   // Assign locations to all of the incoming arguments.
4209   SmallVector<CCValAssign, 16> ArgLocs;
4210   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4211
4212   // Allocate shadow area for Win64.
4213   if (IsWin64)
4214     CCInfo.AllocateStack(32, Align(8));
4215
4216   CCInfo.AnalyzeArguments(Ins, CC_X86);
4217
4218   // In vectorcall calling convention a second pass is required for the HVA
4219   // types.
4220   if (CallingConv::X86_VectorCall == CallConv) {
4221     CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
4222   }
4223
4224   // The next loop assumes that the locations are in the same order of the
4225   // input arguments.
4226   assert(isSortedByValueNo(ArgLocs) &&
4227          "Argument Location list must be sorted before lowering");
4228
4229   SDValue ArgValue;
4230   for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
4231        ++I, ++InsIndex) {
4232     assert(InsIndex < Ins.size() && "Invalid Ins index");
4233     CCValAssign &VA = ArgLocs[I];
4234
4235     if (VA.isRegLoc()) {
4236       EVT RegVT = VA.getLocVT();
4237       if (VA.needsCustom()) {
4238         assert(
4239             VA.getValVT() == MVT::v64i1 &&
4240             "Currently the only custom case is when we split v64i1 to 2 regs");
4241
4242         // v64i1 values, in regcall calling convention, that are
4243         // compiled to 32 bit arch, are split up into two registers.
4244         ArgValue =
4245             getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
4246       } else {
4247         const TargetRegisterClass *RC;
4248         if (RegVT == MVT::i8)
4249           RC = &X86::GR8RegClass;
4250         else if (RegVT == MVT::i16)
4251           RC = &X86::GR16RegClass;
4252         else if (RegVT == MVT::i32)
4253           RC = &X86::GR32RegClass;
4254         else if (Is64Bit && RegVT == MVT::i64)
4255           RC = &X86::GR64RegClass;
4256         else if (RegVT == MVT::f16)
4257           RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
4258         else if (RegVT == MVT::f32)
4259           RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
4260         else if (RegVT == MVT::f64)
4261           RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
4262         else if (RegVT == MVT::f80)
4263           RC = &X86::RFP80RegClass;
4264         else if (RegVT == MVT::f128)
4265           RC = &X86::VR128RegClass;
4266         else if (RegVT.is512BitVector())
4267           RC = &X86::VR512RegClass;
4268         else if (RegVT.is256BitVector())
4269           RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
4270         else if (RegVT.is128BitVector())
4271           RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
4272         else if (RegVT == MVT::x86mmx)
4273           RC = &X86::VR64RegClass;
4274         else if (RegVT == MVT::v1i1)
4275           RC = &X86::VK1RegClass;
4276         else if (RegVT == MVT::v8i1)
4277           RC = &X86::VK8RegClass;
4278         else if (RegVT == MVT::v16i1)
4279           RC = &X86::VK16RegClass;
4280         else if (RegVT == MVT::v32i1)
4281           RC = &X86::VK32RegClass;
4282         else if (RegVT == MVT::v64i1)
4283           RC = &X86::VK64RegClass;
4284         else
4285           llvm_unreachable("Unknown argument type!");
4286
4287         Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4288         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4289       }
4290
4291       // If this is an 8 or 16-bit value, it is really passed promoted to 32
4292       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
4293       // right size.
4294       if (VA.getLocInfo() == CCValAssign::SExt)
4295         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
4296                                DAG.getValueType(VA.getValVT()));
4297       else if (VA.getLocInfo() == CCValAssign::ZExt)
4298         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
4299                                DAG.getValueType(VA.getValVT()));
4300       else if (VA.getLocInfo() == CCValAssign::BCvt)
4301         ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
4302
4303       if (VA.isExtInLoc()) {
4304         // Handle MMX values passed in XMM regs.
4305         if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
4306           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
4307         else if (VA.getValVT().isVector() &&
4308                  VA.getValVT().getScalarType() == MVT::i1 &&
4309                  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
4310                   (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
4311           // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
4312           ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
4313         } else
4314           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4315       }
4316     } else {
4317       assert(VA.isMemLoc());
4318       ArgValue =
4319           LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
4320     }
4321
4322     // If value is passed via pointer - do a load.
4323     if (VA.getLocInfo() == CCValAssign::Indirect &&
4324         !(Ins[I].Flags.isByVal() && VA.isRegLoc())) {
4325       ArgValue =
4326           DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
4327     }
4328
4329     InVals.push_back(ArgValue);
4330   }
4331
4332   for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
4333     if (Ins[I].Flags.isSwiftAsync()) {
4334       auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
4335       if (Subtarget.is64Bit())
4336         X86FI->setHasSwiftAsyncContext(true);
4337       else {
4338         int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
4339         X86FI->setSwiftAsyncContextFrameIdx(FI);
4340         SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
4341                                   DAG.getFrameIndex(FI, MVT::i32),
4342                                   MachinePointerInfo::getFixedStack(MF, FI));
4343         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
4344       }
4345     }
4346
4347     // Swift calling convention does not require we copy the sret argument
4348     // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
4349     if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
4350       continue;
4351
4352     // All x86 ABIs require that for returning structs by value we copy the
4353     // sret argument into %rax/%eax (depending on ABI) for the return. Save
4354     // the argument into a virtual register so that we can access it from the
4355     // return points.
4356     if (Ins[I].Flags.isSRet()) {
4357       assert(!FuncInfo->getSRetReturnReg() &&
4358              "SRet return has already been set");
4359       MVT PtrTy = getPointerTy(DAG.getDataLayout());
4360       Register Reg =
4361           MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4362       FuncInfo->setSRetReturnReg(Reg);
4363       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
4364       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
4365       break;
4366     }
4367   }
4368
4369   unsigned StackSize = CCInfo.getStackSize();
4370   // Align stack specially for tail calls.
4371   if (shouldGuaranteeTCO(CallConv,
4372                          MF.getTarget().Options.GuaranteedTailCallOpt))
4373     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
4374
4375   if (IsVarArg)
4376     VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
4377         .lowerVarArgsParameters(Chain, StackSize);
4378
4379   // Some CCs need callee pop.
4380   if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
4381                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
4382     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
4383   } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
4384     // X86 interrupts must pop the error code (and the alignment padding) if
4385     // present.
4386     FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
4387   } else {
4388     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
4389     // If this is an sret function, the return should pop the hidden pointer.
4390     if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
4391       FuncInfo->setBytesToPopOnReturn(4);
4392   }
4393
4394   if (!Is64Bit) {
4395     // RegSaveFrameIndex is X86-64 only.
4396     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4397   }
4398
4399   FuncInfo->setArgumentStackSize(StackSize);
4400
4401   if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4402     EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4403     if (Personality == EHPersonality::CoreCLR) {
4404       assert(Is64Bit);
4405       // TODO: Add a mechanism to frame lowering that will allow us to indicate
4406       // that we'd prefer this slot be allocated towards the bottom of the frame
4407       // (i.e. near the stack pointer after allocating the frame).  Every
4408       // funclet needs a copy of this slot in its (mostly empty) frame, and the
4409       // offset from the bottom of this and each funclet's frame must be the
4410       // same, so the size of funclets' (mostly empty) frames is dictated by
4411       // how far this slot is from the bottom (since they allocate just enough
4412       // space to accommodate holding this slot at the correct offset).
4413       int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4414       EHInfo->PSPSymFrameIdx = PSPSymFI;
4415     }
4416   }
4417
4418   if (shouldDisableArgRegFromCSR(CallConv) ||
4419       F.hasFnAttribute("no_caller_saved_registers")) {
4420     MachineRegisterInfo &MRI = MF.getRegInfo();
4421     for (std::pair<Register, Register> Pair : MRI.liveins())
4422       MRI.disableCalleeSavedRegister(Pair.first);
4423   }
4424
4425   return Chain;
4426 }
4427
4428 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4429                                             SDValue Arg, const SDLoc &dl,
4430                                             SelectionDAG &DAG,
4431                                             const CCValAssign &VA,
4432                                             ISD::ArgFlagsTy Flags,
4433                                             bool isByVal) const {
4434   unsigned LocMemOffset = VA.getLocMemOffset();
4435   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4436   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4437                        StackPtr, PtrOff);
4438   if (isByVal)
4439     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4440
4441   MaybeAlign Alignment;
4442   if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
4443       Arg.getSimpleValueType() != MVT::f80)
4444     Alignment = MaybeAlign(4);
4445   return DAG.getStore(
4446       Chain, dl, Arg, PtrOff,
4447       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
4448       Alignment);
4449 }
4450
4451 /// Emit a load of return address if tail call
4452 /// optimization is performed and it is required.
4453 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4454     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4455     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4456   // Adjust the Return address stack slot.
4457   EVT VT = getPointerTy(DAG.getDataLayout());
4458   OutRetAddr = getReturnAddressFrameIndex(DAG);
4459
4460   // Load the "old" Return address.
4461   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4462   return SDValue(OutRetAddr.getNode(), 1);
4463 }
4464
4465 /// Emit a store of the return address if tail call
4466 /// optimization is performed and it is required (FPDiff!=0).
4467 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4468                                         SDValue Chain, SDValue RetAddrFrIdx,
4469                                         EVT PtrVT, unsigned SlotSize,
4470                                         int FPDiff, const SDLoc &dl) {
4471   // Store the return address to the appropriate stack slot.
4472   if (!FPDiff) return Chain;
4473   // Calculate the new stack slot for the return address.
4474   int NewReturnAddrFI =
4475     MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4476                                          false);
4477   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4478   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4479                        MachinePointerInfo::getFixedStack(
4480                            DAG.getMachineFunction(), NewReturnAddrFI));
4481   return Chain;
4482 }
4483
4484 /// Returns a vector_shuffle mask for an movs{s|d}, movd
4485 /// operation of specified width.
4486 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4487                        SDValue V2) {
4488   unsigned NumElems = VT.getVectorNumElements();
4489   SmallVector<int, 8> Mask;
4490   Mask.push_back(NumElems);
4491   for (unsigned i = 1; i != NumElems; ++i)
4492     Mask.push_back(i);
4493   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4494 }
4495
/// Lower the outgoing call described by \p CLI into SelectionDAG nodes.
///
/// In order, the body: decides whether the call is emitted as a tail call
/// (sibcall, guaranteed TCO, or musttail); assigns argument locations with
/// CC_X86; opens the call sequence; lowers each argument into a register copy
/// or a stack store; and builds the operand list (chain, callee, FPDiff for
/// tail calls, argument registers, register mask, glue) for the call node.
///
/// Tail calls return the X86ISD::TC_RETURN node directly. All other calls emit
/// X86ISD::CALL, NT_CALL or CALL_RVMARKER, then CALLSEQ_END (guarded by
/// !IsSibcall), and return the result of LowerCallResult, which also receives
/// \p InVals.
///
/// CLI.IsTailCall is bound by reference, so it is cleared here whenever the
/// call turns out not to be tail-callable.
4496 SDValue
4497 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4498                              SmallVectorImpl<SDValue> &InVals) const {
  // Unpack the call description. isTailCall is a reference into CLI, so the
  // eligibility decisions below are written back to CLI.IsTailCall; Chain and
  // Callee are local copies.
4499   SelectionDAG &DAG                     = CLI.DAG;
4500   SDLoc &dl                             = CLI.DL;
4501   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4502   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
4503   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
4504   SDValue Chain                         = CLI.Chain;
4505   SDValue Callee                        = CLI.Callee;
4506   CallingConv::ID CallConv              = CLI.CallConv;
4507   bool &isTailCall                      = CLI.IsTailCall;
4508   bool isVarArg                         = CLI.IsVarArg;
4509   const auto *CB                        = CLI.CB;
4510
4511   MachineFunction &MF = DAG.getMachineFunction();
4512   bool Is64Bit        = Subtarget.is64Bit();
4513   bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
4514   bool IsSibcall      = false;
4515   bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4516       CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4517   bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
4518   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
  // Call-site attributes and module flags that are consulted further down
  // when choosing the register mask and the call opcode.
4519   bool HasNCSR = (CB && isa<CallInst>(CB) &&
4520                   CB->hasFnAttr("no_caller_saved_registers"));
4521   bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4522   bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4523   bool IsCFICall = IsIndirectCall && CLI.CFIType;
4524   const Module *M = MF.getMMI().getModule();
4525   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4526
  // Register/argument-index pairs handed to DAG.addCallSiteInfo once the call
  // node exists; only populated when Options.EmitCallSiteInfo is set.
4527   MachineFunction::CallSiteInfo CSInfo;
4528   if (CallConv == CallingConv::X86_INTR)
4529     report_fatal_error("X86 interrupts may not be called directly");
4530
4531   bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4532   if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4533     // If we are using a GOT, disable tail calls to external symbols with
4534     // default visibility. Tail calling such a symbol requires using a GOT
4535     // relocation, which forces early binding of the symbol. This breaks code
4536     // that requires lazy function symbol resolution. Using musttail or
4537     // GuaranteedTailCallOpt will override this.
4538     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4539     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4540                G->getGlobal()->hasDefaultVisibility()))
4541       isTailCall = false;
4542   }
4543
4544   if (isTailCall && !IsMustTail) {
4545     // Check if it's really possible to do a tail call.
4546     isTailCall = IsEligibleForTailCallOptimization(
4547         Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
4548         Ins, DAG);
4549
4550     // Sibcalls are automatically detected tailcalls which do not require
4551     // ABI changes.
4552     if (!IsGuaranteeTCO && isTailCall)
4553       IsSibcall = true;
4554
4555     if (isTailCall)
4556       ++NumTailCalls;
4557   }
4558
4559   if (IsMustTail && !isTailCall)
4560     report_fatal_error("failed to perform tail call elimination on a call "
4561                        "site marked musttail");
4562
4563   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4564          "Var args not supported with calling convention fastcc, ghc or hipe");
4565
4566   // Analyze operands of the call, assigning locations to each operand.
4567   SmallVector<CCValAssign, 16> ArgLocs;
4568   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4569
4570   // Allocate shadow area for Win64.
4571   if (IsWin64)
4572     CCInfo.AllocateStack(32, Align(8));
4573
4574   CCInfo.AnalyzeArguments(Outs, CC_X86);
4575
4576   // In vectorcall calling convention a second pass is required for the HVA
4577   // types.
4578   if (CallingConv::X86_VectorCall == CallConv) {
4579     CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4580   }
4581
4582   // Get a count of how many bytes are to be pushed on the stack.
4583   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4584   if (IsSibcall)
4585     // This is a sibcall. The memory operands are available in caller's
4586     // own caller's stack.
4587     NumBytes = 0;
4588   else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4589     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4590
  // FPDiff is the caller's bytes-to-pop-on-return minus this call's NumBytes.
  // It stays zero unless this is a tail call under guaranteed TCO.
4591   int FPDiff = 0;
4592   if (isTailCall &&
4593       shouldGuaranteeTCO(CallConv,
4594                          MF.getTarget().Options.GuaranteedTailCallOpt)) {
4595     // Lower arguments at fp - stackoffset + fpdiff.
4596     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4597
4598     FPDiff = NumBytesCallerPushed - NumBytes;
4599
4600     // Set the delta of movement of the returnaddr stackslot.
4601     // But only set if delta is greater than previous delta.
    // (The check below replaces the recorded delta only when FPDiff is lower
    // than the value currently stored.)
4602     if (FPDiff < X86Info->getTCReturnAddrDelta())
4603       X86Info->setTCReturnAddrDelta(FPDiff);
4604   }
4605
  // NumBytesToPush is what CALLSEQ_START reserves and is zeroed below for
  // inalloca/preallocated calls; NumBytesToPop is handed to CALLSEQ_END.
4606   unsigned NumBytesToPush = NumBytes;
4607   unsigned NumBytesToPop = NumBytes;
4608
4609   // If we have an inalloca argument, all stack space has already been allocated
4610   // for us and is right at the top of the stack.  We don't support multiple
4611   // arguments passed in memory when using inalloca.
4612   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4613     NumBytesToPush = 0;
4614     if (!ArgLocs.back().isMemLoc())
4615       report_fatal_error("cannot use inalloca attribute on a register "
4616                          "parameter");
4617     if (ArgLocs.back().getLocMemOffset() != 0)
4618       report_fatal_error("any parameter with the inalloca attribute must be "
4619                          "the only memory argument");
4620   } else if (CLI.IsPreallocated) {
4621     assert(ArgLocs.back().isMemLoc() &&
4622            "cannot use preallocated attribute on a register "
4623            "parameter");
    // Record the stack offset of every preallocated parameter, plus the total
    // size, under this call site's preallocated id.
4624     SmallVector<size_t, 4> PreallocatedOffsets;
4625     for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4626       if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4627         PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4628       }
4629     }
4630     auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4631     size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4632     MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4633     MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4634     NumBytesToPush = 0;
4635   }
4636
  // Sibcalls and musttail calls do not open a call sequence.
4637   if (!IsSibcall && !IsMustTail)
4638     Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4639                                  NumBytes - NumBytesToPush, dl);
4640
4641   SDValue RetAddrFrIdx;
4642   // Load return address for tail calls.
4643   if (isTailCall && FPDiff)
4644     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4645                                     Is64Bit, FPDiff, dl);
4646
4647   SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4648   SmallVector<SDValue, 8> MemOpChains;
4649   SDValue StackPtr;
4650
4651   // The next loop assumes that the locations are in the same order of the
4652   // input arguments.
4653   assert(isSortedByValueNo(ArgLocs) &&
4654          "Argument Location list must be sorted before lowering");
4655
4656   // Walk the register/memloc assignments, inserting copies/loads.  In the case
4657   // of tail call optimization arguments are handled later.
4658   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4659   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4660        ++I, ++OutIndex) {
4661     assert(OutIndex < Outs.size() && "Invalid Out index");
4662     // Skip inalloca/preallocated arguments, they have already been written.
4663     ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4664     if (Flags.isInAlloca() || Flags.isPreallocated())
4665       continue;
4666
4667     CCValAssign &VA = ArgLocs[I];
4668     EVT RegVT = VA.getLocVT();
4669     SDValue Arg = OutVals[OutIndex];
4670     bool isByVal = Flags.isByVal();
4671
4672     // Promote the value if needed.
4673     switch (VA.getLocInfo()) {
4674     default: llvm_unreachable("Unknown loc info!");
4675     case CCValAssign::Full: break;
4676     case CCValAssign::SExt:
4677       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4678       break;
4679     case CCValAssign::ZExt:
4680       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4681       break;
4682     case CCValAssign::AExt:
4683       if (Arg.getValueType().isVector() &&
4684           Arg.getValueType().getVectorElementType() == MVT::i1)
4685         Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4686       else if (RegVT.is128BitVector()) {
4687         // Special case: passing MMX values in XMM registers.
4688         Arg = DAG.getBitcast(MVT::i64, Arg);
4689         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4690         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4691       } else
4692         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4693       break;
4694     case CCValAssign::BCvt:
4695       Arg = DAG.getBitcast(RegVT, Arg);
4696       break;
4697     case CCValAssign::Indirect: {
4698       if (isByVal) {
4699         // Memcpy the argument to a temporary stack slot to prevent
4700         // the caller from seeing any modifications the callee may make
4701         // as guaranteed by the `byval` attribute.
4702         int FrameIdx = MF.getFrameInfo().CreateStackObject(
4703             Flags.getByValSize(),
4704             std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4705         SDValue StackSlot =
4706             DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4707         Chain =
4708             CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4709         // From now on treat this as a regular pointer
4710         Arg = StackSlot;
4711         isByVal = false;
4712       } else {
4713         // Store the argument.
4714         SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4715         int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4716         Chain = DAG.getStore(
4717             Chain, dl, Arg, SpillSlot,
4718             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4719         Arg = SpillSlot;
4720       }
4721       break;
4722     }
4723     }
4724
4725     if (VA.needsCustom()) {
4726       assert(VA.getValVT() == MVT::v64i1 &&
4727              "Currently the only custom case is when we split v64i1 to 2 regs");
4728       // Split v64i1 value into two registers
      // The ++I consumes the second location of this same argument, so
      // OutIndex still advances by one for the pair.
4729       Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4730     } else if (VA.isRegLoc()) {
4731       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4732       const TargetOptions &Options = DAG.getTarget().Options;
4733       if (Options.EmitCallSiteInfo)
4734         CSInfo.emplace_back(VA.getLocReg(), I);
4735       if (isVarArg && IsWin64) {
4736         // Win64 ABI requires argument XMM reg to be copied to the corresponding
4737         // shadow reg if callee is a varargs function.
4738         Register ShadowReg;
4739         switch (VA.getLocReg()) {
4740         case X86::XMM0: ShadowReg = X86::RCX; break;
4741         case X86::XMM1: ShadowReg = X86::RDX; break;
4742         case X86::XMM2: ShadowReg = X86::R8; break;
4743         case X86::XMM3: ShadowReg = X86::R9; break;
4744         }
4745         if (ShadowReg)
4746           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4747       }
4748     } else if (!IsSibcall && (!isTailCall || isByVal)) {
      // Stack argument. For a non-sibcall tail call only byval arguments are
      // written here; the other stack arguments are stored by the tail-call
      // loop further down.
4749       assert(VA.isMemLoc());
4750       if (!StackPtr.getNode())
4751         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4752                                       getPointerTy(DAG.getDataLayout()));
4753       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4754                                              dl, DAG, VA, Flags, isByVal));
4755     }
4756   }
4757
4758   if (!MemOpChains.empty())
4759     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4760
4761   if (Subtarget.isPICStyleGOT()) {
4762     // ELF / PIC requires GOT in the EBX register before function calls via PLT
4763     // GOT pointer (except regcall).
4764     if (!isTailCall) {
4765       // Indirect call with RegCall calling convention may use up all the
4766       // general registers, so it is not suitable to bind EBX register for
4767       // GOT address, just let register allocator handle it.
4768       if (CallConv != CallingConv::X86_RegCall)
4769         RegsToPass.push_back(std::make_pair(
4770           Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4771                                           getPointerTy(DAG.getDataLayout()))));
4772     } else {
4773       // If we are tail calling and generating PIC/GOT style code load the
4774       // address of the callee into ECX. The value in ecx is used as target of
4775       // the tail jump. This is done to circumvent the ebx/callee-saved problem
4776       // for tail calls on PIC/GOT architectures. Normally we would just put the
4777       // address of GOT into ebx and then call target@PLT. But for tail calls
4778       // ebx would be restored (since ebx is callee saved) before jumping to the
4779       // target@PLT.
4780
4781       // Note: The actual moving to ECX is done further down.
4782       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4783       if (G && !G->getGlobal()->hasLocalLinkage() &&
4784           G->getGlobal()->hasDefaultVisibility())
4785         Callee = LowerGlobalAddress(Callee, DAG);
4786       else if (isa<ExternalSymbolSDNode>(Callee))
4787         Callee = LowerExternalSymbol(Callee, DAG);
4788     }
4789   }
4790
4791   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
4792       (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
4793     // From AMD64 ABI document:
4794     // For calls that may call functions that use varargs or stdargs
4795     // (prototype-less calls or calls to functions containing ellipsis (...) in
4796     // the declaration) %al is used as hidden argument to specify the number
4797     // of SSE registers used. The contents of %al do not need to match exactly
4798     // the number of registers, but must be an upper bound on the number of SSE
4799     // registers used and is in the range 0 - 8 inclusive.
4800
4801     // Count the number of XMM registers allocated.
4802     static const MCPhysReg XMMArgRegs[] = {
4803       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4804       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4805     };
4806     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4807     assert((Subtarget.hasSSE1() || !NumXMMRegs)
4808            && "SSE registers cannot be used when SSE is disabled");
4809     RegsToPass.push_back(std::make_pair(Register(X86::AL),
4810                                         DAG.getConstant(NumXMMRegs, dl,
4811                                                         MVT::i8)));
4812   }
4813
  // A variadic musttail call re-passes every register recorded in
  // getForwardedMustTailRegParms(), copying each from its virtual register
  // back into its physical register.
4814   if (isVarArg && IsMustTail) {
4815     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4816     for (const auto &F : Forwards) {
4817       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4818       RegsToPass.push_back(std::make_pair(F.PReg, Val));
4819     }
4820   }
4821
4822   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
4823   // don't need this because the eligibility check rejects calls that require
4824   // shuffling arguments passed in memory.
4825   if (!IsSibcall && isTailCall) {
4826     // Force all the incoming stack arguments to be loaded from the stack
4827     // before any new outgoing arguments are stored to the stack, because the
4828     // outgoing stack slots may alias the incoming argument stack slots, and
4829     // the alias isn't otherwise explicit. This is slightly more conservative
4830     // than necessary, because it means that each store effectively depends
4831     // on every argument instead of just those arguments it would clobber.
4832     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4833
4834     SmallVector<SDValue, 8> MemOpChains2;
4835     SDValue FIN;
4836     int FI = 0;
4837     for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4838          ++I, ++OutsIndex) {
4839       CCValAssign &VA = ArgLocs[I];
4840
4841       if (VA.isRegLoc()) {
4842         if (VA.needsCustom()) {
4843           assert((CallConv == CallingConv::X86_RegCall) &&
4844                  "Expecting custom case only in regcall calling convention");
4845           // This means that we are in special case where one argument was
4846           // passed through two register locations - Skip the next location
4847           ++I;
4848         }
4849
4850         continue;
4851       }
4852
4853       assert(VA.isMemLoc());
4854       SDValue Arg = OutVals[OutsIndex];
4855       ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4856       // Skip inalloca/preallocated arguments.  They don't require any work.
4857       if (Flags.isInAlloca() || Flags.isPreallocated())
4858         continue;
4859       // Create frame index.
      // The fixed object sits at the argument's assigned offset shifted by
      // FPDiff; its size is the location type rounded up to whole bytes.
4860       int32_t Offset = VA.getLocMemOffset()+FPDiff;
4861       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4862       FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4863       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4864
4865       if (Flags.isByVal()) {
4866         // Copy relative to framepointer.
        // The copy source is StackPtr + the argument's LocMemOffset.
4867         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4868         if (!StackPtr.getNode())
4869           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4870                                         getPointerTy(DAG.getDataLayout()));
4871         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4872                              StackPtr, Source);
4873
4874         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4875                                                          ArgChain,
4876                                                          Flags, DAG, dl));
4877       } else {
4878         // Store relative to framepointer.
4879         MemOpChains2.push_back(DAG.getStore(
4880             ArgChain, dl, Arg, FIN,
4881             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4882       }
4883     }
4884
4885     if (!MemOpChains2.empty())
4886       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4887
4888     // Store the return address to the appropriate stack slot.
4889     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4890                                      getPointerTy(DAG.getDataLayout()),
4891                                      RegInfo->getSlotSize(), FPDiff, dl);
4892   }
4893
4894   // Build a sequence of copy-to-reg nodes chained together with token chain
4895   // and glue operands which copy the outgoing args into registers.
4896   SDValue InGlue;
4897   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4898     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4899                              RegsToPass[i].second, InGlue);
4900     InGlue = Chain.getValue(1);
4901   }
4902
4903   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4904     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4905     // In the 64-bit large code model, we have to make all calls
4906     // through a register, since the call instruction's 32-bit
4907     // pc-relative offset may not be large enough to hold the whole
4908     // address.
4909   } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4910              Callee->getOpcode() == ISD::ExternalSymbol) {
4911     // Lower direct calls to global addresses and external symbols. Setting
4912     // ForCall to true here has the effect of removing WrapperRIP when possible
4913     // to allow direct calls to be selected without first materializing the
4914     // address into a register.
4915     Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4916   } else if (Subtarget.isTarget64BitILP32() &&
4917              Callee.getValueType() == MVT::i32) {
4918     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
4919     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4920   }
4921
4922   // Returns a chain & a glue for retval copy to use.
4923   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4924   SmallVector<SDValue, 8> Ops;
4925
  // A non-sibcall, non-musttail tail call closes its call sequence here,
  // before the TC_RETURN node is built; CALLSEQ_START above was emitted under
  // the matching !IsSibcall && !IsMustTail condition.
4926   if (!IsSibcall && isTailCall && !IsMustTail) {
4927     Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
4928     InGlue = Chain.getValue(1);
4929   }
4930
4931   Ops.push_back(Chain);
4932   Ops.push_back(Callee);
4933
  // Tail calls carry FPDiff as an extra operand right after the callee.
4934   if (isTailCall)
4935     Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4936
4937   // Add argument registers to the end of the list so that they are known live
4938   // into the call.
4939   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4940     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4941                                   RegsToPass[i].second.getValueType()));
4942
4943   // Add a register mask operand representing the call-preserved registers.
4944   const uint32_t *Mask = [&]() {
4945     auto AdaptedCC = CallConv;
4946     // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4947     // use X86_INTR calling convention because it has the same CSR mask
4948     // (same preserved registers).
4949     if (HasNCSR)
4950       AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4951     // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4952     // to use the CSR_NoRegs_RegMask.
4953     if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4954       AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4955     return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4956   }();
4957   assert(Mask && "Missing call preserved mask for calling convention");
4958
4959   // If this is an invoke in a 32-bit function using a funclet-based
4960   // personality, assume the function clobbers all registers. If an exception
4961   // is thrown, the runtime will not restore CSRs.
4962   // FIXME: Model this more precisely so that we can register allocate across
4963   // the normal edge and spill and fill across the exceptional edge.
4964   if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4965     const Function &CallerFn = MF.getFunction();
4966     EHPersonality Pers =
4967         CallerFn.hasPersonalityFn()
4968             ? classifyEHPersonality(CallerFn.getPersonalityFn())
4969             : EHPersonality::Unknown;
4970     if (isFuncletEHPersonality(Pers))
4971       Mask = RegInfo->getNoPreservedMask();
4972   }
4973
4974   // Define a new register mask from the existing mask.
  // RegMask stays null when no per-call mask is needed; it is forwarded to
  // LowerCallResult either way.
4975   uint32_t *RegMask = nullptr;
4976
4977   // In some calling conventions we need to remove the used physical registers
4978   // from the reg mask. Create a new RegMask for such calling conventions.
4979   // RegMask for calling conventions that disable only return registers (e.g.
4980   // preserve_most) will be modified later in LowerCallResult.
4981   bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
4982   if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
4983     const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4984
4985     // Allocate a new Reg Mask and copy Mask.
4986     RegMask = MF.allocateRegMask();
4987     unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4988     memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4989
4990     // Make sure all sub registers of the argument registers are reset
4991     // in the RegMask.
4992     if (ShouldDisableArgRegs) {
4993       for (auto const &RegPair : RegsToPass)
4994         for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first))
          // Clear SubReg's bit; each 32-bit mask word covers 32 registers.
4995           RegMask[SubReg / 32] &= ~(1u << (SubReg % 32));
4996     }
4997
4998     // Create the RegMask Operand according to our updated mask.
4999     Ops.push_back(DAG.getRegisterMask(RegMask));
5000   } else {
5001     // Create the RegMask Operand according to the static mask.
5002     Ops.push_back(DAG.getRegisterMask(Mask));
5003   }
5004
5005   if (InGlue.getNode())
5006     Ops.push_back(InGlue);
5007
5008   if (isTailCall) {
5009     // We used to do:
5010     //// If this is the first return lowered for this function, add the regs
5011     //// to the liveout set for the function.
5012     // This isn't right, although it's probably harmless on x86; liveouts
5013     // should be computed from returns not tail calls.  Consider a void
5014     // function making a tail call to a function returning int.
5015     MF.getFrameInfo().setHasTailCall();
5016     SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
5017
5018     if (IsCFICall)
5019       Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
5020
5021     DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
5022     DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
5023     return Ret;
5024   }
5025
  // Pick the call opcode: NT_CALL for an indirect nocf_check call when the
  // "cf-protection-branch" module flag is present, CALL_RVMARKER for calls
  // with an attached ObjC ARC bundle, plain CALL otherwise.
5026   if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
5027     Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
5028   } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
5029     // Calls with a "clang.arc.attachedcall" bundle are special. They should be
5030     // expanded to the call, directly followed by a special marker sequence and
5031     // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
5032     assert(!isTailCall &&
5033            "tail calls cannot be marked with clang.arc.attachedcall");
5034     assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
5035
5036     // Add a target global address for the retainRV/claimRV runtime function
5037     // just before the call target.
5038     Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
5039     auto PtrVT = getPointerTy(DAG.getDataLayout());
5040     auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
5041     Ops.insert(Ops.begin() + 1, GA);
5042     Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
5043   } else {
5044     Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
5045   }
5046
5047   if (IsCFICall)
5048     Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
5049
5050   InGlue = Chain.getValue(1);
5051   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
5052   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
5053
5054   // Save heapallocsite metadata.
5055   if (CLI.CB)
5056     if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
5057       DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
5058
5059   // Create the CALLSEQ_END node.
5060   unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
5061   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
5062                        DAG.getTarget().Options.GuaranteedTailCallOpt))
5063     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
5064   else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
5065     // If this call passes a struct-return pointer, the callee
5066     // pops that struct pointer.
    // NOTE(review): the literal 4 is presumably the size of a 32-bit sret
    // pointer -- confirm against hasCalleePopSRet.
5067     NumBytesForCalleeToPop = 4;
5068
5069   // Returns a glue for retval copy to use.
5070   if (!IsSibcall) {
5071     Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
5072                                InGlue, dl);
5073     InGlue = Chain.getValue(1);
5074   }
5075
5076   // Handle result values, copying them out of physregs into vregs that we
5077   // return.
5078   return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
5079                          InVals, RegMask);
5080 }
5081
5082 //===----------------------------------------------------------------------===//
5083 //                Fast Calling Convention (tail call) implementation
5084 //===----------------------------------------------------------------------===//
5085
5086 //  Like the stdcall convention, the callee cleans up the arguments, except
5087 //  that ECX is reserved for storing the tail called function address. Only 2
5088 //  registers are free for argument passing (inreg). Tail call optimization is
5089 //  performed provided:
5090 //                * tailcallopt is enabled
5091 //                * caller/callee are fastcc
5092 //  On X86_64 architecture with GOT-style position independent code only local
5093 //  (within module) calls are supported at the moment.
5094 //  To keep the stack aligned according to platform abi the function
5095 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
5096 //  of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
5097 //  If a tail-called callee has more arguments than the caller, the caller
5098 //  needs to make sure that there is room to move the RETADDR to. This is
5099 //  achieved by reserving an area the size of the argument delta right after the
5100 //  original RETADDR, but before the saved framepointer or the spilled registers
5101 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
5102 //  stack layout:
5103 //    arg1
5104 //    arg2
5105 //    RETADDR
5106 //    [ new RETADDR
5107 //      move area ]
5108 //    (possible EBP)
5109 //    ESI
5110 //    EDI
5111 //    local1 ..
5112
5113 /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
5114 /// requirement.
5115 unsigned
5116 X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
5117                                                SelectionDAG &DAG) const {
5118   const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
5119   const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
5120   assert(StackSize % SlotSize == 0 &&
5121          "StackSize must be a multiple of SlotSize");
5122   return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
5123 }
5124
5125 /// Return true if the given stack call argument is already available in the
5126 /// same position (relatively) of the caller's incoming argument stack.
5127 static
5128 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
5129                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
5130                          const X86InstrInfo *TII, const CCValAssign &VA) {
5131   unsigned Bytes = Arg.getValueSizeInBits() / 8;
5132
5133   for (;;) {
5134     // Look through nodes that don't alter the bits of the incoming value.
5135     unsigned Op = Arg.getOpcode();
5136     if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
5137       Arg = Arg.getOperand(0);
5138       continue;
5139     }
5140     if (Op == ISD::TRUNCATE) {
5141       const SDValue &TruncInput = Arg.getOperand(0);
5142       if (TruncInput.getOpcode() == ISD::AssertZext &&
5143           cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
5144               Arg.getValueType()) {
5145         Arg = TruncInput.getOperand(0);
5146         continue;
5147       }
5148     }
5149     break;
5150   }
5151
5152   int FI = INT_MAX;
5153   if (Arg.getOpcode() == ISD::CopyFromReg) {
5154     Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
5155     if (!VR.isVirtual())
5156       return false;
5157     MachineInstr *Def = MRI->getVRegDef(VR);
5158     if (!Def)
5159       return false;
5160     if (!Flags.isByVal()) {
5161       if (!TII->isLoadFromStackSlot(*Def, FI))
5162         return false;
5163     } else {
5164       unsigned Opcode = Def->getOpcode();
5165       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
5166            Opcode == X86::LEA64_32r) &&
5167           Def->getOperand(1).isFI()) {
5168         FI = Def->getOperand(1).getIndex();
5169         Bytes = Flags.getByValSize();
5170       } else
5171         return false;
5172     }
5173   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
5174     if (Flags.isByVal())
5175       // ByVal argument is passed in as a pointer but it's now being
5176       // dereferenced. e.g.
5177       // define @foo(%struct.X* %A) {
5178       //   tail call @bar(%struct.X* byval %A)
5179       // }
5180       return false;
5181     SDValue Ptr = Ld->getBasePtr();
5182     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
5183     if (!FINode)
5184       return false;
5185     FI = FINode->getIndex();
5186   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
5187     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
5188     FI = FINode->getIndex();
5189     Bytes = Flags.getByValSize();
5190   } else
5191     return false;
5192
5193   assert(FI != INT_MAX);
5194   if (!MFI.isFixedObjectIndex(FI))
5195     return false;
5196
5197   if (Offset != MFI.getObjectOffset(FI))
5198     return false;
5199
5200   // If this is not byval, check that the argument stack object is immutable.
5201   // inalloca and argument copy elision can create mutable argument stack
5202   // objects. Byval objects can be mutated, but a byval call intends to pass the
5203   // mutated memory.
5204   if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
5205     return false;
5206
5207   if (VA.getLocVT().getFixedSizeInBits() >
5208       Arg.getValueSizeInBits().getFixedValue()) {
5209     // If the argument location is wider than the argument type, check that any
5210     // extension flags match.
5211     if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
5212         Flags.isSExt() != MFI.isObjectSExt(FI)) {
5213       return false;
5214     }
5215   }
5216
5217   return Bytes == MFI.getObjectSize(FI);
5218 }
5219
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
///
/// The function is a sequence of independent legality checks; the first one
/// that fails rejects the tail call.
///
/// \param Callee          Call target; inspected to see whether it is a
///                        global address / external symbol.
/// \param CalleeCC        Calling convention of the callee.
/// \param IsCalleePopSRet True if the callee pops its sret argument.
/// \param isVarArg        True if the call is variadic.
/// \param RetTy           Return type of the callee.
/// \param Outs            Outgoing argument descriptions (flags, types).
/// \param OutVals         Outgoing argument values, parallel to \p Outs.
/// \param Ins             Descriptions of the values returned by the call.
/// \param DAG             Selection DAG providing the machine function,
///                        target options and LLVM context.
/// \returns true if the call may be emitted as a tail call.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
    bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  // Some calling conventions can never be tail called.
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();

  // If the function return type is x86_fp80 and the callee return type is not,
  // then the FP_EXTEND of the call result is not a nop. It's not safe to
  // perform a tailcall optimization here.
  if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
    return false;

  CallingConv::ID CallerCC = CallerF.getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;
  bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
  bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
  // Guaranteed TCO is requested either globally through the target options or
  // implicitly by the 'tail' / 'swifttail' calling conventions.
  bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
      CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;

  // Win64 functions have extra shadow space for argument homing. Don't do the
  // sibcall if the caller and callee have mismatched expectations for this
  // space.
  if (IsCalleeWin64 != IsCallerWin64)
    return false;

  // In guaranteed-TCO mode the decision depends only on the calling
  // conventions: the callee's convention must support guaranteed TCO and must
  // be the same as the caller's. None of the sibcall checks below apply.
  if (IsGuaranteeTCO) {
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  if (RegInfo->hasStackRealignment(MF))
    return false;

  // Also avoid sibcall optimization if we're an sret return fn and the callee
  // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
  // insufficient.
  if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
    // For a compatible tail call the callee must return our sret pointer. So it
    // needs to be (a) an sret function itself and (b) we pass our sret as its
    // sret. Condition #b is harder to determine, so conservatively reject.
    return false;
  } else if (IsCalleePopSRet)
    // The callee pops an sret, so we cannot tail-call, as our caller doesn't
    // expect that.
    return false;

  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
  LLVMContext &C = *DAG.getContext();
  if (isVarArg && !Outs.empty()) {
    // Optimizing for varargs on Win64 is unlikely to be safe without
    // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
      return false;

    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }

  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack.  Therefore, if it's not used by the call it is not safe to optimize
  // this into a sibcall.
  bool Unused = false;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (!Ins[i].Used) {
      Unused = true;
      break;
    }
  }
  // Only when at least one result is unused do we need to find out where the
  // results live.
  if (Unused) {
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
      CCValAssign &VA = RVLocs[i];
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  RetCC_X86, RetCC_X86))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  // When the conventions are identical the masks are trivially compatible.
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // Number of bytes of stack the outgoing arguments occupy; stays zero when
  // the call has no arguments. Used by the callee-pop check at the end.
  unsigned StackArgsSize = 0;

  // The argument checks below are only needed if the call passes arguments.
  if (!Outs.empty()) {
    // Compute where the callee expects each outgoing argument.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

    // Allocate shadow area for Win64
    if (IsCalleeWin64)
      CCInfo.AllocateStack(32, Align(8));

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    StackArgsSize = CCInfo.getStackSize();

    if (CCInfo.getStackSize()) {
      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII = Subtarget.getInstrInfo();
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Indirectly passed arguments are never accepted here.
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        // A stack-passed argument must already sit in the matching incoming
        // stack slot of the caller.
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII, VA))
            return false;
        }
      }
    }

    bool PositionIndependent = isPositionIndependent();
    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
                                  !isa<ExternalSymbolSDNode>(Callee)) ||
                                 PositionIndependent)) {
      unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
      unsigned MaxInRegs = PositionIndependent ? 2 : 3;

      // Reject as soon as the number of arguments assigned to EAX/EDX/ECX
      // reaches MaxInRegs.
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (!VA.isRegLoc())
          continue;
        Register Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }

    // Note: this reference intentionally has the same name as the pointer
    // declared in the nested scope above; the two never coexist.
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }

  bool CalleeWillPop =
      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt);

  if (unsigned BytesToPop =
          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
    // If we have bytes to pop, the callee must pop them.
    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
    if (!CalleePopMatches)
      return false;
  } else if (CalleeWillPop && StackArgsSize > 0) {
    // If we don't have bytes to pop, make sure the callee doesn't pop any.
    return false;
  }

  return true;
}
5421
/// Create the FastISel object for this target by forwarding both arguments to
/// X86::createFastISel.
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  return X86::createFastISel(funcInfo, libInfo);
}
5427
5428 //===----------------------------------------------------------------------===//
5429 //                           Other Lowering Hooks
5430 //===----------------------------------------------------------------------===//
5431
5432 bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
5433                       bool AssumeSingleUse) {
5434   if (!AssumeSingleUse && !Op.hasOneUse())
5435     return false;
5436   if (!ISD::isNormalLoad(Op.getNode()))
5437     return false;
5438
5439   // If this is an unaligned vector, make sure the target supports folding it.
5440   auto *Ld = cast<LoadSDNode>(Op.getNode());
5441   if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
5442       Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
5443     return false;
5444
5445   // TODO: If this is a non-temporal load and the target has an instruction
5446   //       for it, it should not be folded. See "useNonTemporalLoad()".
5447
5448   return true;
5449 }
5450
5451 bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5452                                           const X86Subtarget &Subtarget,
5453                                           bool AssumeSingleUse) {
5454   assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
5455   if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
5456     return false;
5457
5458   // We can not replace a wide volatile load with a broadcast-from-memory,
5459   // because that would narrow the load, which isn't legal for volatiles.
5460   auto *Ld = cast<LoadSDNode>(Op.getNode());
5461   return !Ld->isVolatile() ||
5462          Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5463 }
5464
5465 bool X86::mayFoldIntoStore(SDValue Op) {
5466   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5467 }
5468
5469 bool X86::mayFoldIntoZeroExtend(SDValue Op) {
5470   if (Op.hasOneUse()) {
5471     unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5472     return (ISD::ZERO_EXTEND == Opcode);
5473   }
5474   return false;
5475 }
5476
5477 static bool isTargetShuffle(unsigned Opcode) {
5478   switch(Opcode) {
5479   default: return false;
5480   case X86ISD::BLENDI:
5481   case X86ISD::PSHUFB:
5482   case X86ISD::PSHUFD:
5483   case X86ISD::PSHUFHW:
5484   case X86ISD::PSHUFLW:
5485   case X86ISD::SHUFP:
5486   case X86ISD::INSERTPS:
5487   case X86ISD::EXTRQI:
5488   case X86ISD::INSERTQI:
5489   case X86ISD::VALIGN:
5490   case X86ISD::PALIGNR:
5491   case X86ISD::VSHLDQ:
5492   case X86ISD::VSRLDQ:
5493   case X86ISD::MOVLHPS:
5494   case X86ISD::MOVHLPS:
5495   case X86ISD::MOVSHDUP:
5496   case X86ISD::MOVSLDUP:
5497   case X86ISD::MOVDDUP:
5498   case X86ISD::MOVSS:
5499   case X86ISD::MOVSD:
5500   case X86ISD::MOVSH:
5501   case X86ISD::UNPCKL:
5502   case X86ISD::UNPCKH:
5503   case X86ISD::VBROADCAST:
5504   case X86ISD::VPERMILPI:
5505   case X86ISD::VPERMILPV:
5506   case X86ISD::VPERM2X128:
5507   case X86ISD::SHUF128:
5508   case X86ISD::VPERMIL2:
5509   case X86ISD::VPERMI:
5510   case X86ISD::VPPERM:
5511   case X86ISD::VPERMV:
5512   case X86ISD::VPERMV3:
5513   case X86ISD::VZEXT_MOVL:
5514     return true;
5515   }
5516 }
5517
5518 static bool isTargetShuffleVariableMask(unsigned Opcode) {
5519   switch (Opcode) {
5520   default: return false;
5521   // Target Shuffles.
5522   case X86ISD::PSHUFB:
5523   case X86ISD::VPERMILPV:
5524   case X86ISD::VPERMIL2:
5525   case X86ISD::VPPERM:
5526   case X86ISD::VPERMV:
5527   case X86ISD::VPERMV3:
5528     return true;
5529   // 'Faux' Target Shuffles.
5530   case ISD::OR:
5531   case ISD::AND:
5532   case X86ISD::ANDNP:
5533     return true;
5534   }
5535 }
5536
5537 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5538   MachineFunction &MF = DAG.getMachineFunction();
5539   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5540   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5541   int ReturnAddrIndex = FuncInfo->getRAIndex();
5542
5543   if (ReturnAddrIndex == 0) {
5544     // Set up a frame object for the return address.
5545     unsigned SlotSize = RegInfo->getSlotSize();
5546     ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5547                                                           -(int64_t)SlotSize,
5548                                                           false);
5549     FuncInfo->setRAIndex(ReturnAddrIndex);
5550   }
5551
5552   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5553 }
5554
5555 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5556                                        bool hasSymbolicDisplacement) {
5557   // Offset should fit into 32 bit immediate field.
5558   if (!isInt<32>(Offset))
5559     return false;
5560
5561   // If we don't have a symbolic displacement - we don't have any extra
5562   // restrictions.
5563   if (!hasSymbolicDisplacement)
5564     return true;
5565
5566   // FIXME: Some tweaks might be needed for medium code model.
5567   if (M != CodeModel::Small && M != CodeModel::Kernel)
5568     return false;
5569
5570   // For small code model we assume that latest object is 16MB before end of 31
5571   // bits boundary. We may also accept pretty large negative constants knowing
5572   // that all objects are in the positive half of address space.
5573   if (M == CodeModel::Small && Offset < 16*1024*1024)
5574     return true;
5575
5576   // For kernel code model we know that all object resist in the negative half
5577   // of 32bits address space. We may not accept negative offsets, since they may
5578   // be just off and we may accept pretty large positive ones.
5579   if (M == CodeModel::Kernel && Offset >= 0)
5580     return true;
5581
5582   return false;
5583 }
5584
5585 /// Determines whether the callee is required to pop its own arguments.
5586 /// Callee pop is necessary to support tail calls.
5587 bool X86::isCalleePop(CallingConv::ID CallingConv,
5588                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5589   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5590   // can guarantee TCO.
5591   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5592     return true;
5593
5594   switch (CallingConv) {
5595   default:
5596     return false;
5597   case CallingConv::X86_StdCall:
5598   case CallingConv::X86_FastCall:
5599   case CallingConv::X86_ThisCall:
5600   case CallingConv::X86_VectorCall:
5601     return !is64Bit;
5602   }
5603 }
5604
5605 /// Return true if the condition is an signed comparison operation.
5606 static bool isX86CCSigned(unsigned X86CC) {
5607   switch (X86CC) {
5608   default:
5609     llvm_unreachable("Invalid integer condition!");
5610   case X86::COND_E:
5611   case X86::COND_NE:
5612   case X86::COND_B:
5613   case X86::COND_A:
5614   case X86::COND_BE:
5615   case X86::COND_AE:
5616     return false;
5617   case X86::COND_G:
5618   case X86::COND_GE:
5619   case X86::COND_L:
5620   case X86::COND_LE:
5621     return true;
5622   }
5623 }
5624
5625 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5626   switch (SetCCOpcode) {
5627   default: llvm_unreachable("Invalid integer condition!");
5628   case ISD::SETEQ:  return X86::COND_E;
5629   case ISD::SETGT:  return X86::COND_G;
5630   case ISD::SETGE:  return X86::COND_GE;
5631   case ISD::SETLT:  return X86::COND_L;
5632   case ISD::SETLE:  return X86::COND_LE;
5633   case ISD::SETNE:  return X86::COND_NE;
5634   case ISD::SETULT: return X86::COND_B;
5635   case ISD::SETUGT: return X86::COND_A;
5636   case ISD::SETULE: return X86::COND_BE;
5637   case ISD::SETUGE: return X86::COND_AE;
5638   }
5639 }
5640
5641 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
5642 /// condition code, returning the condition code and the LHS/RHS of the
5643 /// comparison to make.
5644 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5645                                     bool isFP, SDValue &LHS, SDValue &RHS,
5646                                     SelectionDAG &DAG) {
5647   if (!isFP) {
5648     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5649       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
5650         // X > -1   -> X == 0, jump !sign.
5651         RHS = DAG.getConstant(0, DL, RHS.getValueType());
5652         return X86::COND_NS;
5653       }
5654       if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
5655         // X < 0   -> X == 0, jump on sign.
5656         return X86::COND_S;
5657       }
5658       if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
5659         // X >= 0   -> X == 0, jump on !sign.
5660         return X86::COND_NS;
5661       }
5662       if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5663         // X < 1   -> X <= 0
5664         RHS = DAG.getConstant(0, DL, RHS.getValueType());
5665         return X86::COND_LE;
5666       }
5667     }
5668
5669     return TranslateIntegerX86CC(SetCCOpcode);
5670   }
5671
5672   // First determine if it is required or is profitable to flip the operands.
5673
5674   // If LHS is a foldable load, but RHS is not, flip the condition.
5675   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5676       !ISD::isNON_EXTLoad(RHS.getNode())) {
5677     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5678     std::swap(LHS, RHS);
5679   }
5680
5681   switch (SetCCOpcode) {
5682   default: break;
5683   case ISD::SETOLT:
5684   case ISD::SETOLE:
5685   case ISD::SETUGT:
5686   case ISD::SETUGE:
5687     std::swap(LHS, RHS);
5688     break;
5689   }
5690
5691   // On a floating point condition, the flags are set as follows:
5692   // ZF  PF  CF   op
5693   //  0 | 0 | 0 | X > Y
5694   //  0 | 0 | 1 | X < Y
5695   //  1 | 0 | 0 | X == Y
5696   //  1 | 1 | 1 | unordered
5697   switch (SetCCOpcode) {
5698   default: llvm_unreachable("Condcode should be pre-legalized away");
5699   case ISD::SETUEQ:
5700   case ISD::SETEQ:   return X86::COND_E;
5701   case ISD::SETOLT:              // flipped
5702   case ISD::SETOGT:
5703   case ISD::SETGT:   return X86::COND_A;
5704   case ISD::SETOLE:              // flipped
5705   case ISD::SETOGE:
5706   case ISD::SETGE:   return X86::COND_AE;
5707   case ISD::SETUGT:              // flipped
5708   case ISD::SETULT:
5709   case ISD::SETLT:   return X86::COND_B;
5710   case ISD::SETUGE:              // flipped
5711   case ISD::SETULE:
5712   case ISD::SETLE:   return X86::COND_BE;
5713   case ISD::SETONE:
5714   case ISD::SETNE:   return X86::COND_NE;
5715   case ISD::SETUO:   return X86::COND_P;
5716   case ISD::SETO:    return X86::COND_NP;
5717   case ISD::SETOEQ:
5718   case ISD::SETUNE:  return X86::COND_INVALID;
5719   }
5720 }
5721
5722 /// Is there a floating point cmov for the specific X86 condition code?
5723 /// Current x86 isa includes the following FP cmov instructions:
5724 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5725 static bool hasFPCMov(unsigned X86CC) {
5726   switch (X86CC) {
5727   default:
5728     return false;
5729   case X86::COND_B:
5730   case X86::COND_BE:
5731   case X86::COND_E:
5732   case X86::COND_P:
5733   case X86::COND_A:
5734   case X86::COND_AE:
5735   case X86::COND_NE:
5736   case X86::COND_NP:
5737     return true;
5738   }
5739 }
5740
5741 static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
5742   return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
5743          VT.is512BitVector();
5744 }
5745
5746 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5747                                            const CallInst &I,
5748                                            MachineFunction &MF,
5749                                            unsigned Intrinsic) const {
5750   Info.flags = MachineMemOperand::MONone;
5751   Info.offset = 0;
5752
5753   const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5754   if (!IntrData) {
5755     switch (Intrinsic) {
5756     case Intrinsic::x86_aesenc128kl:
5757     case Intrinsic::x86_aesdec128kl:
5758       Info.opc = ISD::INTRINSIC_W_CHAIN;
5759       Info.ptrVal = I.getArgOperand(1);
5760       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5761       Info.align = Align(1);
5762       Info.flags |= MachineMemOperand::MOLoad;
5763       return true;
5764     case Intrinsic::x86_aesenc256kl:
5765     case Intrinsic::x86_aesdec256kl:
5766       Info.opc = ISD::INTRINSIC_W_CHAIN;
5767       Info.ptrVal = I.getArgOperand(1);
5768       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5769       Info.align = Align(1);
5770       Info.flags |= MachineMemOperand::MOLoad;
5771       return true;
5772     case Intrinsic::x86_aesencwide128kl:
5773     case Intrinsic::x86_aesdecwide128kl:
5774       Info.opc = ISD::INTRINSIC_W_CHAIN;
5775       Info.ptrVal = I.getArgOperand(0);
5776       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5777       Info.align = Align(1);
5778       Info.flags |= MachineMemOperand::MOLoad;
5779       return true;
5780     case Intrinsic::x86_aesencwide256kl:
5781     case Intrinsic::x86_aesdecwide256kl:
5782       Info.opc = ISD::INTRINSIC_W_CHAIN;
5783       Info.ptrVal = I.getArgOperand(0);
5784       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5785       Info.align = Align(1);
5786       Info.flags |= MachineMemOperand::MOLoad;
5787       return true;
5788     case Intrinsic::x86_cmpccxadd32:
5789     case Intrinsic::x86_cmpccxadd64:
5790     case Intrinsic::x86_atomic_bts:
5791     case Intrinsic::x86_atomic_btc:
5792     case Intrinsic::x86_atomic_btr: {
5793       Info.opc = ISD::INTRINSIC_W_CHAIN;
5794       Info.ptrVal = I.getArgOperand(0);
5795       unsigned Size = I.getType()->getScalarSizeInBits();
5796       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5797       Info.align = Align(Size);
5798       Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5799                     MachineMemOperand::MOVolatile;
5800       return true;
5801     }
5802     case Intrinsic::x86_atomic_bts_rm:
5803     case Intrinsic::x86_atomic_btc_rm:
5804     case Intrinsic::x86_atomic_btr_rm: {
5805       Info.opc = ISD::INTRINSIC_W_CHAIN;
5806       Info.ptrVal = I.getArgOperand(0);
5807       unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5808       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5809       Info.align = Align(Size);
5810       Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5811                     MachineMemOperand::MOVolatile;
5812       return true;
5813     }
5814     case Intrinsic::x86_aadd32:
5815     case Intrinsic::x86_aadd64:
5816     case Intrinsic::x86_aand32:
5817     case Intrinsic::x86_aand64:
5818     case Intrinsic::x86_aor32:
5819     case Intrinsic::x86_aor64:
5820     case Intrinsic::x86_axor32:
5821     case Intrinsic::x86_axor64:
5822     case Intrinsic::x86_atomic_add_cc:
5823     case Intrinsic::x86_atomic_sub_cc:
5824     case Intrinsic::x86_atomic_or_cc:
5825     case Intrinsic::x86_atomic_and_cc:
5826     case Intrinsic::x86_atomic_xor_cc: {
5827       Info.opc = ISD::INTRINSIC_W_CHAIN;
5828       Info.ptrVal = I.getArgOperand(0);
5829       unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5830       Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5831       Info.align = Align(Size);
5832       Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5833                     MachineMemOperand::MOVolatile;
5834       return true;
5835     }
5836     }
5837     return false;
5838   }
5839
5840   switch (IntrData->Type) {
5841   case TRUNCATE_TO_MEM_VI8:
5842   case TRUNCATE_TO_MEM_VI16:
5843   case TRUNCATE_TO_MEM_VI32: {
5844     Info.opc = ISD::INTRINSIC_VOID;
5845     Info.ptrVal = I.getArgOperand(0);
5846     MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
5847     MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5848     if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5849       ScalarVT = MVT::i8;
5850     else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5851       ScalarVT = MVT::i16;
5852     else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5853       ScalarVT = MVT::i32;
5854
5855     Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5856     Info.align = Align(1);
5857     Info.flags |= MachineMemOperand::MOStore;
5858     break;
5859   }
5860   case GATHER:
5861   case GATHER_AVX2: {
5862     Info.opc = ISD::INTRINSIC_W_CHAIN;
5863     Info.ptrVal = nullptr;
5864     MVT DataVT = MVT::getVT(I.getType());
5865     MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5866     unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5867                                 IndexVT.getVectorNumElements());
5868     Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5869     Info.align = Align(1);
5870     Info.flags |= MachineMemOperand::MOLoad;
5871     break;
5872   }
5873   case SCATTER: {
5874     Info.opc = ISD::INTRINSIC_VOID;
5875     Info.ptrVal = nullptr;
5876     MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5877     MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5878     unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5879                                 IndexVT.getVectorNumElements());
5880     Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5881     Info.align = Align(1);
5882     Info.flags |= MachineMemOperand::MOStore;
5883     break;
5884   }
5885   default:
5886     return false;
5887   }
5888
5889   return true;
5890 }
5891
5892 /// Returns true if the target can instruction select the
5893 /// specified FP immediate natively. If false, the legalizer will
5894 /// materialize the FP immediate as a load from a constant pool.
5895 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5896                                      bool ForCodeSize) const {
5897   for (const APFloat &FPImm : LegalFPImmediates)
5898     if (Imm.bitwiseIsEqual(FPImm))
5899       return true;
5900   return false;
5901 }
5902
5903 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5904                                               ISD::LoadExtType ExtTy,
5905                                               EVT NewVT) const {
5906   assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5907
5908   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5909   // relocation target a movq or addq instruction: don't let the load shrink.
5910   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5911   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5912     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5913       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5914
5915   // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5916   // those uses are extracted directly into a store, then the extract + store
5917   // can be store-folded. Therefore, it's probably not worth splitting the load.
5918   EVT VT = Load->getValueType(0);
5919   if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5920     for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5921       // Skip uses of the chain value. Result 0 of the node is the load value.
5922       if (UI.getUse().getResNo() != 0)
5923         continue;
5924
5925       // If this use is not an extract + store, it's probably worth splitting.
5926       if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5927           UI->use_begin()->getOpcode() != ISD::STORE)
5928         return true;
5929     }
5930     // All non-chain uses are extract + store.
5931     return false;
5932   }
5933
5934   return true;
5935 }
5936
5937 /// Returns true if it is beneficial to convert a load of a constant
5938 /// to just the constant itself.
5939 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5940                                                           Type *Ty) const {
5941   assert(Ty->isIntegerTy());
5942
5943   unsigned BitSize = Ty->getPrimitiveSizeInBits();
5944   if (BitSize == 0 || BitSize > 64)
5945     return false;
5946   return true;
5947 }
5948
5949 bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5950   // If we are using XMM registers in the ABI and the condition of the select is
5951   // a floating-point compare and we have blendv or conditional move, then it is
5952   // cheaper to select instead of doing a cross-register move and creating a
5953   // load that depends on the compare result.
5954   bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5955   return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5956 }
5957
5958 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5959   // TODO: It might be a win to ease or lift this restriction, but the generic
5960   // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5961   if (VT.isVector() && Subtarget.hasAVX512())
5962     return false;
5963
5964   return true;
5965 }
5966
5967 bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5968                                                SDValue C) const {
5969   // TODO: We handle scalars using custom code, but generic combining could make
5970   // that unnecessary.
5971   APInt MulC;
5972   if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5973     return false;
5974
5975   // Find the type this will be legalized too. Otherwise we might prematurely
5976   // convert this to shl+add/sub and then still have to type legalize those ops.
5977   // Another choice would be to defer the decision for illegal types until
5978   // after type legalization. But constant splat vectors of i64 can't make it
5979   // through type legalization on 32-bit targets so we would need to special
5980   // case vXi64.
5981   while (getTypeAction(Context, VT) != TypeLegal)
5982     VT = getTypeToTransformTo(Context, VT);
5983
5984   // If vector multiply is legal, assume that's faster than shl + add/sub.
5985   // Multiply is a complex op with higher latency and lower throughput in
5986   // most implementations, sub-vXi32 vector multiplies are always fast,
5987   // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
5988   // is always going to be slow.
5989   unsigned EltSizeInBits = VT.getScalarSizeInBits();
5990   if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
5991       (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
5992     return false;
5993
5994   // shl+add, shl+sub, shl+add+neg
5995   return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5996          (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5997 }
5998
5999 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
6000                                                 unsigned Index) const {
6001   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
6002     return false;
6003
6004   // Mask vectors support all subregister combinations and operations that
6005   // extract half of vector.
6006   if (ResVT.getVectorElementType() == MVT::i1)
6007     return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
6008                           (Index == ResVT.getVectorNumElements()));
6009
6010   return (Index % ResVT.getVectorNumElements()) == 0;
6011 }
6012
6013 bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
6014   unsigned Opc = VecOp.getOpcode();
6015
6016   // Assume target opcodes can't be scalarized.
6017   // TODO - do we have any exceptions?
6018   if (Opc >= ISD::BUILTIN_OP_END)
6019     return false;
6020
6021   // If the vector op is not supported, try to convert to scalar.
6022   EVT VecVT = VecOp.getValueType();
6023   if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
6024     return true;
6025
6026   // If the vector op is supported, but the scalar op is not, the transform may
6027   // not be worthwhile.
6028   EVT ScalarVT = VecVT.getScalarType();
6029   return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
6030 }
6031
6032 bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
6033                                              bool) const {
6034   // TODO: Allow vectors?
6035   if (VT.isVector())
6036     return false;
6037   return VT.isSimple() || !isOperationExpand(Opcode, VT);
6038 }
6039
6040 bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
6041   // Speculate cttz only if we can directly use TZCNT or can promote to i32.
6042   return Subtarget.hasBMI() ||
6043          (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
6044 }
6045
6046 bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
6047   // Speculate ctlz only if we can directly use LZCNT.
6048   return Subtarget.hasLZCNT();
6049 }
6050
6051 bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
6052   // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
6053   // expensive than a straight movsd. On the other hand, it's important to
6054   // shrink long double fp constant since fldt is very slow.
6055   return !Subtarget.hasSSE2() || VT == MVT::f80;
6056 }
6057
6058 bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
6059   return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
6060          (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
6061 }
6062
6063 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
6064                                                 const SelectionDAG &DAG,
6065                                                 const MachineMemOperand &MMO) const {
6066   if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
6067       BitcastVT.getVectorElementType() == MVT::i1)
6068     return false;
6069
6070   if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
6071     return false;
6072
6073   // If both types are legal vectors, it's always ok to convert them.
6074   if (LoadVT.isVector() && BitcastVT.isVector() &&
6075       isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
6076     return true;
6077
6078   return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
6079 }
6080
6081 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
6082                                          const MachineFunction &MF) const {
6083   // Do not merge to float value size (128 bytes) if no implicit
6084   // float attribute is set.
6085   bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
6086
6087   if (NoFloat) {
6088     unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
6089     return (MemVT.getSizeInBits() <= MaxIntSize);
6090   }
6091   // Make sure we don't merge greater than our preferred vector
6092   // width.
6093   if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
6094     return false;
6095
6096   return true;
6097 }
6098
6099 bool X86TargetLowering::isCtlzFast() const {
6100   return Subtarget.hasFastLZCNT();
6101 }
6102
6103 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
6104     const Instruction &AndI) const {
6105   return true;
6106 }
6107
6108 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
6109   EVT VT = Y.getValueType();
6110
6111   if (VT.isVector())
6112     return false;
6113
6114   if (!Subtarget.hasBMI())
6115     return false;
6116
6117   // There are only 32-bit and 64-bit forms for 'andn'.
6118   if (VT != MVT::i32 && VT != MVT::i64)
6119     return false;
6120
6121   return !isa<ConstantSDNode>(Y);
6122 }
6123
6124 bool X86TargetLowering::hasAndNot(SDValue Y) const {
6125   EVT VT = Y.getValueType();
6126
6127   if (!VT.isVector())
6128     return hasAndNotCompare(Y);
6129
6130   // Vector.
6131
6132   if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
6133     return false;
6134
6135   if (VT == MVT::v4i32)
6136     return true;
6137
6138   return Subtarget.hasSSE2();
6139 }
6140
6141 bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
6142   return X.getValueType().isScalarInteger(); // 'bt'
6143 }
6144
6145 bool X86TargetLowering::
6146     shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6147         SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
6148         unsigned OldShiftOpcode, unsigned NewShiftOpcode,
6149         SelectionDAG &DAG) const {
6150   // Does baseline recommend not to perform the fold by default?
6151   if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6152           X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
6153     return false;
6154   // For scalars this transform is always beneficial.
6155   if (X.getValueType().isScalarInteger())
6156     return true;
6157   // If all the shift amounts are identical, then transform is beneficial even
6158   // with rudimentary SSE2 shifts.
6159   if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
6160     return true;
6161   // If we have AVX2 with it's powerful shift operations, then it's also good.
6162   if (Subtarget.hasAVX2())
6163     return true;
6164   // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
6165   return NewShiftOpcode == ISD::SHL;
6166 }
6167
6168 bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
6169   return N->getOpcode() != ISD::FP_EXTEND;
6170 }
6171
6172 bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
6173     const SDNode *N, CombineLevel Level) const {
6174   assert(((N->getOpcode() == ISD::SHL &&
6175            N->getOperand(0).getOpcode() == ISD::SRL) ||
6176           (N->getOpcode() == ISD::SRL &&
6177            N->getOperand(0).getOpcode() == ISD::SHL)) &&
6178          "Expected shift-shift mask");
6179   // TODO: Should we always create i64 masks? Or only folded immediates?
6180   EVT VT = N->getValueType(0);
6181   if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
6182       (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
6183     // Only fold if the shift values are equal - so it folds to AND.
6184     // TODO - we should fold if either is a non-uniform vector but we don't do
6185     // the fold for non-splats yet.
6186     return N->getOperand(1) == N->getOperand(0).getOperand(1);
6187   }
6188   return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
6189 }
6190
6191 bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
6192   EVT VT = Y.getValueType();
6193
6194   // For vectors, we don't have a preference, but we probably want a mask.
6195   if (VT.isVector())
6196     return false;
6197
6198   // 64-bit shifts on 32-bit targets produce really bad bloated code.
6199   if (VT == MVT::i64 && !Subtarget.is64Bit())
6200     return false;
6201
6202   return true;
6203 }
6204
6205 TargetLowering::ShiftLegalizationStrategy
6206 X86TargetLowering::preferredShiftLegalizationStrategy(
6207     SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
6208   if (DAG.getMachineFunction().getFunction().hasMinSize() &&
6209       !Subtarget.isOSWindows())
6210     return ShiftLegalizationStrategy::LowerToLibcall;
6211   return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
6212                                                             ExpansionFactor);
6213 }
6214
6215 bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
6216   // Any legal vector type can be splatted more efficiently than
6217   // loading/spilling from memory.
6218   return isTypeLegal(VT);
6219 }
6220
6221 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
6222   MVT VT = MVT::getIntegerVT(NumBits);
6223   if (isTypeLegal(VT))
6224     return VT;
6225
6226   // PMOVMSKB can handle this.
6227   if (NumBits == 128 && isTypeLegal(MVT::v16i8))
6228     return MVT::v16i8;
6229
6230   // VPMOVMSKB can handle this.
6231   if (NumBits == 256 && isTypeLegal(MVT::v32i8))
6232     return MVT::v32i8;
6233
6234   // TODO: Allow 64-bit type for 32-bit target.
6235   // TODO: 512-bit types should be allowed, but make sure that those
6236   // cases are handled in combineVectorSizedSetCCEquality().
6237
6238   return MVT::INVALID_SIMPLE_VALUE_TYPE;
6239 }
6240
6241 /// Val is the undef sentinel value or equal to the specified value.
6242 static bool isUndefOrEqual(int Val, int CmpVal) {
6243   return ((Val == SM_SentinelUndef) || (Val == CmpVal));
6244 }
6245
6246 /// Return true if every element in Mask is the undef sentinel value or equal to
6247 /// the specified value..
6248 static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
6249   return llvm::all_of(Mask, [CmpVal](int M) {
6250     return (M == SM_SentinelUndef) || (M == CmpVal);
6251   });
6252 }
6253
6254 /// Val is either the undef or zero sentinel value.
6255 static bool isUndefOrZero(int Val) {
6256   return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
6257 }
6258
6259 /// Return true if every element in Mask, beginning from position Pos and ending
6260 /// in Pos+Size is the undef sentinel value.
6261 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
6262   return llvm::all_of(Mask.slice(Pos, Size),
6263                       [](int M) { return M == SM_SentinelUndef; });
6264 }
6265
6266 /// Return true if the mask creates a vector whose lower half is undefined.
6267 static bool isUndefLowerHalf(ArrayRef<int> Mask) {
6268   unsigned NumElts = Mask.size();
6269   return isUndefInRange(Mask, 0, NumElts / 2);
6270 }
6271
6272 /// Return true if the mask creates a vector whose upper half is undefined.
6273 static bool isUndefUpperHalf(ArrayRef<int> Mask) {
6274   unsigned NumElts = Mask.size();
6275   return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
6276 }
6277
/// Return true if Val falls within the half-open range [Low, Hi), i.e.
/// Low <= Val < Hi.
static bool isInRange(int Val, int Low, int Hi) {
  if (Val < Low)
    return false;
  return Val < Hi;
}
6282
6283 /// Return true if the value of any element in Mask falls within the specified
6284 /// range (L, H].
6285 static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
6286   return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
6287 }
6288
6289 /// Return true if the value of any element in Mask is the zero sentinel value.
6290 static bool isAnyZero(ArrayRef<int> Mask) {
6291   return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
6292 }
6293
6294 /// Return true if the value of any element in Mask is the zero or undef
6295 /// sentinel values.
6296 static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
6297   return llvm::any_of(Mask, [](int M) {
6298     return M == SM_SentinelZero || M == SM_SentinelUndef;
6299   });
6300 }
6301
6302 /// Return true if Val is undef or if its value falls within the
6303 /// specified range (L, H].
6304 static bool isUndefOrInRange(int Val, int Low, int Hi) {
6305   return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
6306 }
6307
6308 /// Return true if every element in Mask is undef or if its value
6309 /// falls within the specified range (L, H].
6310 static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6311   return llvm::all_of(
6312       Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
6313 }
6314
6315 /// Return true if Val is undef, zero or if its value falls within the
6316 /// specified range (L, H].
6317 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
6318   return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
6319 }
6320
6321 /// Return true if every element in Mask is undef, zero or if its value
6322 /// falls within the specified range (L, H].
6323 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6324   return llvm::all_of(
6325       Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
6326 }
6327
6328 /// Return true if every element in Mask, beginning
6329 /// from position Pos and ending in Pos + Size, falls within the specified
6330 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
6331 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
6332                                        unsigned Size, int Low, int Step = 1) {
6333   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6334     if (!isUndefOrEqual(Mask[i], Low))
6335       return false;
6336   return true;
6337 }
6338
6339 /// Return true if every element in Mask, beginning
6340 /// from position Pos and ending in Pos+Size, falls within the specified
6341 /// sequential range (Low, Low+Size], or is undef or is zero.
6342 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6343                                              unsigned Size, int Low,
6344                                              int Step = 1) {
6345   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6346     if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
6347       return false;
6348   return true;
6349 }
6350
6351 /// Return true if every element in Mask, beginning
6352 /// from position Pos and ending in Pos+Size is undef or is zero.
6353 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6354                                  unsigned Size) {
6355   return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
6356 }
6357
6358 /// Helper function to test whether a shuffle mask could be
6359 /// simplified by widening the elements being shuffled.
6360 ///
6361 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
6362 /// leaves it in an unspecified state.
6363 ///
6364 /// NOTE: This must handle normal vector shuffle masks and *target* vector
6365 /// shuffle masks. The latter have the special property of a '-2' representing
6366 /// a zero-ed lane of a vector.
6367 static bool canWidenShuffleElements(ArrayRef<int> Mask,
6368                                     SmallVectorImpl<int> &WidenedMask) {
6369   WidenedMask.assign(Mask.size() / 2, 0);
6370   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
6371     int M0 = Mask[i];
6372     int M1 = Mask[i + 1];
6373
6374     // If both elements are undef, its trivial.
6375     if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
6376       WidenedMask[i / 2] = SM_SentinelUndef;
6377       continue;
6378     }
6379
6380     // Check for an undef mask and a mask value properly aligned to fit with
6381     // a pair of values. If we find such a case, use the non-undef mask's value.
6382     if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
6383       WidenedMask[i / 2] = M1 / 2;
6384       continue;
6385     }
6386     if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
6387       WidenedMask[i / 2] = M0 / 2;
6388       continue;
6389     }
6390
6391     // When zeroing, we need to spread the zeroing across both lanes to widen.
6392     if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
6393       if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
6394           (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
6395         WidenedMask[i / 2] = SM_SentinelZero;
6396         continue;
6397       }
6398       return false;
6399     }
6400
6401     // Finally check if the two mask values are adjacent and aligned with
6402     // a pair.
6403     if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
6404       WidenedMask[i / 2] = M0 / 2;
6405       continue;
6406     }
6407
6408     // Otherwise we can't safely widen the elements used in this shuffle.
6409     return false;
6410   }
6411   assert(WidenedMask.size() == Mask.size() / 2 &&
6412          "Incorrect size of mask after widening the elements!");
6413
6414   return true;
6415 }
6416
6417 static bool canWidenShuffleElements(ArrayRef<int> Mask,
6418                                     const APInt &Zeroable,
6419                                     bool V2IsZero,
6420                                     SmallVectorImpl<int> &WidenedMask) {
6421   // Create an alternative mask with info about zeroable elements.
6422   // Here we do not set undef elements as zeroable.
6423   SmallVector<int, 64> ZeroableMask(Mask);
6424   if (V2IsZero) {
6425     assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
6426     for (int i = 0, Size = Mask.size(); i != Size; ++i)
6427       if (Mask[i] != SM_SentinelUndef && Zeroable[i])
6428         ZeroableMask[i] = SM_SentinelZero;
6429   }
6430   return canWidenShuffleElements(ZeroableMask, WidenedMask);
6431 }
6432
6433 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
6434   SmallVector<int, 32> WidenedMask;
6435   return canWidenShuffleElements(Mask, WidenedMask);
6436 }
6437
6438 // Attempt to narrow/widen shuffle mask until it matches the target number of
6439 // elements.
6440 static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
6441                                  SmallVectorImpl<int> &ScaledMask) {
6442   unsigned NumSrcElts = Mask.size();
6443   assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
6444          "Illegal shuffle scale factor");
6445
6446   // Narrowing is guaranteed to work.
6447   if (NumDstElts >= NumSrcElts) {
6448     int Scale = NumDstElts / NumSrcElts;
6449     llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
6450     return true;
6451   }
6452
6453   // We have to repeat the widening until we reach the target size, but we can
6454   // split out the first widening as it sets up ScaledMask for us.
6455   if (canWidenShuffleElements(Mask, ScaledMask)) {
6456     while (ScaledMask.size() > NumDstElts) {
6457       SmallVector<int, 16> WidenedMask;
6458       if (!canWidenShuffleElements(ScaledMask, WidenedMask))
6459         return false;
6460       ScaledMask = std::move(WidenedMask);
6461     }
6462     return true;
6463   }
6464
6465   return false;
6466 }
6467
6468 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
6469 bool X86::isZeroNode(SDValue Elt) {
6470   return isNullConstant(Elt) || isNullFPConstant(Elt);
6471 }
6472
6473 // Build a vector of constants.
6474 // Use an UNDEF node if MaskElt == -1.
6475 // Split 64-bit constants in the 32-bit mode.
6476 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
6477                               const SDLoc &dl, bool IsMask = false) {
6478
6479   SmallVector<SDValue, 32>  Ops;
6480   bool Split = false;
6481
6482   MVT ConstVecVT = VT;
6483   unsigned NumElts = VT.getVectorNumElements();
6484   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6485   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6486     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6487     Split = true;
6488   }
6489
6490   MVT EltVT = ConstVecVT.getVectorElementType();
6491   for (unsigned i = 0; i < NumElts; ++i) {
6492     bool IsUndef = Values[i] < 0 && IsMask;
6493     SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6494       DAG.getConstant(Values[i], dl, EltVT);
6495     Ops.push_back(OpNode);
6496     if (Split)
6497       Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6498                     DAG.getConstant(0, dl, EltVT));
6499   }
6500   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6501   if (Split)
6502     ConstsNode = DAG.getBitcast(VT, ConstsNode);
6503   return ConstsNode;
6504 }
6505
6506 static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
6507                               MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6508   assert(Bits.size() == Undefs.getBitWidth() &&
6509          "Unequal constant and undef arrays");
6510   SmallVector<SDValue, 32> Ops;
6511   bool Split = false;
6512
6513   MVT ConstVecVT = VT;
6514   unsigned NumElts = VT.getVectorNumElements();
6515   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6516   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6517     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6518     Split = true;
6519   }
6520
6521   MVT EltVT = ConstVecVT.getVectorElementType();
6522   for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6523     if (Undefs[i]) {
6524       Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6525       continue;
6526     }
6527     const APInt &V = Bits[i];
6528     assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6529     if (Split) {
6530       Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6531       Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6532     } else if (EltVT == MVT::f32) {
6533       APFloat FV(APFloat::IEEEsingle(), V);
6534       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6535     } else if (EltVT == MVT::f64) {
6536       APFloat FV(APFloat::IEEEdouble(), V);
6537       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6538     } else {
6539       Ops.push_back(DAG.getConstant(V, dl, EltVT));
6540     }
6541   }
6542
6543   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6544   return DAG.getBitcast(VT, ConstsNode);
6545 }
6546
6547 static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
6548                               SelectionDAG &DAG, const SDLoc &dl) {
6549   APInt Undefs = APInt::getZero(Bits.size());
6550   return getConstVector(Bits, Undefs, VT, DAG, dl);
6551 }
6552
6553 /// Returns a vector of specified type with all zero elements.
6554 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6555                              SelectionDAG &DAG, const SDLoc &dl) {
6556   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6557           VT.getVectorElementType() == MVT::i1) &&
6558          "Unexpected vector type");
6559
6560   // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6561   // type. This ensures they get CSE'd. But if the integer type is not
6562   // available, use a floating-point +0.0 instead.
6563   SDValue Vec;
6564   if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6565     Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6566   } else if (VT.isFloatingPoint()) {
6567     Vec = DAG.getConstantFP(+0.0, dl, VT);
6568   } else if (VT.getVectorElementType() == MVT::i1) {
6569     assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6570            "Unexpected vector type");
6571     Vec = DAG.getConstant(0, dl, VT);
6572   } else {
6573     unsigned Num32BitElts = VT.getSizeInBits() / 32;
6574     Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6575   }
6576   return DAG.getBitcast(VT, Vec);
6577 }
6578
6579 // Helper to determine if the ops are all the extracted subvectors come from a
6580 // single source. If we allow commute they don't have to be in order (Lo/Hi).
6581 static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
6582   if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6583       RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6584       LHS.getValueType() != RHS.getValueType() ||
6585       LHS.getOperand(0) != RHS.getOperand(0))
6586     return SDValue();
6587
6588   SDValue Src = LHS.getOperand(0);
6589   if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
6590     return SDValue();
6591
6592   unsigned NumElts = LHS.getValueType().getVectorNumElements();
6593   if ((LHS.getConstantOperandAPInt(1) == 0 &&
6594        RHS.getConstantOperandAPInt(1) == NumElts) ||
6595       (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
6596        LHS.getConstantOperandAPInt(1) == NumElts))
6597     return Src;
6598
6599   return SDValue();
6600 }
6601
6602 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6603                                 const SDLoc &dl, unsigned vectorWidth) {
6604   EVT VT = Vec.getValueType();
6605   EVT ElVT = VT.getVectorElementType();
6606   unsigned Factor = VT.getSizeInBits() / vectorWidth;
6607   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6608                                   VT.getVectorNumElements() / Factor);
6609
6610   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
6611   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6612   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6613
6614   // This is the index of the first element of the vectorWidth-bit chunk
6615   // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
6616   IdxVal &= ~(ElemsPerChunk - 1);
6617
6618   // If the input is a buildvector just emit a smaller one.
6619   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6620     return DAG.getBuildVector(ResultVT, dl,
6621                               Vec->ops().slice(IdxVal, ElemsPerChunk));
6622
6623   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6624   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6625 }
6626
6627 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
6628 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6629 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6630 /// instructions or a simple subregister reference. Idx is an index in the
6631 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
6632 /// lowering EXTRACT_VECTOR_ELT operations easier.
6633 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6634                                    SelectionDAG &DAG, const SDLoc &dl) {
6635   assert((Vec.getValueType().is256BitVector() ||
6636           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6637   return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6638 }
6639
6640 /// Generate a DAG to grab 256-bits from a 512-bit vector.
6641 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6642                                    SelectionDAG &DAG, const SDLoc &dl) {
6643   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6644   return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6645 }
6646
6647 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6648                                SelectionDAG &DAG, const SDLoc &dl,
6649                                unsigned vectorWidth) {
6650   assert((vectorWidth == 128 || vectorWidth == 256) &&
6651          "Unsupported vector width");
6652   // Inserting UNDEF is Result
6653   if (Vec.isUndef())
6654     return Result;
6655   EVT VT = Vec.getValueType();
6656   EVT ElVT = VT.getVectorElementType();
6657   EVT ResultVT = Result.getValueType();
6658
6659   // Insert the relevant vectorWidth bits.
6660   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6661   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6662
6663   // This is the index of the first element of the vectorWidth-bit chunk
6664   // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
6665   IdxVal &= ~(ElemsPerChunk - 1);
6666
6667   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6668   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6669 }
6670
6671 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
6672 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6673 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6674 /// simple superregister reference.  Idx is an index in the 128 bits
6675 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
6676 /// lowering INSERT_VECTOR_ELT operations easier.
6677 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6678                                   SelectionDAG &DAG, const SDLoc &dl) {
6679   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
6680   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6681 }
6682
6683 /// Widen a vector to a larger size with the same scalar type, with the new
6684 /// elements either zero or undef.
6685 static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6686                               const X86Subtarget &Subtarget, SelectionDAG &DAG,
6687                               const SDLoc &dl) {
6688   assert(Vec.getValueSizeInBits().getFixedValue() < VT.getFixedSizeInBits() &&
6689          Vec.getValueType().getScalarType() == VT.getScalarType() &&
6690          "Unsupported vector widening type");
6691   SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6692                                 : DAG.getUNDEF(VT);
6693   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6694                      DAG.getIntPtrConstant(0, dl));
6695 }
6696
6697 /// Widen a vector to a larger size with the same scalar type, with the new
6698 /// elements either zero or undef.
6699 static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6700                               const X86Subtarget &Subtarget, SelectionDAG &DAG,
6701                               const SDLoc &dl, unsigned WideSizeInBits) {
6702   assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6703          (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6704          "Unsupported vector widening type");
6705   unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6706   MVT SVT = Vec.getSimpleValueType().getScalarType();
6707   MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6708   return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6709 }
6710
6711 // Helper function to collect subvector ops that are concatenated together,
6712 // either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
6713 // The subvectors in Ops are guaranteed to be the same type.
6714 static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
6715                              SelectionDAG &DAG) {
6716   assert(Ops.empty() && "Expected an empty ops vector");
6717
6718   if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6719     Ops.append(N->op_begin(), N->op_end());
6720     return true;
6721   }
6722
6723   if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6724     SDValue Src = N->getOperand(0);
6725     SDValue Sub = N->getOperand(1);
6726     const APInt &Idx = N->getConstantOperandAPInt(2);
6727     EVT VT = Src.getValueType();
6728     EVT SubVT = Sub.getValueType();
6729
6730     // TODO - Handle more general insert_subvector chains.
6731     if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
6732       // insert_subvector(undef, x, lo)
6733       if (Idx == 0 && Src.isUndef()) {
6734         Ops.push_back(Sub);
6735         Ops.push_back(DAG.getUNDEF(SubVT));
6736         return true;
6737       }
6738       if (Idx == (VT.getVectorNumElements() / 2)) {
6739         // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6740         if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6741             Src.getOperand(1).getValueType() == SubVT &&
6742             isNullConstant(Src.getOperand(2))) {
6743           Ops.push_back(Src.getOperand(1));
6744           Ops.push_back(Sub);
6745           return true;
6746         }
6747         // insert_subvector(x, extract_subvector(x, lo), hi)
6748         if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6749             Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6750           Ops.append(2, Sub);
6751           return true;
6752         }
6753         // insert_subvector(undef, x, hi)
6754         if (Src.isUndef()) {
6755           Ops.push_back(DAG.getUNDEF(SubVT));
6756           Ops.push_back(Sub);
6757           return true;
6758         }
6759       }
6760     }
6761   }
6762
6763   return false;
6764 }
6765
6766 // Helper to check if we can access all the constituent subvectors without any
6767 // extract ops.
6768 static bool isFreeToSplitVector(SDNode *N, SelectionDAG &DAG) {
6769   SmallVector<SDValue> Ops;
6770   return collectConcatOps(N, Ops, DAG);
6771 }
6772
6773 static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6774                                                const SDLoc &dl) {
6775   EVT VT = Op.getValueType();
6776   unsigned NumElems = VT.getVectorNumElements();
6777   unsigned SizeInBits = VT.getSizeInBits();
6778   assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6779          "Can't split odd sized vector");
6780
6781   // If this is a splat value (with no-undefs) then use the lower subvector,
6782   // which should be a free extraction.
6783   SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6784   if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
6785     return std::make_pair(Lo, Lo);
6786
6787   SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6788   return std::make_pair(Lo, Hi);
6789 }
6790
6791 /// Break an operation into 2 half sized ops and then concatenate the results.
6792 static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
6793   unsigned NumOps = Op.getNumOperands();
6794   EVT VT = Op.getValueType();
6795   SDLoc dl(Op);
6796
6797   // Extract the LHS Lo/Hi vectors
6798   SmallVector<SDValue> LoOps(NumOps, SDValue());
6799   SmallVector<SDValue> HiOps(NumOps, SDValue());
6800   for (unsigned I = 0; I != NumOps; ++I) {
6801     SDValue SrcOp = Op.getOperand(I);
6802     if (!SrcOp.getValueType().isVector()) {
6803       LoOps[I] = HiOps[I] = SrcOp;
6804       continue;
6805     }
6806     std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
6807   }
6808
6809   EVT LoVT, HiVT;
6810   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6811   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6812                      DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
6813                      DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
6814 }
6815
6816 /// Break an unary integer operation into 2 half sized ops and then
6817 /// concatenate the result back.
6818 static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6819   // Make sure we only try to split 256/512-bit types to avoid creating
6820   // narrow vectors.
6821   EVT VT = Op.getValueType();
6822   (void)VT;
6823   assert((Op.getOperand(0).getValueType().is256BitVector() ||
6824           Op.getOperand(0).getValueType().is512BitVector()) &&
6825          (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6826   assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6827              VT.getVectorNumElements() &&
6828          "Unexpected VTs!");
6829   return splitVectorOp(Op, DAG);
6830 }
6831
6832 /// Break a binary integer operation into 2 half sized ops and then
6833 /// concatenate the result back.
6834 static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6835   // Assert that all the types match.
6836   EVT VT = Op.getValueType();
6837   (void)VT;
6838   assert(Op.getOperand(0).getValueType() == VT &&
6839          Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6840   assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6841   return splitVectorOp(Op, DAG);
6842 }
6843
6844 // Helper for splitting operands of an operation to legal target size and
6845 // apply a function on each part.
6846 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6847 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6848 // deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6849 // The argument Builder is a function that will be applied on each split part:
6850 // SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
6851 template <typename F>
6852 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6853                          const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6854                          F Builder, bool CheckBWI = true) {
6855   assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6856   unsigned NumSubs = 1;
6857   if ((CheckBWI && Subtarget.useBWIRegs()) ||
6858       (!CheckBWI && Subtarget.useAVX512Regs())) {
6859     if (VT.getSizeInBits() > 512) {
6860       NumSubs = VT.getSizeInBits() / 512;
6861       assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6862     }
6863   } else if (Subtarget.hasAVX2()) {
6864     if (VT.getSizeInBits() > 256) {
6865       NumSubs = VT.getSizeInBits() / 256;
6866       assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6867     }
6868   } else {
6869     if (VT.getSizeInBits() > 128) {
6870       NumSubs = VT.getSizeInBits() / 128;
6871       assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6872     }
6873   }
6874
6875   if (NumSubs == 1)
6876     return Builder(DAG, DL, Ops);
6877
6878   SmallVector<SDValue, 4> Subs;
6879   for (unsigned i = 0; i != NumSubs; ++i) {
6880     SmallVector<SDValue, 2> SubOps;
6881     for (SDValue Op : Ops) {
6882       EVT OpVT = Op.getValueType();
6883       unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6884       unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6885       SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6886     }
6887     Subs.push_back(Builder(DAG, DL, SubOps));
6888   }
6889   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6890 }
6891
6892 // Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
6893 // targets.
6894 static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
6895                              ArrayRef<SDValue> Ops, SelectionDAG &DAG,
6896                              const X86Subtarget &Subtarget) {
6897   assert(Subtarget.hasAVX512() && "AVX512 target expected");
6898   MVT SVT = VT.getScalarType();
6899
6900   // If we have a 32/64 splatted constant, splat it to DstTy to
6901   // encourage a foldable broadcast'd operand.
6902   auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
6903     unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
6904     // AVX512 broadcasts 32/64-bit operands.
6905     // TODO: Support float once getAVX512Node is used by fp-ops.
6906     if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
6907         !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
6908       return SDValue();
6909     // If we're not widening, don't bother if we're not bitcasting.
6910     if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
6911       return SDValue();
6912     if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
6913       APInt SplatValue, SplatUndef;
6914       unsigned SplatBitSize;
6915       bool HasAnyUndefs;
6916       if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
6917                               HasAnyUndefs, OpEltSizeInBits) &&
6918           !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
6919         return DAG.getConstant(SplatValue, DL, DstVT);
6920     }
6921     return SDValue();
6922   };
6923
6924   bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
6925
6926   MVT DstVT = VT;
6927   if (Widen)
6928     DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
6929
6930   // Canonicalize src operands.
6931   SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
6932   for (SDValue &Op : SrcOps) {
6933     MVT OpVT = Op.getSimpleValueType();
6934     // Just pass through scalar operands.
6935     if (!OpVT.isVector())
6936       continue;
6937     assert(OpVT == VT && "Vector type mismatch");
6938
6939     if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
6940       Op = BroadcastOp;
6941       continue;
6942     }
6943
6944     // Just widen the subvector by inserting into an undef wide vector.
6945     if (Widen)
6946       Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
6947   }
6948
6949   SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
6950
6951   // Perform the 512-bit op then extract the bottom subvector.
6952   if (Widen)
6953     Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
6954   return Res;
6955 }
6956
6957 /// Insert i1-subvector to i1-vector.
6958 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6959                                 const X86Subtarget &Subtarget) {
6960
6961   SDLoc dl(Op);
6962   SDValue Vec = Op.getOperand(0);
6963   SDValue SubVec = Op.getOperand(1);
6964   SDValue Idx = Op.getOperand(2);
6965   unsigned IdxVal = Op.getConstantOperandVal(2);
6966
6967   // Inserting undef is a nop. We can just return the original vector.
6968   if (SubVec.isUndef())
6969     return Vec;
6970
6971   if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6972     return Op;
6973
6974   MVT OpVT = Op.getSimpleValueType();
6975   unsigned NumElems = OpVT.getVectorNumElements();
6976   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6977
6978   // Extend to natively supported kshift.
6979   MVT WideOpVT = OpVT;
6980   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6981     WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6982
6983   // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6984   // if necessary.
6985   if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6986     // May need to promote to a legal type.
6987     Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6988                      DAG.getConstant(0, dl, WideOpVT),
6989                      SubVec, Idx);
6990     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6991   }
6992
6993   MVT SubVecVT = SubVec.getSimpleValueType();
6994   unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6995   assert(IdxVal + SubVecNumElems <= NumElems &&
6996          IdxVal % SubVecVT.getSizeInBits() == 0 &&
6997          "Unexpected index value in INSERT_SUBVECTOR");
6998
6999   SDValue Undef = DAG.getUNDEF(WideOpVT);
7000
7001   if (IdxVal == 0) {
7002     // Zero lower bits of the Vec
7003     SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
7004     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
7005                       ZeroIdx);
7006     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
7007     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
7008     // Merge them together, SubVec should be zero extended.
7009     SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
7010                          DAG.getConstant(0, dl, WideOpVT),
7011                          SubVec, ZeroIdx);
7012     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7013     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7014   }
7015
7016   SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
7017                        Undef, SubVec, ZeroIdx);
7018
7019   if (Vec.isUndef()) {
7020     assert(IdxVal != 0 && "Unexpected index");
7021     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7022                          DAG.getTargetConstant(IdxVal, dl, MVT::i8));
7023     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7024   }
7025
7026   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
7027     assert(IdxVal != 0 && "Unexpected index");
7028     // If upper elements of Vec are known undef, then just shift into place.
7029     if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
7030                      [](SDValue V) { return V.isUndef(); })) {
7031       SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7032                            DAG.getTargetConstant(IdxVal, dl, MVT::i8));
7033     } else {
7034       NumElems = WideOpVT.getVectorNumElements();
7035       unsigned ShiftLeft = NumElems - SubVecNumElems;
7036       unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
7037       SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7038                            DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7039       if (ShiftRight != 0)
7040         SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7041                              DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7042     }
7043     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7044   }
7045
7046   // Simple case when we put subvector in the upper part
7047   if (IdxVal + SubVecNumElems == NumElems) {
7048     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7049                          DAG.getTargetConstant(IdxVal, dl, MVT::i8));
7050     if (SubVecNumElems * 2 == NumElems) {
7051       // Special case, use legal zero extending insert_subvector. This allows
7052       // isel to optimize when bits are known zero.
7053       Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
7054       Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
7055                         DAG.getConstant(0, dl, WideOpVT),
7056                         Vec, ZeroIdx);
7057     } else {
7058       // Otherwise use explicit shifts to zero the bits.
7059       Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
7060                         Undef, Vec, ZeroIdx);
7061       NumElems = WideOpVT.getVectorNumElements();
7062       SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
7063       Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
7064       Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
7065     }
7066     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7067     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7068   }
7069
7070   // Inserting into the middle is more complicated.
7071
7072   NumElems = WideOpVT.getVectorNumElements();
7073
7074   // Widen the vector if needed.
7075   Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
7076
7077   unsigned ShiftLeft = NumElems - SubVecNumElems;
7078   unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
7079
7080   // Do an optimization for the the most frequently used types.
7081   if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
7082     APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
7083     Mask0.flipAllBits();
7084     SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
7085     SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
7086     Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
7087     SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7088                          DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7089     SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7090                          DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7091     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7092
7093     // Reduce to original width if needed.
7094     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7095   }
7096
7097   // Clear the upper bits of the subvector and move it to its insert position.
7098   SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7099                        DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7100   SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7101                        DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7102
7103   // Isolate the bits below the insertion point.
7104   unsigned LowShift = NumElems - IdxVal;
7105   SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
7106                             DAG.getTargetConstant(LowShift, dl, MVT::i8));
7107   Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
7108                     DAG.getTargetConstant(LowShift, dl, MVT::i8));
7109
7110   // Isolate the bits after the last inserted bit.
7111   unsigned HighShift = IdxVal + SubVecNumElems;
7112   SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
7113                             DAG.getTargetConstant(HighShift, dl, MVT::i8));
7114   High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
7115                     DAG.getTargetConstant(HighShift, dl, MVT::i8));
7116
7117   // Now OR all 3 pieces together.
7118   Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
7119   SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
7120
7121   // Reduce to original width if needed.
7122   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7123 }
7124
7125 static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
7126                                 const SDLoc &dl) {
7127   assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
7128   EVT SubVT = V1.getValueType();
7129   EVT SubSVT = SubVT.getScalarType();
7130   unsigned SubNumElts = SubVT.getVectorNumElements();
7131   unsigned SubVectorWidth = SubVT.getSizeInBits();
7132   EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
7133   SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
7134   return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
7135 }
7136
7137 /// Returns a vector of specified type with all bits set.
7138 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
7139 /// Then bitcast to their original type, ensuring they get CSE'd.
7140 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
7141   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7142          "Expected a 128/256/512-bit vector type");
7143
7144   APInt Ones = APInt::getAllOnes(32);
7145   unsigned NumElts = VT.getSizeInBits() / 32;
7146   SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
7147   return DAG.getBitcast(VT, Vec);
7148 }
7149
7150 static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
7151                                       SDValue In, SelectionDAG &DAG) {
7152   EVT InVT = In.getValueType();
7153   assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
7154   assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
7155           ISD::ZERO_EXTEND == Opcode) &&
7156          "Unknown extension opcode");
7157
7158   // For 256-bit vectors, we only need the lower (128-bit) input half.
7159   // For 512-bit vectors, we only need the lower input half or quarter.
7160   if (InVT.getSizeInBits() > 128) {
7161     assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
7162            "Expected VTs to be the same size!");
7163     unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
7164     In = extractSubVector(In, 0, DAG, DL,
7165                           std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
7166     InVT = In.getValueType();
7167   }
7168
7169   if (VT.getVectorNumElements() != InVT.getVectorNumElements())
7170     Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
7171
7172   return DAG.getNode(Opcode, DL, VT, In);
7173 }
7174
7175 // Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
7176 static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
7177                             SDValue Mask, SelectionDAG &DAG) {
7178   LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
7179   RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
7180   return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
7181 }
7182
7183 // Match (xor X, -1) -> X.
7184 // Match extract_subvector(xor X, -1) -> extract_subvector(X).
7185 // Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
7186 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
7187   V = peekThroughBitcasts(V);
7188   if (V.getOpcode() == ISD::XOR &&
7189       (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
7190        isAllOnesConstant(V.getOperand(1))))
7191     return V.getOperand(0);
7192   if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7193       (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
7194     if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
7195       Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
7196       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
7197                          Not, V.getOperand(1));
7198     }
7199   }
7200   SmallVector<SDValue, 2> CatOps;
7201   if (collectConcatOps(V.getNode(), CatOps, DAG)) {
7202     for (SDValue &CatOp : CatOps) {
7203       SDValue NotCat = IsNOT(CatOp, DAG);
7204       if (!NotCat) return SDValue();
7205       CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
7206     }
7207     return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
7208   }
7209   return SDValue();
7210 }
7211
7212 void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
7213                                    bool Lo, bool Unary) {
7214   assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
7215          "Illegal vector type to unpack");
7216   assert(Mask.empty() && "Expected an empty shuffle mask vector");
7217   int NumElts = VT.getVectorNumElements();
7218   int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7219   for (int i = 0; i < NumElts; ++i) {
7220     unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7221     int Pos = (i % NumEltsInLane) / 2 + LaneStart;
7222     Pos += (Unary ? 0 : NumElts * (i % 2));
7223     Pos += (Lo ? 0 : NumEltsInLane / 2);
7224     Mask.push_back(Pos);
7225   }
7226 }
7227
7228 /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
7229 /// imposed by AVX and specific to the unary pattern. Example:
7230 /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
7231 /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
7232 void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7233                                    bool Lo) {
7234   assert(Mask.empty() && "Expected an empty shuffle mask vector");
7235   int NumElts = VT.getVectorNumElements();
7236   for (int i = 0; i < NumElts; ++i) {
7237     int Pos = i / 2;
7238     Pos += (Lo ? 0 : NumElts / 2);
7239     Mask.push_back(Pos);
7240   }
7241 }
7242
7243 // Attempt to constant fold, else just create a VECTOR_SHUFFLE.
7244 static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
7245                                 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
7246   if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
7247       (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
7248     SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
7249     for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
7250       int M = Mask[I];
7251       if (M < 0)
7252         continue;
7253       SDValue V = (M < NumElts) ? V1 : V2;
7254       if (V.isUndef())
7255         continue;
7256       Ops[I] = V.getOperand(M % NumElts);
7257     }
7258     return DAG.getBuildVector(VT, dl, Ops);
7259   }
7260
7261   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
7262 }
7263
7264 /// Returns a vector_shuffle node for an unpackl operation.
7265 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7266                           SDValue V1, SDValue V2) {
7267   SmallVector<int, 8> Mask;
7268   createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
7269   return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7270 }
7271
7272 /// Returns a vector_shuffle node for an unpackh operation.
7273 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7274                           SDValue V1, SDValue V2) {
7275   SmallVector<int, 8> Mask;
7276   createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
7277   return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7278 }
7279
7280 /// Returns a node that packs the LHS + RHS nodes together at half width.
7281 /// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
7282 /// TODO: Add subvector splitting if/when we have a need for it.
7283 static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
7284                        const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
7285                        bool PackHiHalf = false) {
7286   MVT OpVT = LHS.getSimpleValueType();
7287   unsigned EltSizeInBits = VT.getScalarSizeInBits();
7288   bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
7289   assert(OpVT == RHS.getSimpleValueType() &&
7290          VT.getSizeInBits() == OpVT.getSizeInBits() &&
7291          (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
7292          "Unexpected PACK operand types");
7293   assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
7294          "Unexpected PACK result type");
7295
7296   // Rely on vector shuffles for vXi64 -> vXi32 packing.
7297   if (EltSizeInBits == 32) {
7298     SmallVector<int> PackMask;
7299     int Offset = PackHiHalf ? 1 : 0;
7300     int NumElts = VT.getVectorNumElements();
7301     for (int I = 0; I != NumElts; I += 4) {
7302       PackMask.push_back(I + Offset);
7303       PackMask.push_back(I + Offset + 2);
7304       PackMask.push_back(I + Offset + NumElts);
7305       PackMask.push_back(I + Offset + NumElts + 2);
7306     }
7307     return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
7308                                 DAG.getBitcast(VT, RHS), PackMask);
7309   }
7310
7311   // See if we already have sufficient leading bits for PACKSS/PACKUS.
7312   if (!PackHiHalf) {
7313     if (UsePackUS &&
7314         DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
7315         DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
7316       return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7317
7318     if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
7319         DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
7320       return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7321   }
7322
7323   // Fallback to sign/zero extending the requested half and pack.
7324   SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
7325   if (UsePackUS) {
7326     if (PackHiHalf) {
7327       LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
7328       RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
7329     } else {
7330       SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
7331       LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
7332       RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
7333     };
7334     return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7335   };
7336
7337   if (!PackHiHalf) {
7338     LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
7339     RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
7340   }
7341   LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
7342   RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
7343   return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7344 }
7345
7346 /// Return a vector_shuffle of the specified vector of zero or undef vector.
7347 /// This produces a shuffle where the low element of V2 is swizzled into the
7348 /// zero/undef vector, landing at element Idx.
7349 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
7350 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
7351                                            bool IsZero,
7352                                            const X86Subtarget &Subtarget,
7353                                            SelectionDAG &DAG) {
7354   MVT VT = V2.getSimpleValueType();
7355   SDValue V1 = IsZero
7356     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
7357   int NumElems = VT.getVectorNumElements();
7358   SmallVector<int, 16> MaskVec(NumElems);
7359   for (int i = 0; i != NumElems; ++i)
7360     // If this is the insertion idx, put the low elt of V2 here.
7361     MaskVec[i] = (i == Idx) ? NumElems : i;
7362   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
7363 }
7364
7365 static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
7366   if (Ptr.getOpcode() == X86ISD::Wrapper ||
7367       Ptr.getOpcode() == X86ISD::WrapperRIP)
7368     Ptr = Ptr.getOperand(0);
7369
7370   auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
7371   if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
7372     return nullptr;
7373
7374   return CNode->getConstVal();
7375 }
7376
7377 static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
7378   if (!Load || !ISD::isNormalLoad(Load))
7379     return nullptr;
7380   return getTargetConstantFromBasePtr(Load->getBasePtr());
7381 }
7382
7383 static const Constant *getTargetConstantFromNode(SDValue Op) {
7384   Op = peekThroughBitcasts(Op);
7385   return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
7386 }
7387
7388 const Constant *
7389 X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
7390   assert(LD && "Unexpected null LoadSDNode");
7391   return getTargetConstantFromNode(LD);
7392 }
7393
7394 // Extract raw constant bits from constant pools.
7395 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
7396                                           APInt &UndefElts,
7397                                           SmallVectorImpl<APInt> &EltBits,
7398                                           bool AllowWholeUndefs = true,
7399                                           bool AllowPartialUndefs = true) {
7400   assert(EltBits.empty() && "Expected an empty EltBits vector");
7401
7402   Op = peekThroughBitcasts(Op);
7403
7404   EVT VT = Op.getValueType();
7405   unsigned SizeInBits = VT.getSizeInBits();
7406   assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
7407   unsigned NumElts = SizeInBits / EltSizeInBits;
7408
7409   // Bitcast a source array of element bits to the target size.
7410   auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
7411     unsigned NumSrcElts = UndefSrcElts.getBitWidth();
7412     unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
7413     assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
7414            "Constant bit sizes don't match");
7415
7416     // Don't split if we don't allow undef bits.
7417     bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
7418     if (UndefSrcElts.getBoolValue() && !AllowUndefs)
7419       return false;
7420
7421     // If we're already the right size, don't bother bitcasting.
7422     if (NumSrcElts == NumElts) {
7423       UndefElts = UndefSrcElts;
7424       EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
7425       return true;
7426     }
7427
7428     // Extract all the undef/constant element data and pack into single bitsets.
7429     APInt UndefBits(SizeInBits, 0);
7430     APInt MaskBits(SizeInBits, 0);
7431
7432     for (unsigned i = 0; i != NumSrcElts; ++i) {
7433       unsigned BitOffset = i * SrcEltSizeInBits;
7434       if (UndefSrcElts[i])
7435         UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
7436       MaskBits.insertBits(SrcEltBits[i], BitOffset);
7437     }
7438
7439     // Split the undef/constant single bitset data into the target elements.
7440     UndefElts = APInt(NumElts, 0);
7441     EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
7442
7443     for (unsigned i = 0; i != NumElts; ++i) {
7444       unsigned BitOffset = i * EltSizeInBits;
7445       APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
7446
7447       // Only treat an element as UNDEF if all bits are UNDEF.
7448       if (UndefEltBits.isAllOnes()) {
7449         if (!AllowWholeUndefs)
7450           return false;
7451         UndefElts.setBit(i);
7452         continue;
7453       }
7454
7455       // If only some bits are UNDEF then treat them as zero (or bail if not
7456       // supported).
7457       if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
7458         return false;
7459
7460       EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
7461     }
7462     return true;
7463   };
7464
7465   // Collect constant bits and insert into mask/undef bit masks.
7466   auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
7467                                 unsigned UndefBitIndex) {
7468     if (!Cst)
7469       return false;
7470     if (isa<UndefValue>(Cst)) {
7471       Undefs.setBit(UndefBitIndex);
7472       return true;
7473     }
7474     if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
7475       Mask = CInt->getValue();
7476       return true;
7477     }
7478     if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
7479       Mask = CFP->getValueAPF().bitcastToAPInt();
7480       return true;
7481     }
7482     if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
7483       Type *Ty = CDS->getType();
7484       Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
7485       Type *EltTy = CDS->getElementType();
7486       bool IsInteger = EltTy->isIntegerTy();
7487       bool IsFP =
7488           EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
7489       if (!IsInteger && !IsFP)
7490         return false;
7491       unsigned EltBits = EltTy->getPrimitiveSizeInBits();
7492       for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
7493         if (IsInteger)
7494           Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
7495         else
7496           Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
7497                           I * EltBits);
7498       return true;
7499     }
7500     return false;
7501   };
7502
7503   // Handle UNDEFs.
7504   if (Op.isUndef()) {
7505     APInt UndefSrcElts = APInt::getAllOnes(NumElts);
7506     SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
7507     return CastBitData(UndefSrcElts, SrcEltBits);
7508   }
7509
7510   // Extract scalar constant bits.
7511   if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
7512     APInt UndefSrcElts = APInt::getZero(1);
7513     SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
7514     return CastBitData(UndefSrcElts, SrcEltBits);
7515   }
7516   if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7517     APInt UndefSrcElts = APInt::getZero(1);
7518     APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
7519     SmallVector<APInt, 64> SrcEltBits(1, RawBits);
7520     return CastBitData(UndefSrcElts, SrcEltBits);
7521   }
7522
7523   // Extract constant bits from build vector.
7524   if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7525     BitVector Undefs;
7526     SmallVector<APInt> SrcEltBits;
7527     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7528     if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
7529       APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
7530       for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
7531         if (Undefs[I])
7532           UndefSrcElts.setBit(I);
7533       return CastBitData(UndefSrcElts, SrcEltBits);
7534     }
7535   }
7536
7537   // Extract constant bits from constant pool vector.
7538   if (auto *Cst = getTargetConstantFromNode(Op)) {
7539     Type *CstTy = Cst->getType();
7540     unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7541     if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
7542       return false;
7543
7544     unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
7545     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7546
7547     APInt UndefSrcElts(NumSrcElts, 0);
7548     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
7549     for (unsigned i = 0; i != NumSrcElts; ++i)
7550       if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
7551                                UndefSrcElts, i))
7552         return false;
7553
7554     return CastBitData(UndefSrcElts, SrcEltBits);
7555   }
7556
7557   // Extract constant bits from a broadcasted constant pool scalar.
7558   if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
7559       EltSizeInBits <= VT.getScalarSizeInBits()) {
7560     auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7561     if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
7562       return false;
7563
7564     SDValue Ptr = MemIntr->getBasePtr();
7565     if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
7566       unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7567       unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7568
7569       APInt UndefSrcElts(NumSrcElts, 0);
7570       SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
7571       if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
7572         if (UndefSrcElts[0])
7573           UndefSrcElts.setBits(0, NumSrcElts);
7574         if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
7575           SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
7576         SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
7577         return CastBitData(UndefSrcElts, SrcEltBits);
7578       }
7579     }
7580   }
7581
7582   // Extract constant bits from a subvector broadcast.
7583   if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
7584     auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7585     SDValue Ptr = MemIntr->getBasePtr();
7586     // The source constant may be larger than the subvector broadcast,
7587     // ensure we extract the correct subvector constants.
7588     if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
7589       Type *CstTy = Cst->getType();
7590       unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7591       unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
7592       if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
7593           (SizeInBits % SubVecSizeInBits) != 0)
7594         return false;
7595       unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
7596       unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
7597       unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
7598       APInt UndefSubElts(NumSubElts, 0);
7599       SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
7600                                         APInt(CstEltSizeInBits, 0));
7601       for (unsigned i = 0; i != NumSubElts; ++i) {
7602         if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
7603                                  UndefSubElts, i))
7604           return false;
7605         for (unsigned j = 1; j != NumSubVecs; ++j)
7606           SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
7607       }
7608       UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
7609                                      UndefSubElts);
7610       return CastBitData(UndefSubElts, SubEltBits);
7611     }
7612   }
7613
7614   // Extract a rematerialized scalar constant insertion.
7615   if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
7616       Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7617       isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
7618     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7619     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7620
7621     APInt UndefSrcElts(NumSrcElts, 0);
7622     SmallVector<APInt, 64> SrcEltBits;
7623     auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
7624     SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
7625     SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
7626     return CastBitData(UndefSrcElts, SrcEltBits);
7627   }
7628
7629   // Insert constant bits from a base and sub vector sources.
7630   if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
7631     // If bitcasts to larger elements we might lose track of undefs - don't
7632     // allow any to be safe.
7633     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7634     bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
7635
7636     APInt UndefSrcElts, UndefSubElts;
7637     SmallVector<APInt, 32> EltSrcBits, EltSubBits;
7638     if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
7639                                       UndefSubElts, EltSubBits,
7640                                       AllowWholeUndefs && AllowUndefs,
7641                                       AllowPartialUndefs && AllowUndefs) &&
7642         getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
7643                                       UndefSrcElts, EltSrcBits,
7644                                       AllowWholeUndefs && AllowUndefs,
7645                                       AllowPartialUndefs && AllowUndefs)) {
7646       unsigned BaseIdx = Op.getConstantOperandVal(2);
7647       UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
7648       for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
7649         EltSrcBits[BaseIdx + i] = EltSubBits[i];
7650       return CastBitData(UndefSrcElts, EltSrcBits);
7651     }
7652   }
7653
7654   // Extract constant bits from a subvector's source.
7655   if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7656     // TODO - support extract_subvector through bitcasts.
7657     if (EltSizeInBits != VT.getScalarSizeInBits())
7658       return false;
7659
7660     if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7661                                       UndefElts, EltBits, AllowWholeUndefs,
7662                                       AllowPartialUndefs)) {
7663       EVT SrcVT = Op.getOperand(0).getValueType();
7664       unsigned NumSrcElts = SrcVT.getVectorNumElements();
7665       unsigned NumSubElts = VT.getVectorNumElements();
7666       unsigned BaseIdx = Op.getConstantOperandVal(1);
7667       UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7668       if ((BaseIdx + NumSubElts) != NumSrcElts)
7669         EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7670       if (BaseIdx != 0)
7671         EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7672       return true;
7673     }
7674   }
7675
7676   // Extract constant bits from shuffle node sources.
7677   if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7678     // TODO - support shuffle through bitcasts.
7679     if (EltSizeInBits != VT.getScalarSizeInBits())
7680       return false;
7681
7682     ArrayRef<int> Mask = SVN->getMask();
7683     if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7684         llvm::any_of(Mask, [](int M) { return M < 0; }))
7685       return false;
7686
7687     APInt UndefElts0, UndefElts1;
7688     SmallVector<APInt, 32> EltBits0, EltBits1;
7689     if (isAnyInRange(Mask, 0, NumElts) &&
7690         !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7691                                        UndefElts0, EltBits0, AllowWholeUndefs,
7692                                        AllowPartialUndefs))
7693       return false;
7694     if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7695         !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7696                                        UndefElts1, EltBits1, AllowWholeUndefs,
7697                                        AllowPartialUndefs))
7698       return false;
7699
7700     UndefElts = APInt::getZero(NumElts);
7701     for (int i = 0; i != (int)NumElts; ++i) {
7702       int M = Mask[i];
7703       if (M < 0) {
7704         UndefElts.setBit(i);
7705         EltBits.push_back(APInt::getZero(EltSizeInBits));
7706       } else if (M < (int)NumElts) {
7707         if (UndefElts0[M])
7708           UndefElts.setBit(i);
7709         EltBits.push_back(EltBits0[M]);
7710       } else {
7711         if (UndefElts1[M - NumElts])
7712           UndefElts.setBit(i);
7713         EltBits.push_back(EltBits1[M - NumElts]);
7714       }
7715     }
7716     return true;
7717   }
7718
7719   return false;
7720 }
7721
7722 namespace llvm {
7723 namespace X86 {
7724 bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7725   APInt UndefElts;
7726   SmallVector<APInt, 16> EltBits;
7727   if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7728                                     UndefElts, EltBits, true,
7729                                     AllowPartialUndefs)) {
7730     int SplatIndex = -1;
7731     for (int i = 0, e = EltBits.size(); i != e; ++i) {
7732       if (UndefElts[i])
7733         continue;
7734       if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7735         SplatIndex = -1;
7736         break;
7737       }
7738       SplatIndex = i;
7739     }
7740     if (0 <= SplatIndex) {
7741       SplatVal = EltBits[SplatIndex];
7742       return true;
7743     }
7744   }
7745
7746   return false;
7747 }
7748 } // namespace X86
7749 } // namespace llvm
7750
7751 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7752                                         unsigned MaskEltSizeInBits,
7753                                         SmallVectorImpl<uint64_t> &RawMask,
7754                                         APInt &UndefElts) {
7755   // Extract the raw target constant bits.
7756   SmallVector<APInt, 64> EltBits;
7757   if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7758                                      EltBits, /* AllowWholeUndefs */ true,
7759                                      /* AllowPartialUndefs */ false))
7760     return false;
7761
7762   // Insert the extracted elements into the mask.
7763   for (const APInt &Elt : EltBits)
7764     RawMask.push_back(Elt.getZExtValue());
7765
7766   return true;
7767 }
7768
7769 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7770 /// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7771 /// Note: This ignores saturation, so inputs must be checked first.
7772 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7773                                   bool Unary, unsigned NumStages = 1) {
7774   assert(Mask.empty() && "Expected an empty shuffle mask vector");
7775   unsigned NumElts = VT.getVectorNumElements();
7776   unsigned NumLanes = VT.getSizeInBits() / 128;
7777   unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7778   unsigned Offset = Unary ? 0 : NumElts;
7779   unsigned Repetitions = 1u << (NumStages - 1);
7780   unsigned Increment = 1u << NumStages;
7781   assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7782
7783   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7784     for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7785       for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7786         Mask.push_back(Elt + (Lane * NumEltsPerLane));
7787       for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7788         Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7789     }
7790   }
7791 }
7792
7793 // Split the demanded elts of a PACKSS/PACKUS node between its operands.
7794 static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7795                                 APInt &DemandedLHS, APInt &DemandedRHS) {
7796   int NumLanes = VT.getSizeInBits() / 128;
7797   int NumElts = DemandedElts.getBitWidth();
7798   int NumInnerElts = NumElts / 2;
7799   int NumEltsPerLane = NumElts / NumLanes;
7800   int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7801
7802   DemandedLHS = APInt::getZero(NumInnerElts);
7803   DemandedRHS = APInt::getZero(NumInnerElts);
7804
7805   // Map DemandedElts to the packed operands.
7806   for (int Lane = 0; Lane != NumLanes; ++Lane) {
7807     for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7808       int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7809       int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7810       if (DemandedElts[OuterIdx])
7811         DemandedLHS.setBit(InnerIdx);
7812       if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7813         DemandedRHS.setBit(InnerIdx);
7814     }
7815   }
7816 }
7817
7818 // Split the demanded elts of a HADD/HSUB node between its operands.
7819 static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7820                                  APInt &DemandedLHS, APInt &DemandedRHS) {
7821   int NumLanes = VT.getSizeInBits() / 128;
7822   int NumElts = DemandedElts.getBitWidth();
7823   int NumEltsPerLane = NumElts / NumLanes;
7824   int HalfEltsPerLane = NumEltsPerLane / 2;
7825
7826   DemandedLHS = APInt::getZero(NumElts);
7827   DemandedRHS = APInt::getZero(NumElts);
7828
7829   // Map DemandedElts to the horizontal operands.
7830   for (int Idx = 0; Idx != NumElts; ++Idx) {
7831     if (!DemandedElts[Idx])
7832       continue;
7833     int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7834     int LocalIdx = Idx % NumEltsPerLane;
7835     if (LocalIdx < HalfEltsPerLane) {
7836       DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7837       DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7838     } else {
7839       LocalIdx -= HalfEltsPerLane;
7840       DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7841       DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7842     }
7843   }
7844 }
7845
7846 /// Calculates the shuffle mask corresponding to the target-specific opcode.
7847 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7848 /// operands in \p Ops, and returns true.
7849 /// Sets \p IsUnary to true if only one source is used. Note that this will set
7850 /// IsUnary for shuffles which use a single input multiple times, and in those
7851 /// cases it will adjust the mask to only have indices within that single input.
7852 /// It is an error to call this with non-empty Mask/Ops vectors.
7853 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7854                                  SmallVectorImpl<SDValue> &Ops,
7855                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
7856   unsigned NumElems = VT.getVectorNumElements();
7857   unsigned MaskEltSize = VT.getScalarSizeInBits();
7858   SmallVector<uint64_t, 32> RawMask;
7859   APInt RawUndefs;
7860   uint64_t ImmN;
7861
7862   assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7863   assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7864
7865   IsUnary = false;
7866   bool IsFakeUnary = false;
7867   switch (N->getOpcode()) {
7868   case X86ISD::BLENDI:
7869     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7870     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7871     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7872     DecodeBLENDMask(NumElems, ImmN, Mask);
7873     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7874     break;
7875   case X86ISD::SHUFP:
7876     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7877     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7878     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7879     DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7880     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7881     break;
7882   case X86ISD::INSERTPS:
7883     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7884     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7885     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7886     DecodeINSERTPSMask(ImmN, Mask);
7887     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7888     break;
7889   case X86ISD::EXTRQI:
7890     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7891     if (isa<ConstantSDNode>(N->getOperand(1)) &&
7892         isa<ConstantSDNode>(N->getOperand(2))) {
7893       int BitLen = N->getConstantOperandVal(1);
7894       int BitIdx = N->getConstantOperandVal(2);
7895       DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7896       IsUnary = true;
7897     }
7898     break;
7899   case X86ISD::INSERTQI:
7900     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7901     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7902     if (isa<ConstantSDNode>(N->getOperand(2)) &&
7903         isa<ConstantSDNode>(N->getOperand(3))) {
7904       int BitLen = N->getConstantOperandVal(2);
7905       int BitIdx = N->getConstantOperandVal(3);
7906       DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7907       IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7908     }
7909     break;
7910   case X86ISD::UNPCKH:
7911     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7912     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7913     DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7914     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7915     break;
7916   case X86ISD::UNPCKL:
7917     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7918     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7919     DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7920     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7921     break;
7922   case X86ISD::MOVHLPS:
7923     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7924     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7925     DecodeMOVHLPSMask(NumElems, Mask);
7926     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7927     break;
7928   case X86ISD::MOVLHPS:
7929     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7930     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7931     DecodeMOVLHPSMask(NumElems, Mask);
7932     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7933     break;
7934   case X86ISD::VALIGN:
7935     assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7936            "Only 32-bit and 64-bit elements are supported!");
7937     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7938     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7939     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7940     DecodeVALIGNMask(NumElems, ImmN, Mask);
7941     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7942     Ops.push_back(N->getOperand(1));
7943     Ops.push_back(N->getOperand(0));
7944     break;
7945   case X86ISD::PALIGNR:
7946     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7947     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7948     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7949     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7950     DecodePALIGNRMask(NumElems, ImmN, Mask);
7951     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7952     Ops.push_back(N->getOperand(1));
7953     Ops.push_back(N->getOperand(0));
7954     break;
7955   case X86ISD::VSHLDQ:
7956     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7957     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7958     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7959     DecodePSLLDQMask(NumElems, ImmN, Mask);
7960     IsUnary = true;
7961     break;
7962   case X86ISD::VSRLDQ:
7963     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7964     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7965     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7966     DecodePSRLDQMask(NumElems, ImmN, Mask);
7967     IsUnary = true;
7968     break;
7969   case X86ISD::PSHUFD:
7970   case X86ISD::VPERMILPI:
7971     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7972     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7973     DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7974     IsUnary = true;
7975     break;
7976   case X86ISD::PSHUFHW:
7977     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7978     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7979     DecodePSHUFHWMask(NumElems, ImmN, Mask);
7980     IsUnary = true;
7981     break;
7982   case X86ISD::PSHUFLW:
7983     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7984     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7985     DecodePSHUFLWMask(NumElems, ImmN, Mask);
7986     IsUnary = true;
7987     break;
7988   case X86ISD::VZEXT_MOVL:
7989     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7990     DecodeZeroMoveLowMask(NumElems, Mask);
7991     IsUnary = true;
7992     break;
7993   case X86ISD::VBROADCAST:
7994     // We only decode broadcasts of same-sized vectors, peeking through to
7995     // extracted subvectors is likely to cause hasOneUse issues with
7996     // SimplifyDemandedBits etc.
7997     if (N->getOperand(0).getValueType() == VT) {
7998       DecodeVectorBroadcast(NumElems, Mask);
7999       IsUnary = true;
8000       break;
8001     }
8002     return false;
8003   case X86ISD::VPERMILPV: {
8004     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8005     IsUnary = true;
8006     SDValue MaskNode = N->getOperand(1);
8007     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8008                                     RawUndefs)) {
8009       DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
8010       break;
8011     }
8012     return false;
8013   }
8014   case X86ISD::PSHUFB: {
8015     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
8016     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8017     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8018     IsUnary = true;
8019     SDValue MaskNode = N->getOperand(1);
8020     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
8021       DecodePSHUFBMask(RawMask, RawUndefs, Mask);
8022       break;
8023     }
8024     return false;
8025   }
8026   case X86ISD::VPERMI:
8027     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8028     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
8029     DecodeVPERMMask(NumElems, ImmN, Mask);
8030     IsUnary = true;
8031     break;
8032   case X86ISD::MOVSS:
8033   case X86ISD::MOVSD:
8034   case X86ISD::MOVSH:
8035     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8036     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8037     DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
8038     break;
8039   case X86ISD::VPERM2X128:
8040     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8041     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8042     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
8043     DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
8044     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
8045     break;
8046   case X86ISD::SHUF128:
8047     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8048     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8049     ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
8050     decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
8051     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
8052     break;
8053   case X86ISD::MOVSLDUP:
8054     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8055     DecodeMOVSLDUPMask(NumElems, Mask);
8056     IsUnary = true;
8057     break;
8058   case X86ISD::MOVSHDUP:
8059     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8060     DecodeMOVSHDUPMask(NumElems, Mask);
8061     IsUnary = true;
8062     break;
8063   case X86ISD::MOVDDUP:
8064     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8065     DecodeMOVDDUPMask(NumElems, Mask);
8066     IsUnary = true;
8067     break;
8068   case X86ISD::VPERMIL2: {
8069     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8070     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8071     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
8072     SDValue MaskNode = N->getOperand(2);
8073     SDValue CtrlNode = N->getOperand(3);
8074     if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
8075       unsigned CtrlImm = CtrlOp->getZExtValue();
8076       if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8077                                       RawUndefs)) {
8078         DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
8079                             Mask);
8080         break;
8081       }
8082     }
8083     return false;
8084   }
8085   case X86ISD::VPPERM: {
8086     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8087     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8088     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
8089     SDValue MaskNode = N->getOperand(2);
8090     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
8091       DecodeVPPERMMask(RawMask, RawUndefs, Mask);
8092       break;
8093     }
8094     return false;
8095   }
8096   case X86ISD::VPERMV: {
8097     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8098     IsUnary = true;
8099     // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
8100     Ops.push_back(N->getOperand(1));
8101     SDValue MaskNode = N->getOperand(0);
8102     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8103                                     RawUndefs)) {
8104       DecodeVPERMVMask(RawMask, RawUndefs, Mask);
8105       break;
8106     }
8107     return false;
8108   }
8109   case X86ISD::VPERMV3: {
8110     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8111     assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
8112     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
8113     // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
8114     Ops.push_back(N->getOperand(0));
8115     Ops.push_back(N->getOperand(2));
8116     SDValue MaskNode = N->getOperand(1);
8117     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8118                                     RawUndefs)) {
8119       DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
8120       break;
8121     }
8122     return false;
8123   }
8124   default: llvm_unreachable("unknown target shuffle node");
8125   }
8126
8127   // Empty mask indicates the decode failed.
8128   if (Mask.empty())
8129     return false;
8130
8131   // Check if we're getting a shuffle mask with zero'd elements.
8132   if (!AllowSentinelZero && isAnyZero(Mask))
8133     return false;
8134
8135   // If we have a fake unary shuffle, the shuffle mask is spread across two
8136   // inputs that are actually the same node. Re-map the mask to always point
8137   // into the first input.
8138   if (IsFakeUnary)
8139     for (int &M : Mask)
8140       if (M >= (int)Mask.size())
8141         M -= Mask.size();
8142
8143   // If we didn't already add operands in the opcode-specific code, default to
8144   // adding 1 or 2 operands starting at 0.
8145   if (Ops.empty()) {
8146     Ops.push_back(N->getOperand(0));
8147     if (!IsUnary || IsFakeUnary)
8148       Ops.push_back(N->getOperand(1));
8149   }
8150
8151   return true;
8152 }
8153
8154 // Wrapper for getTargetShuffleMask with InUnary;
8155 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
8156                                  SmallVectorImpl<SDValue> &Ops,
8157                                  SmallVectorImpl<int> &Mask) {
8158   bool IsUnary;
8159   return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
8160 }
8161
8162 /// Compute whether each element of a shuffle is zeroable.
8163 ///
8164 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8165 /// Either it is an undef element in the shuffle mask, the element of the input
8166 /// referenced is undef, or the element of the input referenced is known to be
8167 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8168 /// as many lanes with this technique as possible to simplify the remaining
8169 /// shuffle.
8170 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
8171                                            SDValue V1, SDValue V2,
8172                                            APInt &KnownUndef, APInt &KnownZero) {
8173   int Size = Mask.size();
8174   KnownUndef = KnownZero = APInt::getZero(Size);
8175
8176   V1 = peekThroughBitcasts(V1);
8177   V2 = peekThroughBitcasts(V2);
8178
8179   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8180   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8181
8182   int VectorSizeInBits = V1.getValueSizeInBits();
8183   int ScalarSizeInBits = VectorSizeInBits / Size;
8184   assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8185
8186   for (int i = 0; i < Size; ++i) {
8187     int M = Mask[i];
8188     // Handle the easy cases.
8189     if (M < 0) {
8190       KnownUndef.setBit(i);
8191       continue;
8192     }
8193     if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8194       KnownZero.setBit(i);
8195       continue;
8196     }
8197
8198     // Determine shuffle input and normalize the mask.
8199     SDValue V = M < Size ? V1 : V2;
8200     M %= Size;
8201
8202     // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8203     if (V.getOpcode() != ISD::BUILD_VECTOR)
8204       continue;
8205
8206     // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
8207     // the (larger) source element must be UNDEF/ZERO.
8208     if ((Size % V.getNumOperands()) == 0) {
8209       int Scale = Size / V->getNumOperands();
8210       SDValue Op = V.getOperand(M / Scale);
8211       if (Op.isUndef())
8212         KnownUndef.setBit(i);
8213       if (X86::isZeroNode(Op))
8214         KnownZero.setBit(i);
8215       else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8216         APInt Val = Cst->getAPIntValue();
8217         Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8218         if (Val == 0)
8219           KnownZero.setBit(i);
8220       } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8221         APInt Val = Cst->getValueAPF().bitcastToAPInt();
8222         Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8223         if (Val == 0)
8224           KnownZero.setBit(i);
8225       }
8226       continue;
8227     }
8228
8229     // If the BUILD_VECTOR has more elements then all the (smaller) source
8230     // elements must be UNDEF or ZERO.
8231     if ((V.getNumOperands() % Size) == 0) {
8232       int Scale = V->getNumOperands() / Size;
8233       bool AllUndef = true;
8234       bool AllZero = true;
8235       for (int j = 0; j < Scale; ++j) {
8236         SDValue Op = V.getOperand((M * Scale) + j);
8237         AllUndef &= Op.isUndef();
8238         AllZero &= X86::isZeroNode(Op);
8239       }
8240       if (AllUndef)
8241         KnownUndef.setBit(i);
8242       if (AllZero)
8243         KnownZero.setBit(i);
8244       continue;
8245     }
8246   }
8247 }
8248
8249 /// Decode a target shuffle mask and inputs and see if any values are
8250 /// known to be undef or zero from their inputs.
8251 /// Returns true if the target shuffle mask was decoded.
8252 /// FIXME: Merge this with computeZeroableShuffleElements?
8253 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
8254                                          SmallVectorImpl<SDValue> &Ops,
8255                                          APInt &KnownUndef, APInt &KnownZero) {
8256   bool IsUnary;
8257   if (!isTargetShuffle(N.getOpcode()))
8258     return false;
8259
8260   MVT VT = N.getSimpleValueType();
8261   if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
8262     return false;
8263
8264   int Size = Mask.size();
8265   SDValue V1 = Ops[0];
8266   SDValue V2 = IsUnary ? V1 : Ops[1];
8267   KnownUndef = KnownZero = APInt::getZero(Size);
8268
8269   V1 = peekThroughBitcasts(V1);
8270   V2 = peekThroughBitcasts(V2);
8271
8272   assert((VT.getSizeInBits() % Size) == 0 &&
8273          "Illegal split of shuffle value type");
8274   unsigned EltSizeInBits = VT.getSizeInBits() / Size;
8275
8276   // Extract known constant input data.
8277   APInt UndefSrcElts[2];
8278   SmallVector<APInt, 32> SrcEltBits[2];
8279   bool IsSrcConstant[2] = {
8280       getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
8281                                     SrcEltBits[0], true, false),
8282       getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
8283                                     SrcEltBits[1], true, false)};
8284
8285   for (int i = 0; i < Size; ++i) {
8286     int M = Mask[i];
8287
8288     // Already decoded as SM_SentinelZero / SM_SentinelUndef.
8289     if (M < 0) {
8290       assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
8291       if (SM_SentinelUndef == M)
8292         KnownUndef.setBit(i);
8293       if (SM_SentinelZero == M)
8294         KnownZero.setBit(i);
8295       continue;
8296     }
8297
8298     // Determine shuffle input and normalize the mask.
8299     unsigned SrcIdx = M / Size;
8300     SDValue V = M < Size ? V1 : V2;
8301     M %= Size;
8302
8303     // We are referencing an UNDEF input.
8304     if (V.isUndef()) {
8305       KnownUndef.setBit(i);
8306       continue;
8307     }
8308
8309     // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
8310     // TODO: We currently only set UNDEF for integer types - floats use the same
8311     // registers as vectors and many of the scalar folded loads rely on the
8312     // SCALAR_TO_VECTOR pattern.
8313     if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
8314         (Size % V.getValueType().getVectorNumElements()) == 0) {
8315       int Scale = Size / V.getValueType().getVectorNumElements();
8316       int Idx = M / Scale;
8317       if (Idx != 0 && !VT.isFloatingPoint())
8318         KnownUndef.setBit(i);
8319       else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
8320         KnownZero.setBit(i);
8321       continue;
8322     }
8323
8324     // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
8325     // base vectors.
8326     if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
8327       SDValue Vec = V.getOperand(0);
8328       int NumVecElts = Vec.getValueType().getVectorNumElements();
8329       if (Vec.isUndef() && Size == NumVecElts) {
8330         int Idx = V.getConstantOperandVal(2);
8331         int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
8332         if (M < Idx || (Idx + NumSubElts) <= M)
8333           KnownUndef.setBit(i);
8334       }
8335       continue;
8336     }
8337
8338     // Attempt to extract from the source's constant bits.
8339     if (IsSrcConstant[SrcIdx]) {
8340       if (UndefSrcElts[SrcIdx][M])
8341         KnownUndef.setBit(i);
8342       else if (SrcEltBits[SrcIdx][M] == 0)
8343         KnownZero.setBit(i);
8344     }
8345   }
8346
8347   assert(VT.getVectorNumElements() == (unsigned)Size &&
8348          "Different mask size from vector size!");
8349   return true;
8350 }
8351
8352 // Replace target shuffle mask elements with known undef/zero sentinels.
8353 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
8354                                               const APInt &KnownUndef,
8355                                               const APInt &KnownZero,
8356                                               bool ResolveKnownZeros= true) {
8357   unsigned NumElts = Mask.size();
8358   assert(KnownUndef.getBitWidth() == NumElts &&
8359          KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
8360
8361   for (unsigned i = 0; i != NumElts; ++i) {
8362     if (KnownUndef[i])
8363       Mask[i] = SM_SentinelUndef;
8364     else if (ResolveKnownZeros && KnownZero[i])
8365       Mask[i] = SM_SentinelZero;
8366   }
8367 }
8368
8369 // Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
8370 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
8371                                               APInt &KnownUndef,
8372                                               APInt &KnownZero) {
8373   unsigned NumElts = Mask.size();
8374   KnownUndef = KnownZero = APInt::getZero(NumElts);
8375
8376   for (unsigned i = 0; i != NumElts; ++i) {
8377     int M = Mask[i];
8378     if (SM_SentinelUndef == M)
8379       KnownUndef.setBit(i);
8380     if (SM_SentinelZero == M)
8381       KnownZero.setBit(i);
8382   }
8383 }
8384
8385 // Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
8386 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
8387                                          SDValue Cond, bool IsBLENDV = false) {
8388   EVT CondVT = Cond.getValueType();
8389   unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
8390   unsigned NumElts = CondVT.getVectorNumElements();
8391
8392   APInt UndefElts;
8393   SmallVector<APInt, 32> EltBits;
8394   if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
8395                                      true, false))
8396     return false;
8397
8398   Mask.resize(NumElts, SM_SentinelUndef);
8399
8400   for (int i = 0; i != (int)NumElts; ++i) {
8401     Mask[i] = i;
8402     // Arbitrarily choose from the 2nd operand if the select condition element
8403     // is undef.
8404     // TODO: Can we do better by matching patterns such as even/odd?
8405     if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
8406         (IsBLENDV && EltBits[i].isNonNegative()))
8407       Mask[i] += NumElts;
8408   }
8409
8410   return true;
8411 }
8412
// Forward declaration (for getFauxShuffleMask recursive check).
// getFauxShuffleMask calls this overload with Depth + 1 when it decodes the
// operands of OR and INSERT_SUBVECTOR nodes, so it has to be visible before
// getFauxShuffleMask is defined.
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
                                   SmallVectorImpl<SDValue> &Inputs,
                                   SmallVectorImpl<int> &Mask,
                                   const SelectionDAG &DAG, unsigned Depth,
                                   bool ResolveKnownElts);
8419
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
//
// Mask and Ops are cleared on entry. On success (return true) Mask holds the
// decoded shuffle mask and Ops the shuffle inputs it refers to. On failure
// (return false) Mask/Ops may be left partially filled and must not be used.
//
// DemandedElts must have one bit per element of N's value type; it is only
// consulted by the PACKSS/PACKUS case. Depth is forwarded (as Depth + 1) to
// the recursive DAG queries, and ResolveKnownElts is forwarded to the
// recursive getTargetShuffleInputs call of the INSERT_SUBVECTOR case.
// TODO: Merge into getTargetShuffleInputs()
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
                               SmallVectorImpl<int> &Mask,
                               SmallVectorImpl<SDValue> &Ops,
                               const SelectionDAG &DAG, unsigned Depth,
                               bool ResolveKnownElts) {
  Mask.clear();
  Ops.clear();

  MVT VT = N.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumSizeInBits = VT.getSizeInBits();
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
  // Several cases below build byte-level masks, so only whole-byte element
  // and vector sizes are handled.
  if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
    return false;
  assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
  unsigned NumSizeInBytes = NumSizeInBits / 8;
  unsigned NumBytesPerElt = NumBitsPerElt / 8;

  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case ISD::VECTOR_SHUFFLE: {
    // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
    ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
    if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
      Mask.append(ShuffleMask.begin(), ShuffleMask.end());
      Ops.push_back(N.getOperand(0));
      Ops.push_back(N.getOperand(1));
      return true;
    }
    return false;
  }
  case ISD::AND:
  case X86ISD::ANDNP: {
    // Attempt to decode as a per-byte mask.
    // For AND the constant is taken from operand 1 and a 0x00 byte zeroes the
    // result byte; for ANDNP it is taken from operand 0 and a 0xFF byte zeroes
    // the result byte. The other operand becomes the single shuffle input.
    APInt UndefElts;
    SmallVector<APInt, 32> EltBits;
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    bool IsAndN = (X86ISD::ANDNP == Opcode);
    uint64_t ZeroMask = IsAndN ? 255 : 0;
    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
      return false;
    // We can't assume an undef src element gives an undef dst - the other src
    // might be zero.
    if (!UndefElts.isZero())
      return false;
    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
      const APInt &ByteBits = EltBits[i];
      // Only all-zeros / all-ones bytes can be expressed as a shuffle.
      if (ByteBits != 0 && ByteBits != 255)
        return false;
      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
    }
    Ops.push_back(IsAndN ? N1 : N0);
    return true;
  }
  case ISD::OR: {
    // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
    // is a valid shuffle index.
    SDValue N0 = peekThroughBitcasts(N.getOperand(0));
    SDValue N1 = peekThroughBitcasts(N.getOperand(1));
    if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
      return false;

    // Decode both operands as shuffles, demanding all of their elements and
    // resolving known undef/zero elements to sentinels.
    SmallVector<int, 64> SrcMask0, SrcMask1;
    SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
    APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
    APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
    if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
                                Depth + 1, true) ||
        !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
                                Depth + 1, true))
      return false;

    // Bring both masks to the larger of the two element counts so they can be
    // compared element by element.
    size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
    SmallVector<int, 64> Mask0, Mask1;
    narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
    narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
    for (int i = 0; i != (int)MaskSize; ++i) {
      // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
      // loops converting between OR and BLEND shuffles due to
      // canWidenShuffleElements merging away undef elements, meaning we
      // fail to recognise the OR as the undef element isn't known zero.
      if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
        Mask.push_back(SM_SentinelZero);
      else if (Mask1[i] == SM_SentinelZero)
        Mask.push_back(i);
      else if (Mask0[i] == SM_SentinelZero)
        Mask.push_back(i + MaskSize);
      else
        return false;
    }
    // The resulting mask selects between the OR operands themselves, not the
    // inputs of the decoded sub-shuffles.
    Ops.push_back(N0);
    Ops.push_back(N1);
    return true;
  }
  case ISD::INSERT_SUBVECTOR: {
    SDValue Src = N.getOperand(0);
    SDValue Sub = N.getOperand(1);
    EVT SubVT = Sub.getValueType();
    unsigned NumSubElts = SubVT.getVectorNumElements();
    // Bail out unless N is the only user of the inserted subvector.
    if (!N->isOnlyUserOf(Sub.getNode()))
      return false;
    uint64_t InsertIdx = N.getConstantOperandVal(2);
    // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
    if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        Sub.getOperand(0).getValueType() == VT) {
      uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
      // Start from an identity mask over SRC0, then redirect the inserted
      // range to the extracted elements of SRC1 (second input).
      for (int i = 0; i != (int)NumElts; ++i)
        Mask.push_back(i);
      for (int i = 0; i != (int)NumSubElts; ++i)
        Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
      Ops.push_back(Src);
      Ops.push_back(Sub.getOperand(0));
      return true;
    }
    // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
    SmallVector<int, 64> SubMask;
    SmallVector<SDValue, 2> SubInputs;
    SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
    EVT SubSrcVT = SubSrc.getValueType();
    if (!SubSrcVT.isVector())
      return false;

    APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
    if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
                                Depth + 1, ResolveKnownElts))
      return false;

    // Subvector shuffle inputs must not be larger than the subvector.
    if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
          return SubVT.getFixedSizeInBits() <
                 SubInput.getValueSizeInBits().getFixedValue();
        }))
      return false;

    // Reconcile the decoded sub-mask's element count with the subvector's:
    // either narrow the sub-mask, or scale up the element counts and the
    // insertion index used for the outer mask.
    if (SubMask.size() != NumSubElts) {
      assert(((SubMask.size() % NumSubElts) == 0 ||
              (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
      if ((NumSubElts % SubMask.size()) == 0) {
        int Scale = NumSubElts / SubMask.size();
        SmallVector<int,64> ScaledSubMask;
        narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
        SubMask = ScaledSubMask;
      } else {
        int Scale = SubMask.size() / NumSubElts;
        NumSubElts = SubMask.size();
        NumElts *= Scale;
        InsertIdx *= Scale;
      }
    }
    // Input 0 is the base vector; the sub-shuffle's inputs follow it.
    Ops.push_back(Src);
    Ops.append(SubInputs.begin(), SubInputs.end());
    if (ISD::isBuildVectorAllZeros(Src.getNode()))
      Mask.append(NumElts, SM_SentinelZero);
    else
      for (int i = 0; i != (int)NumElts; ++i)
        Mask.push_back(i);
    for (int i = 0; i != (int)NumSubElts; ++i) {
      int M = SubMask[i];
      // Non-sentinel entries are rebased: sub-shuffle input InputIdx is
      // input (1 + InputIdx) of the combined shuffle.
      if (0 <= M) {
        int InputIdx = M / NumSubElts;
        M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
      }
      Mask[i + InsertIdx] = M;
    }
    return true;
  }
  case X86ISD::PINSRB:
  case X86ISD::PINSRW:
  case ISD::SCALAR_TO_VECTOR:
  case ISD::INSERT_VECTOR_ELT: {
    // Match against an insert_vector_elt/scalar_to_vector of an extract from a
    // vector, for matching src/dst vector types.
    // The scalar is operand 0 of SCALAR_TO_VECTOR and operand 1 of the insert
    // nodes.
    SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);

    // SCALAR_TO_VECTOR always writes element 0.
    unsigned DstIdx = 0;
    if (Opcode != ISD::SCALAR_TO_VECTOR) {
      // Check we have an in-range constant insertion index.
      if (!isa<ConstantSDNode>(N.getOperand(2)) ||
          N.getConstantOperandAPInt(2).uge(NumElts))
        return false;
      DstIdx = N.getConstantOperandVal(2);

      // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
      if (X86::isZeroNode(Scl)) {
        Ops.push_back(N.getOperand(0));
        for (unsigned i = 0; i != NumElts; ++i)
          Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
        return true;
      }
    }

    // Peek through trunc/aext/zext.
    // MinBitsPerElt tracks the narrowest scalar width seen on the way; bytes
    // of the destination element beyond it are zero-filled below.
    // TODO: aext shouldn't require SM_SentinelZero padding.
    // TODO: handle shift of scalars.
    unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
    while (Scl.getOpcode() == ISD::TRUNCATE ||
           Scl.getOpcode() == ISD::ANY_EXTEND ||
           Scl.getOpcode() == ISD::ZERO_EXTEND) {
      Scl = Scl.getOperand(0);
      MinBitsPerElt =
          std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
    }
    if ((MinBitsPerElt % 8) != 0)
      return false;

    // Attempt to find the source vector the scalar was extracted from.
    // The source vector must have the same total size as the result.
    SDValue SrcExtract;
    if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
         Scl.getOpcode() == X86ISD::PEXTRW ||
         Scl.getOpcode() == X86ISD::PEXTRB) &&
        Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
      SrcExtract = Scl;
    }
    // The extraction index must be a constant too.
    if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
      return false;

    SDValue SrcVec = SrcExtract.getOperand(0);
    EVT SrcVT = SrcVec.getValueType();
    if (!SrcVT.getScalarType().isByteSized())
      return false;
    // Convert the source/destination element indices to byte offsets.
    unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
    unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
    unsigned DstByte = DstIdx * NumBytesPerElt;
    MinBitsPerElt =
        std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());

    // Create 'identity' byte level shuffle mask and then add inserted bytes.
    // For SCALAR_TO_VECTOR every other byte is undef; for the insert nodes the
    // other bytes come from the vector being inserted into (second input).
    if (Opcode == ISD::SCALAR_TO_VECTOR) {
      Ops.push_back(SrcVec);
      Mask.append(NumSizeInBytes, SM_SentinelUndef);
    } else {
      Ops.push_back(SrcVec);
      Ops.push_back(N.getOperand(0));
      for (int i = 0; i != (int)NumSizeInBytes; ++i)
        Mask.push_back(NumSizeInBytes + i);
    }

    // Copy the extracted bytes and zero-fill the rest of the destination
    // element.
    unsigned MinBytesPerElts = MinBitsPerElt / 8;
    MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
    for (unsigned i = 0; i != MinBytesPerElts; ++i)
      Mask[DstByte + i] = SrcByte + i;
    for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
      Mask[DstByte + i] = SM_SentinelZero;
    return true;
  }
  case X86ISD::PACKSS:
  case X86ISD::PACKUS: {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
           N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
           "Unexpected input value type");

    // Split the demanded result elements into per-operand demanded elements.
    APInt EltsLHS, EltsRHS;
    getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);

    // If we know input saturation won't happen (or we don't care for particular
    // lanes), we can treat this as a truncation shuffle.
    bool Offset0 = false, Offset1 = false;
    if (Opcode == X86ISD::PACKSS) {
      // Each demanded, non-undef input needs more than NumBitsPerElt sign
      // bits, otherwise give up.
      if ((!(N0.isUndef() || EltsLHS.isZero()) &&
           DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
          (!(N1.isUndef() || EltsRHS.isZero()) &&
           DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
        return false;
      // We can't easily fold ASHR into a shuffle, but if it was feeding a
      // PACKSS then it was likely being used for sign-extension for a
      // truncation, so just peek through and adjust the mask accordingly.
      if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
          N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
        Offset0 = true;
        N0 = N0.getOperand(0);
      }
      if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
          N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
        Offset1 = true;
        N1 = N1.getOperand(0);
      }
    } else {
      // PACKUS: the upper NumBitsPerElt bits of every demanded, non-undef
      // input element must be known zero, otherwise give up.
      APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
      if ((!(N0.isUndef() || EltsLHS.isZero()) &&
           !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
          (!(N1.isUndef() || EltsRHS.isZero()) &&
           !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
        return false;
    }

    bool IsUnary = (N0 == N1);

    Ops.push_back(N0);
    if (!IsUnary)
      Ops.push_back(N1);

    createPackShuffleMask(VT, Mask, IsUnary);

    // Bump every mask index that refers to an input whose VSRAI was peeked
    // through above (first input: [0, NumElts), second: [NumElts, 2*NumElts)).
    if (Offset0 || Offset1) {
      for (int &M : Mask)
        if ((Offset0 && isInRange(M, 0, NumElts)) ||
            (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
          ++M;
    }
    return true;
  }
  case ISD::VSELECT:
  case X86ISD::BLENDV: {
    // Operand 0 is the condition; operands 1 and 2 are the values selected
    // between and become the two shuffle inputs.
    SDValue Cond = N.getOperand(0);
    if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
      Ops.push_back(N.getOperand(1));
      Ops.push_back(N.getOperand(2));
      return true;
    }
    return false;
  }
  case X86ISD::VTRUNC: {
    SDValue Src = N.getOperand(0);
    EVT SrcVT = Src.getValueType();
    // Truncated source must be a simple vector.
    if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
        (SrcVT.getScalarSizeInBits() % 8) != 0)
      return false;
    unsigned NumSrcElts = SrcVT.getVectorNumElements();
    unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
    unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
    assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
    // Result element i takes index i * Scale; the remaining result elements
    // are zero.
    for (unsigned i = 0; i != NumSrcElts; ++i)
      Mask.push_back(i * Scale);
    Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
    Ops.push_back(Src);
    return true;
  }
  case X86ISD::VSHLI:
  case X86ISD::VSRLI: {
    uint64_t ShiftVal = N.getConstantOperandVal(1);
    // Out of range bit shifts are guaranteed to be zero.
    if (NumBitsPerElt <= ShiftVal) {
      Mask.append(NumElts, SM_SentinelZero);
      return true;
    }

    // We can only decode 'whole byte' bit shifts as shuffles.
    // (break leaves the switch and reaches the final 'return false'.)
    if ((ShiftVal % 8) != 0)
      break;

    uint64_t ByteShift = ShiftVal / 8;
    Ops.push_back(N.getOperand(0));

    // Clear mask to all zeros and insert the shifted byte indices.
    Mask.append(NumSizeInBytes, SM_SentinelZero);

    // Within each element, VSHLI moves bytes to higher indices and VSRLI to
    // lower indices; the vacated bytes keep their SM_SentinelZero entry.
    if (X86ISD::VSHLI == Opcode) {
      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j] = i + j - ByteShift;
    } else {
      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j - ByteShift] = i + j;
    }
    return true;
  }
  case X86ISD::VROTLI:
  case X86ISD::VROTRI: {
    // We can only decode 'whole byte' bit rotates as shuffles.
    // The rotate amount is first reduced modulo the element width.
    uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
    if ((RotateVal % 8) != 0)
      return false;
    Ops.push_back(N.getOperand(0));
    // Express both directions as a single byte offset within the element.
    int Offset = RotateVal / 8;
    Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
    for (int i = 0; i != (int)NumElts; ++i) {
      int BaseIdx = i * NumBytesPerElt;
      for (int j = 0; j != (int)NumBytesPerElt; ++j) {
        Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
      }
    }
    return true;
  }
  case X86ISD::VBROADCAST: {
    SDValue Src = N.getOperand(0);
    // A scalar source is only accepted when it is element 0 extracted from a
    // vector whose scalar type matches the result's; use that vector instead.
    if (!Src.getSimpleValueType().isVector()) {
      if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          !isNullConstant(Src.getOperand(1)) ||
          Src.getOperand(0).getValueType().getScalarType() !=
              VT.getScalarType())
        return false;
      Src = Src.getOperand(0);
    }
    // Every result element reads element 0 of the source.
    Ops.push_back(Src);
    Mask.append(NumElts, 0);
    return true;
  }
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case ISD::ANY_EXTEND_VECTOR_INREG: {
    SDValue Src = N.getOperand(0);
    EVT SrcVT = Src.getValueType();

    // Extended source must be a simple vector.
    if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
        (SrcVT.getScalarSizeInBits() % 8) != 0)
      return false;

    // The mask itself is produced by DecodeZeroExtendMask, which is told
    // whether this is an any-extend rather than a zero-extend.
    bool IsAnyExtend =
        (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
    DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
                         IsAnyExtend, Mask);
    Ops.push_back(Src);
    return true;
  }
  }

  // Unhandled opcode, or a case that gave up via 'break'.
  return false;
}
8839
8840 /// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
8841 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8842                                               SmallVectorImpl<int> &Mask) {
8843   int MaskWidth = Mask.size();
8844   SmallVector<SDValue, 16> UsedInputs;
8845   for (int i = 0, e = Inputs.size(); i < e; ++i) {
8846     int lo = UsedInputs.size() * MaskWidth;
8847     int hi = lo + MaskWidth;
8848
8849     // Strip UNDEF input usage.
8850     if (Inputs[i].isUndef())
8851       for (int &M : Mask)
8852         if ((lo <= M) && (M < hi))
8853           M = SM_SentinelUndef;
8854
8855     // Check for unused inputs.
8856     if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8857       for (int &M : Mask)
8858         if (lo <= M)
8859           M -= MaskWidth;
8860       continue;
8861     }
8862
8863     // Check for repeated inputs.
8864     bool IsRepeat = false;
8865     for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8866       if (UsedInputs[j] != Inputs[i])
8867         continue;
8868       for (int &M : Mask)
8869         if (lo <= M)
8870           M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8871       IsRepeat = true;
8872       break;
8873     }
8874     if (IsRepeat)
8875       continue;
8876
8877     UsedInputs.push_back(Inputs[i]);
8878   }
8879   Inputs = UsedInputs;
8880 }
8881
8882 /// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8883 /// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8884 /// Returns true if the target shuffle mask was decoded.
8885 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8886                                    SmallVectorImpl<SDValue> &Inputs,
8887                                    SmallVectorImpl<int> &Mask,
8888                                    APInt &KnownUndef, APInt &KnownZero,
8889                                    const SelectionDAG &DAG, unsigned Depth,
8890                                    bool ResolveKnownElts) {
8891   if (Depth >= SelectionDAG::MaxRecursionDepth)
8892     return false; // Limit search depth.
8893
8894   EVT VT = Op.getValueType();
8895   if (!VT.isSimple() || !VT.isVector())
8896     return false;
8897
8898   if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8899     if (ResolveKnownElts)
8900       resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8901     return true;
8902   }
8903   if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8904                          ResolveKnownElts)) {
8905     resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8906     return true;
8907   }
8908   return false;
8909 }
8910
8911 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8912                                    SmallVectorImpl<SDValue> &Inputs,
8913                                    SmallVectorImpl<int> &Mask,
8914                                    const SelectionDAG &DAG, unsigned Depth,
8915                                    bool ResolveKnownElts) {
8916   APInt KnownUndef, KnownZero;
8917   return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8918                                 KnownZero, DAG, Depth, ResolveKnownElts);
8919 }
8920
8921 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8922                                    SmallVectorImpl<int> &Mask,
8923                                    const SelectionDAG &DAG, unsigned Depth = 0,
8924                                    bool ResolveKnownElts = true) {
8925   EVT VT = Op.getValueType();
8926   if (!VT.isSimple() || !VT.isVector())
8927     return false;
8928
8929   unsigned NumElts = Op.getValueType().getVectorNumElements();
8930   APInt DemandedElts = APInt::getAllOnes(NumElts);
8931   return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
8932                                 ResolveKnownElts);
8933 }
8934
8935 // Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8936 static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8937                                  EVT MemVT, MemSDNode *Mem, unsigned Offset,
8938                                  SelectionDAG &DAG) {
8939   assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8940           Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8941          "Unknown broadcast load type");
8942
8943   // Ensure this is a simple (non-atomic, non-voltile), temporal read memop.
8944   if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8945     return SDValue();
8946
8947   SDValue Ptr =
8948       DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8949   SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8950   SDValue Ops[] = {Mem->getChain(), Ptr};
8951   SDValue BcstLd = DAG.getMemIntrinsicNode(
8952       Opcode, DL, Tys, Ops, MemVT,
8953       DAG.getMachineFunction().getMachineMemOperand(
8954           Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8955   DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8956   return BcstLd;
8957 }
8958
8959 /// Returns the scalar element that will make up the i'th
8960 /// element of the result of the vector shuffle.
8961 static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8962                                    SelectionDAG &DAG, unsigned Depth) {
8963   if (Depth >= SelectionDAG::MaxRecursionDepth)
8964     return SDValue(); // Limit search depth.
8965
8966   EVT VT = Op.getValueType();
8967   unsigned Opcode = Op.getOpcode();
8968   unsigned NumElems = VT.getVectorNumElements();
8969
8970   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8971   if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8972     int Elt = SV->getMaskElt(Index);
8973
8974     if (Elt < 0)
8975       return DAG.getUNDEF(VT.getVectorElementType());
8976
8977     SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8978     return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8979   }
8980
8981   // Recurse into target specific vector shuffles to find scalars.
8982   if (isTargetShuffle(Opcode)) {
8983     MVT ShufVT = VT.getSimpleVT();
8984     MVT ShufSVT = ShufVT.getVectorElementType();
8985     int NumElems = (int)ShufVT.getVectorNumElements();
8986     SmallVector<int, 16> ShuffleMask;
8987     SmallVector<SDValue, 16> ShuffleOps;
8988     if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8989                               ShuffleMask))
8990       return SDValue();
8991
8992     int Elt = ShuffleMask[Index];
8993     if (Elt == SM_SentinelZero)
8994       return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8995                                  : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8996     if (Elt == SM_SentinelUndef)
8997       return DAG.getUNDEF(ShufSVT);
8998
8999     assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
9000     SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
9001     return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
9002   }
9003
9004   // Recurse into insert_subvector base/sub vector to find scalars.
9005   if (Opcode == ISD::INSERT_SUBVECTOR) {
9006     SDValue Vec = Op.getOperand(0);
9007     SDValue Sub = Op.getOperand(1);
9008     uint64_t SubIdx = Op.getConstantOperandVal(2);
9009     unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
9010
9011     if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
9012       return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
9013     return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
9014   }
9015
9016   // Recurse into concat_vectors sub vector to find scalars.
9017   if (Opcode == ISD::CONCAT_VECTORS) {
9018     EVT SubVT = Op.getOperand(0).getValueType();
9019     unsigned NumSubElts = SubVT.getVectorNumElements();
9020     uint64_t SubIdx = Index / NumSubElts;
9021     uint64_t SubElt = Index % NumSubElts;
9022     return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
9023   }
9024
9025   // Recurse into extract_subvector src vector to find scalars.
9026   if (Opcode == ISD::EXTRACT_SUBVECTOR) {
9027     SDValue Src = Op.getOperand(0);
9028     uint64_t SrcIdx = Op.getConstantOperandVal(1);
9029     return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
9030   }
9031
9032   // We only peek through bitcasts of the same vector width.
9033   if (Opcode == ISD::BITCAST) {
9034     SDValue Src = Op.getOperand(0);
9035     EVT SrcVT = Src.getValueType();
9036     if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
9037       return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
9038     return SDValue();
9039   }
9040
9041   // Actual nodes that may contain scalar elements
9042
9043   // For insert_vector_elt - either return the index matching scalar or recurse
9044   // into the base vector.
9045   if (Opcode == ISD::INSERT_VECTOR_ELT &&
9046       isa<ConstantSDNode>(Op.getOperand(2))) {
9047     if (Op.getConstantOperandAPInt(2) == Index)
9048       return Op.getOperand(1);
9049     return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
9050   }
9051
9052   if (Opcode == ISD::SCALAR_TO_VECTOR)
9053     return (Index == 0) ? Op.getOperand(0)
9054                         : DAG.getUNDEF(VT.getVectorElementType());
9055
9056   if (Opcode == ISD::BUILD_VECTOR)
9057     return Op.getOperand(Index);
9058
9059   return SDValue();
9060 }
9061
9062 // Use PINSRB/PINSRW/PINSRD to create a build vector.
9063 static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
9064                                         unsigned NumNonZero, unsigned NumZero,
9065                                         SelectionDAG &DAG,
9066                                         const X86Subtarget &Subtarget) {
9067   MVT VT = Op.getSimpleValueType();
9068   unsigned NumElts = VT.getVectorNumElements();
9069   assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
9070           ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
9071          "Illegal vector insertion");
9072
9073   SDLoc dl(Op);
9074   SDValue V;
9075   bool First = true;
9076
9077   for (unsigned i = 0; i < NumElts; ++i) {
9078     bool IsNonZero = NonZeroMask[i];
9079     if (!IsNonZero)
9080       continue;
9081
9082     // If the build vector contains zeros or our first insertion is not the
9083     // first index then insert into zero vector to break any register
9084     // dependency else use SCALAR_TO_VECTOR.
9085     if (First) {
9086       First = false;
9087       if (NumZero || 0 != i)
9088         V = getZeroVector(VT, Subtarget, DAG, dl);
9089       else {
9090         assert(0 == i && "Expected insertion into zero-index");
9091         V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9092         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
9093         V = DAG.getBitcast(VT, V);
9094         continue;
9095       }
9096     }
9097     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
9098                     DAG.getIntPtrConstant(i, dl));
9099   }
9100
9101   return V;
9102 }
9103
9104 /// Custom lower build_vector of v16i8.
9105 static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
9106                                      unsigned NumNonZero, unsigned NumZero,
9107                                      SelectionDAG &DAG,
9108                                      const X86Subtarget &Subtarget) {
9109   if (NumNonZero > 8 && !Subtarget.hasSSE41())
9110     return SDValue();
9111
9112   // SSE4.1 - use PINSRB to insert each byte directly.
9113   if (Subtarget.hasSSE41())
9114     return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9115                                     Subtarget);
9116
9117   SDLoc dl(Op);
9118   SDValue V;
9119
9120   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
9121   for (unsigned i = 0; i < 16; i += 2) {
9122     bool ThisIsNonZero = NonZeroMask[i];
9123     bool NextIsNonZero = NonZeroMask[i + 1];
9124     if (!ThisIsNonZero && !NextIsNonZero)
9125       continue;
9126
9127     // FIXME: Investigate combining the first 4 bytes as a i32 instead.
9128     SDValue Elt;
9129     if (ThisIsNonZero) {
9130       if (NumZero || NextIsNonZero)
9131         Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9132       else
9133         Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9134     }
9135
9136     if (NextIsNonZero) {
9137       SDValue NextElt = Op.getOperand(i + 1);
9138       if (i == 0 && NumZero)
9139         NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
9140       else
9141         NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
9142       NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
9143                             DAG.getConstant(8, dl, MVT::i8));
9144       if (ThisIsNonZero)
9145         Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
9146       else
9147         Elt = NextElt;
9148     }
9149
9150     // If our first insertion is not the first index or zeros are needed, then
9151     // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
9152     // elements undefined).
9153     if (!V) {
9154       if (i != 0 || NumZero)
9155         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
9156       else {
9157         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
9158         V = DAG.getBitcast(MVT::v8i16, V);
9159         continue;
9160       }
9161     }
9162     Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
9163     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
9164                     DAG.getIntPtrConstant(i / 2, dl));
9165   }
9166
9167   return DAG.getBitcast(MVT::v16i8, V);
9168 }
9169
9170 /// Custom lower build_vector of v8i16.
9171 static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
9172                                      unsigned NumNonZero, unsigned NumZero,
9173                                      SelectionDAG &DAG,
9174                                      const X86Subtarget &Subtarget) {
9175   if (NumNonZero > 4 && !Subtarget.hasSSE41())
9176     return SDValue();
9177
9178   // Use PINSRW to insert each byte directly.
9179   return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9180                                   Subtarget);
9181 }
9182
9183 /// Custom lower build_vector of v4i32 or v4f32.
9184 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
9185                                      const X86Subtarget &Subtarget) {
9186   // If this is a splat of a pair of elements, use MOVDDUP (unless the target
9187   // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
9188   // Because we're creating a less complicated build vector here, we may enable
9189   // further folding of the MOVDDUP via shuffle transforms.
9190   if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
9191       Op.getOperand(0) == Op.getOperand(2) &&
9192       Op.getOperand(1) == Op.getOperand(3) &&
9193       Op.getOperand(0) != Op.getOperand(1)) {
9194     SDLoc DL(Op);
9195     MVT VT = Op.getSimpleValueType();
9196     MVT EltVT = VT.getVectorElementType();
9197     // Create a new build vector with the first 2 elements followed by undef
9198     // padding, bitcast to v2f64, duplicate, and bitcast back.
9199     SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9200                        DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9201     SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
9202     SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
9203     return DAG.getBitcast(VT, Dup);
9204   }
9205
9206   // Find all zeroable elements.
9207   std::bitset<4> Zeroable, Undefs;
9208   for (int i = 0; i < 4; ++i) {
9209     SDValue Elt = Op.getOperand(i);
9210     Undefs[i] = Elt.isUndef();
9211     Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
9212   }
9213   assert(Zeroable.size() - Zeroable.count() > 1 &&
9214          "We expect at least two non-zero elements!");
9215
9216   // We only know how to deal with build_vector nodes where elements are either
9217   // zeroable or extract_vector_elt with constant index.
9218   SDValue FirstNonZero;
9219   unsigned FirstNonZeroIdx;
9220   for (unsigned i = 0; i < 4; ++i) {
9221     if (Zeroable[i])
9222       continue;
9223     SDValue Elt = Op.getOperand(i);
9224     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9225         !isa<ConstantSDNode>(Elt.getOperand(1)))
9226       return SDValue();
9227     // Make sure that this node is extracting from a 128-bit vector.
9228     MVT VT = Elt.getOperand(0).getSimpleValueType();
9229     if (!VT.is128BitVector())
9230       return SDValue();
9231     if (!FirstNonZero.getNode()) {
9232       FirstNonZero = Elt;
9233       FirstNonZeroIdx = i;
9234     }
9235   }
9236
9237   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
9238   SDValue V1 = FirstNonZero.getOperand(0);
9239   MVT VT = V1.getSimpleValueType();
9240
9241   // See if this build_vector can be lowered as a blend with zero.
9242   SDValue Elt;
9243   unsigned EltMaskIdx, EltIdx;
9244   int Mask[4];
9245   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
9246     if (Zeroable[EltIdx]) {
9247       // The zero vector will be on the right hand side.
9248       Mask[EltIdx] = EltIdx+4;
9249       continue;
9250     }
9251
9252     Elt = Op->getOperand(EltIdx);
9253     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
9254     EltMaskIdx = Elt.getConstantOperandVal(1);
9255     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
9256       break;
9257     Mask[EltIdx] = EltIdx;
9258   }
9259
9260   if (EltIdx == 4) {
9261     // Let the shuffle legalizer deal with blend operations.
9262     SDValue VZeroOrUndef = (Zeroable == Undefs)
9263                                ? DAG.getUNDEF(VT)
9264                                : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
9265     if (V1.getSimpleValueType() != VT)
9266       V1 = DAG.getBitcast(VT, V1);
9267     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
9268   }
9269
9270   // See if we can lower this build_vector to a INSERTPS.
9271   if (!Subtarget.hasSSE41())
9272     return SDValue();
9273
9274   SDValue V2 = Elt.getOperand(0);
9275   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
9276     V1 = SDValue();
9277
9278   bool CanFold = true;
9279   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
9280     if (Zeroable[i])
9281       continue;
9282
9283     SDValue Current = Op->getOperand(i);
9284     SDValue SrcVector = Current->getOperand(0);
9285     if (!V1.getNode())
9286       V1 = SrcVector;
9287     CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
9288   }
9289
9290   if (!CanFold)
9291     return SDValue();
9292
9293   assert(V1.getNode() && "Expected at least two non-zero elements!");
9294   if (V1.getSimpleValueType() != MVT::v4f32)
9295     V1 = DAG.getBitcast(MVT::v4f32, V1);
9296   if (V2.getSimpleValueType() != MVT::v4f32)
9297     V2 = DAG.getBitcast(MVT::v4f32, V2);
9298
9299   // Ok, we can emit an INSERTPS instruction.
9300   unsigned ZMask = Zeroable.to_ulong();
9301
9302   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
9303   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
9304   SDLoc DL(Op);
9305   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
9306                                DAG.getIntPtrConstant(InsertPSMask, DL, true));
9307   return DAG.getBitcast(VT, Result);
9308 }
9309
9310 /// Return a vector logical shift node.
9311 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
9312                          SelectionDAG &DAG, const TargetLowering &TLI,
9313                          const SDLoc &dl) {
9314   assert(VT.is128BitVector() && "Unknown type for VShift");
9315   MVT ShVT = MVT::v16i8;
9316   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
9317   SrcOp = DAG.getBitcast(ShVT, SrcOp);
9318   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
9319   SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
9320   return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
9321 }
9322
9323 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
9324                                       SelectionDAG &DAG) {
9325
9326   // Check if the scalar load can be widened into a vector load. And if
9327   // the address is "base + cst" see if the cst can be "absorbed" into
9328   // the shuffle mask.
9329   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
9330     SDValue Ptr = LD->getBasePtr();
9331     if (!ISD::isNormalLoad(LD) || !LD->isSimple())
9332       return SDValue();
9333     EVT PVT = LD->getValueType(0);
9334     if (PVT != MVT::i32 && PVT != MVT::f32)
9335       return SDValue();
9336
9337     int FI = -1;
9338     int64_t Offset = 0;
9339     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
9340       FI = FINode->getIndex();
9341       Offset = 0;
9342     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
9343                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
9344       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
9345       Offset = Ptr.getConstantOperandVal(1);
9346       Ptr = Ptr.getOperand(0);
9347     } else {
9348       return SDValue();
9349     }
9350
9351     // FIXME: 256-bit vector instructions don't require a strict alignment,
9352     // improve this code to support it better.
9353     Align RequiredAlign(VT.getSizeInBits() / 8);
9354     SDValue Chain = LD->getChain();
9355     // Make sure the stack object alignment is at least 16 or 32.
9356     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9357     MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
9358     if (!InferredAlign || *InferredAlign < RequiredAlign) {
9359       if (MFI.isFixedObjectIndex(FI)) {
9360         // Can't change the alignment. FIXME: It's possible to compute
9361         // the exact stack offset and reference FI + adjust offset instead.
9362         // If someone *really* cares about this. That's the way to implement it.
9363         return SDValue();
9364       } else {
9365         MFI.setObjectAlignment(FI, RequiredAlign);
9366       }
9367     }
9368
9369     // (Offset % 16 or 32) must be multiple of 4. Then address is then
9370     // Ptr + (Offset & ~15).
9371     if (Offset < 0)
9372       return SDValue();
9373     if ((Offset % RequiredAlign.value()) & 3)
9374       return SDValue();
9375     int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
9376     if (StartOffset) {
9377       SDLoc DL(Ptr);
9378       Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9379                         DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
9380     }
9381
9382     int EltNo = (Offset - StartOffset) >> 2;
9383     unsigned NumElems = VT.getVectorNumElements();
9384
9385     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
9386     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
9387                              LD->getPointerInfo().getWithOffset(StartOffset));
9388
9389     SmallVector<int, 8> Mask(NumElems, EltNo);
9390
9391     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
9392   }
9393
9394   return SDValue();
9395 }
9396
9397 // Recurse to find a LoadSDNode source and the accumulated ByteOffest.
9398 static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
9399   if (ISD::isNON_EXTLoad(Elt.getNode())) {
9400     auto *BaseLd = cast<LoadSDNode>(Elt);
9401     if (!BaseLd->isSimple())
9402       return false;
9403     Ld = BaseLd;
9404     ByteOffset = 0;
9405     return true;
9406   }
9407
9408   switch (Elt.getOpcode()) {
9409   case ISD::BITCAST:
9410   case ISD::TRUNCATE:
9411   case ISD::SCALAR_TO_VECTOR:
9412     return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
9413   case ISD::SRL:
9414     if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9415       uint64_t Amt = AmtC->getZExtValue();
9416       if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
9417         ByteOffset += Amt / 8;
9418         return true;
9419       }
9420     }
9421     break;
9422   case ISD::EXTRACT_VECTOR_ELT:
9423     if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9424       SDValue Src = Elt.getOperand(0);
9425       unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
9426       unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
9427       if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
9428           findEltLoadSrc(Src, Ld, ByteOffset)) {
9429         uint64_t Idx = IdxC->getZExtValue();
9430         ByteOffset += Idx * (SrcSizeInBits / 8);
9431         return true;
9432       }
9433     }
9434     break;
9435   }
9436
9437   return false;
9438 }
9439
9440 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
9441 /// elements can be replaced by a single large load which has the same value as
9442 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
9443 ///
9444 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
9445 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
9446                                         const SDLoc &DL, SelectionDAG &DAG,
9447                                         const X86Subtarget &Subtarget,
9448                                         bool IsAfterLegalize) {
9449   if ((VT.getScalarSizeInBits() % 8) != 0)
9450     return SDValue();
9451
9452   unsigned NumElems = Elts.size();
9453
9454   int LastLoadedElt = -1;
9455   APInt LoadMask = APInt::getZero(NumElems);
9456   APInt ZeroMask = APInt::getZero(NumElems);
9457   APInt UndefMask = APInt::getZero(NumElems);
9458
9459   SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
9460   SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
9461
9462   // For each element in the initializer, see if we've found a load, zero or an
9463   // undef.
9464   for (unsigned i = 0; i < NumElems; ++i) {
9465     SDValue Elt = peekThroughBitcasts(Elts[i]);
9466     if (!Elt.getNode())
9467       return SDValue();
9468     if (Elt.isUndef()) {
9469       UndefMask.setBit(i);
9470       continue;
9471     }
9472     if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
9473       ZeroMask.setBit(i);
9474       continue;
9475     }
9476
9477     // Each loaded element must be the correct fractional portion of the
9478     // requested vector load.
9479     unsigned EltSizeInBits = Elt.getValueSizeInBits();
9480     if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
9481       return SDValue();
9482
9483     if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
9484       return SDValue();
9485     unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
9486     if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
9487       return SDValue();
9488
9489     LoadMask.setBit(i);
9490     LastLoadedElt = i;
9491   }
9492   assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
9493              NumElems &&
9494          "Incomplete element masks");
9495
9496   // Handle Special Cases - all undef or undef/zero.
9497   if (UndefMask.popcount() == NumElems)
9498     return DAG.getUNDEF(VT);
9499   if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
9500     return VT.isInteger() ? DAG.getConstant(0, DL, VT)
9501                           : DAG.getConstantFP(0.0, DL, VT);
9502
9503   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9504   int FirstLoadedElt = LoadMask.countr_zero();
9505   SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
9506   EVT EltBaseVT = EltBase.getValueType();
9507   assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
9508          "Register/Memory size mismatch");
9509   LoadSDNode *LDBase = Loads[FirstLoadedElt];
9510   assert(LDBase && "Did not find base load for merging consecutive loads");
9511   unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
9512   unsigned BaseSizeInBytes = BaseSizeInBits / 8;
9513   int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
9514   int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
9515   assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
9516
9517   // TODO: Support offsetting the base load.
9518   if (ByteOffsets[FirstLoadedElt] != 0)
9519     return SDValue();
9520
9521   // Check to see if the element's load is consecutive to the base load
9522   // or offset from a previous (already checked) load.
9523   auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
9524     LoadSDNode *Ld = Loads[EltIdx];
9525     int64_t ByteOffset = ByteOffsets[EltIdx];
9526     if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
9527       int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
9528       return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
9529               Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
9530     }
9531     return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
9532                                               EltIdx - FirstLoadedElt);
9533   };
9534
  // Consecutive loads can contain UNDEFs but not ZERO elements.
  // Consecutive loads with both UNDEF and ZERO elements require
  // an additional shuffle stage to clear the ZERO elements.
9538   bool IsConsecutiveLoad = true;
9539   bool IsConsecutiveLoadWithZeros = true;
9540   for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
9541     if (LoadMask[i]) {
9542       if (!CheckConsecutiveLoad(LDBase, i)) {
9543         IsConsecutiveLoad = false;
9544         IsConsecutiveLoadWithZeros = false;
9545         break;
9546       }
9547     } else if (ZeroMask[i]) {
9548       IsConsecutiveLoad = false;
9549     }
9550   }
9551
9552   auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
9553     auto MMOFlags = LDBase->getMemOperand()->getFlags();
9554     assert(LDBase->isSimple() &&
9555            "Cannot merge volatile or atomic loads.");
9556     SDValue NewLd =
9557         DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
9558                     LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
9559                     MMOFlags);
9560     for (auto *LD : Loads)
9561       if (LD)
9562         DAG.makeEquivalentMemoryOrdering(LD, NewLd);
9563     return NewLd;
9564   };
9565
9566   // Check if the base load is entirely dereferenceable.
9567   bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
9568       VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
9569
9570   // LOAD - all consecutive load/undefs (must start/end with a load or be
9571   // entirely dereferenceable). If we have found an entire vector of loads and
9572   // undefs, then return a large load of the entire vector width starting at the
9573   // base pointer. If the vector contains zeros, then attempt to shuffle those
9574   // elements.
9575   if (FirstLoadedElt == 0 &&
9576       (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
9577       (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
9578     if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
9579       return SDValue();
9580
9581     // Don't create 256-bit non-temporal aligned loads without AVX2 as these
9582     // will lower to regular temporal loads and use the cache.
9583     if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
9584         VT.is256BitVector() && !Subtarget.hasInt256())
9585       return SDValue();
9586
9587     if (NumElems == 1)
9588       return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
9589
9590     if (!ZeroMask)
9591       return CreateLoad(VT, LDBase);
9592
9593     // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
9594     // vector and a zero vector to clear out the zero elements.
9595     if (!IsAfterLegalize && VT.isVector()) {
9596       unsigned NumMaskElts = VT.getVectorNumElements();
9597       if ((NumMaskElts % NumElems) == 0) {
9598         unsigned Scale = NumMaskElts / NumElems;
9599         SmallVector<int, 4> ClearMask(NumMaskElts, -1);
9600         for (unsigned i = 0; i < NumElems; ++i) {
9601           if (UndefMask[i])
9602             continue;
9603           int Offset = ZeroMask[i] ? NumMaskElts : 0;
9604           for (unsigned j = 0; j != Scale; ++j)
9605             ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
9606         }
9607         SDValue V = CreateLoad(VT, LDBase);
9608         SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
9609                                    : DAG.getConstantFP(0.0, DL, VT);
9610         return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
9611       }
9612     }
9613   }
9614
9615   // If the upper half of a ymm/zmm load is undef then just load the lower half.
9616   if (VT.is256BitVector() || VT.is512BitVector()) {
9617     unsigned HalfNumElems = NumElems / 2;
9618     if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
9619       EVT HalfVT =
9620           EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
9621       SDValue HalfLD =
9622           EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
9623                                    DAG, Subtarget, IsAfterLegalize);
9624       if (HalfLD)
9625         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
9626                            HalfLD, DAG.getIntPtrConstant(0, DL));
9627     }
9628   }
9629
9630   // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
9631   if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
9632       ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
9633        LoadSizeInBits == 64) &&
9634       ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
9635     MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
9636                                       : MVT::getIntegerVT(LoadSizeInBits);
9637     MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
9638     // Allow v4f32 on SSE1 only targets.
9639     // FIXME: Add more isel patterns so we can just use VT directly.
9640     if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
9641       VecVT = MVT::v4f32;
9642     if (TLI.isTypeLegal(VecVT)) {
9643       SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
9644       SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
9645       SDValue ResNode = DAG.getMemIntrinsicNode(
9646           X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
9647           LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
9648       for (auto *LD : Loads)
9649         if (LD)
9650           DAG.makeEquivalentMemoryOrdering(LD, ResNode);
9651       return DAG.getBitcast(VT, ResNode);
9652     }
9653   }
9654
9655   // BROADCAST - match the smallest possible repetition pattern, load that
9656   // scalar/subvector element and then broadcast to the entire vector.
9657   if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
9658       (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
9659     for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
9660       unsigned RepeatSize = SubElems * BaseSizeInBits;
9661       unsigned ScalarSize = std::min(RepeatSize, 64u);
9662       if (!Subtarget.hasAVX2() && ScalarSize < 32)
9663         continue;
9664
9665       // Don't attempt a 1:N subvector broadcast - it should be caught by
9666       // combineConcatVectorOps, else will cause infinite loops.
9667       if (RepeatSize > ScalarSize && SubElems == 1)
9668         continue;
9669
9670       bool Match = true;
9671       SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
9672       for (unsigned i = 0; i != NumElems && Match; ++i) {
9673         if (!LoadMask[i])
9674           continue;
9675         SDValue Elt = peekThroughBitcasts(Elts[i]);
9676         if (RepeatedLoads[i % SubElems].isUndef())
9677           RepeatedLoads[i % SubElems] = Elt;
9678         else
9679           Match &= (RepeatedLoads[i % SubElems] == Elt);
9680       }
9681
9682       // We must have loads at both ends of the repetition.
9683       Match &= !RepeatedLoads.front().isUndef();
9684       Match &= !RepeatedLoads.back().isUndef();
9685       if (!Match)
9686         continue;
9687
9688       EVT RepeatVT =
9689           VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
9690               ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
9691               : EVT::getFloatingPointVT(ScalarSize);
9692       if (RepeatSize > ScalarSize)
9693         RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
9694                                     RepeatSize / ScalarSize);
9695       EVT BroadcastVT =
9696           EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
9697                            VT.getSizeInBits() / ScalarSize);
9698       if (TLI.isTypeLegal(BroadcastVT)) {
9699         if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
9700                 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
9701           SDValue Broadcast = RepeatLoad;
9702           if (RepeatSize > ScalarSize) {
9703             while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
9704               Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
9705           } else {
9706             if (!Subtarget.hasAVX2() &&
9707                 !X86::mayFoldLoadIntoBroadcastFromMem(
9708                     RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
9709                     Subtarget,
9710                     /*AssumeSingleUse=*/true))
9711               return SDValue();
9712             Broadcast =
9713                 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
9714           }
9715           return DAG.getBitcast(VT, Broadcast);
9716         }
9717       }
9718     }
9719   }
9720
9721   return SDValue();
9722 }
9723
9724 // Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
9725 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9726 // are consecutive, non-overlapping, and in the right order.
9727 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9728                                          SelectionDAG &DAG,
9729                                          const X86Subtarget &Subtarget,
9730                                          bool IsAfterLegalize) {
9731   SmallVector<SDValue, 64> Elts;
9732   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9733     if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9734       Elts.push_back(Elt);
9735       continue;
9736     }
9737     return SDValue();
9738   }
9739   assert(Elts.size() == VT.getVectorNumElements());
9740   return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9741                                   IsAfterLegalize);
9742 }
9743
9744 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9745                                    unsigned SplatBitSize, LLVMContext &C) {
9746   unsigned ScalarSize = VT.getScalarSizeInBits();
9747
9748   auto getConstantScalar = [&](const APInt &Val) -> Constant * {
9749     if (VT.isFloatingPoint()) {
9750       if (ScalarSize == 16)
9751         return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9752       if (ScalarSize == 32)
9753         return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9754       assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9755       return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9756     }
9757     return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9758   };
9759
9760   if (ScalarSize == SplatBitSize)
9761     return getConstantScalar(SplatValue);
9762
9763   unsigned NumElm = SplatBitSize / ScalarSize;
9764   SmallVector<Constant *, 32> ConstantVec;
9765   for (unsigned I = 0; I != NumElm; ++I) {
9766     APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
9767     ConstantVec.push_back(getConstantScalar(Val));
9768   }
9769   return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9770 }
9771
9772 static bool isFoldableUseOfShuffle(SDNode *N) {
9773   for (auto *U : N->uses()) {
9774     unsigned Opc = U->getOpcode();
9775     // VPERMV/VPERMV3 shuffles can never fold their index operands.
9776     if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9777       return false;
9778     if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9779       return false;
9780     if (isTargetShuffle(Opc))
9781       return true;
9782     if (Opc == ISD::BITCAST) // Ignore bitcasts
9783       return isFoldableUseOfShuffle(U);
9784     if (N->hasOneUse()) {
9785       // TODO, there may be some general way to know if a SDNode can
9786       // be folded. We now only know whether an MI is foldable.
9787       if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
9788         return false;
9789       return true;
9790     }
9791   }
9792   return false;
9793 }
9794
9795 /// Attempt to use the vbroadcast instruction to generate a splat value
9796 /// from a splat BUILD_VECTOR which uses:
9797 ///  a. A single scalar load, or a constant.
9798 ///  b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9799 ///
9800 /// The VBROADCAST node is returned when a pattern is found,
9801 /// or SDValue() otherwise.
9802 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9803                                            const X86Subtarget &Subtarget,
9804                                            SelectionDAG &DAG) {
9805   // VBROADCAST requires AVX.
9806   // TODO: Splats could be generated for non-AVX CPUs using SSE
9807   // instructions, but there's less potential gain for only 128-bit vectors.
9808   if (!Subtarget.hasAVX())
9809     return SDValue();
9810
9811   MVT VT = BVOp->getSimpleValueType(0);
9812   unsigned NumElts = VT.getVectorNumElements();
9813   SDLoc dl(BVOp);
9814
9815   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9816          "Unsupported vector type for broadcast.");
9817
9818   // See if the build vector is a repeating sequence of scalars (inc. splat).
9819   SDValue Ld;
9820   BitVector UndefElements;
9821   SmallVector<SDValue, 16> Sequence;
9822   if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9823     assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9824     if (Sequence.size() == 1)
9825       Ld = Sequence[0];
9826   }
9827
9828   // Attempt to use VBROADCASTM
9829   // From this pattern:
9830   // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9831   // b. t1 = (build_vector t0 t0)
9832   //
9833   // Create (VBROADCASTM v2i1 X)
9834   if (!Sequence.empty() && Subtarget.hasCDI()) {
9835     // If not a splat, are the upper sequence values zeroable?
9836     unsigned SeqLen = Sequence.size();
9837     bool UpperZeroOrUndef =
9838         SeqLen == 1 ||
9839         llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
9840           return !V || V.isUndef() || isNullConstant(V);
9841         });
9842     SDValue Op0 = Sequence[0];
9843     if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9844                              (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9845                               Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9846       SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9847                              ? Op0.getOperand(0)
9848                              : Op0.getOperand(0).getOperand(0);
9849       MVT MaskVT = BOperand.getSimpleValueType();
9850       MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9851       if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) ||  // for broadcastmb2q
9852           (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9853         MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9854         if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9855           unsigned Scale = 512 / VT.getSizeInBits();
9856           BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9857         }
9858         SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9859         if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9860           Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9861         return DAG.getBitcast(VT, Bcst);
9862       }
9863     }
9864   }
9865
9866   unsigned NumUndefElts = UndefElements.count();
9867   if (!Ld || (NumElts - NumUndefElts) <= 1) {
9868     APInt SplatValue, Undef;
9869     unsigned SplatBitSize;
9870     bool HasUndef;
9871     // Check if this is a repeated constant pattern suitable for broadcasting.
9872     if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9873         SplatBitSize > VT.getScalarSizeInBits() &&
9874         SplatBitSize < VT.getSizeInBits()) {
9875       // Avoid replacing with broadcast when it's a use of a shuffle
9876       // instruction to preserve the present custom lowering of shuffles.
9877       if (isFoldableUseOfShuffle(BVOp))
9878         return SDValue();
9879       // replace BUILD_VECTOR with broadcast of the repeated constants.
9880       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9881       LLVMContext *Ctx = DAG.getContext();
9882       MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9883       if (SplatBitSize == 32 || SplatBitSize == 64 ||
9884           (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9885         // Load the constant scalar/subvector and broadcast it.
9886         MVT CVT = MVT::getIntegerVT(SplatBitSize);
9887         Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
9888         SDValue CP = DAG.getConstantPool(C, PVT);
9889         unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9890
9891         Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9892         SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9893         SDValue Ops[] = {DAG.getEntryNode(), CP};
9894         MachinePointerInfo MPI =
9895             MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9896         SDValue Brdcst =
9897             DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9898                                     MPI, Alignment, MachineMemOperand::MOLoad);
9899         return DAG.getBitcast(VT, Brdcst);
9900       }
9901       if (SplatBitSize > 64) {
9902         // Load the vector of constants and broadcast it.
9903         Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
9904         SDValue VCP = DAG.getConstantPool(VecC, PVT);
9905         unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9906         MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9907         Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9908         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9909         SDValue Ops[] = {DAG.getEntryNode(), VCP};
9910         MachinePointerInfo MPI =
9911             MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9912         return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
9913                                        Ops, VVT, MPI, Alignment,
9914                                        MachineMemOperand::MOLoad);
9915       }
9916     }
9917
9918     // If we are moving a scalar into a vector (Ld must be set and all elements
9919     // but 1 are undef) and that operation is not obviously supported by
9920     // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9921     // That's better than general shuffling and may eliminate a load to GPR and
9922     // move from scalar to vector register.
9923     if (!Ld || NumElts - NumUndefElts != 1)
9924       return SDValue();
9925     unsigned ScalarSize = Ld.getValueSizeInBits();
9926     if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9927       return SDValue();
9928   }
9929
9930   bool ConstSplatVal =
9931       (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9932   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9933
9934   // TODO: Handle broadcasts of non-constant sequences.
9935
9936   // Make sure that all of the users of a non-constant load are from the
9937   // BUILD_VECTOR node.
9938   // FIXME: Is the use count needed for non-constant, non-load case?
9939   if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9940     return SDValue();
9941
9942   unsigned ScalarSize = Ld.getValueSizeInBits();
9943   bool IsGE256 = (VT.getSizeInBits() >= 256);
9944
9945   // When optimizing for size, generate up to 5 extra bytes for a broadcast
9946   // instruction to save 8 or more bytes of constant pool data.
9947   // TODO: If multiple splats are generated to load the same constant,
9948   // it may be detrimental to overall size. There needs to be a way to detect
9949   // that condition to know if this is truly a size win.
9950   bool OptForSize = DAG.shouldOptForSize();
9951
9952   // Handle broadcasting a single constant scalar from the constant pool
9953   // into a vector.
9954   // On Sandybridge (no AVX2), it is still better to load a constant vector
9955   // from the constant pool and not to broadcast it from a scalar.
9956   // But override that restriction when optimizing for size.
9957   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9958   if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9959     EVT CVT = Ld.getValueType();
9960     assert(!CVT.isVector() && "Must not broadcast a vector type");
9961
9962     // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
9963     // For size optimization, also splat v2f64 and v2i64, and for size opt
9964     // with AVX2, also splat i8 and i16.
9965     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9966     if (ScalarSize == 32 ||
9967         (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
9968         CVT == MVT::f16 ||
9969         (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9970       const Constant *C = nullptr;
9971       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9972         C = CI->getConstantIntValue();
9973       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9974         C = CF->getConstantFPValue();
9975
9976       assert(C && "Invalid constant type");
9977
9978       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9979       SDValue CP =
9980           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9981       Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9982
9983       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9984       SDValue Ops[] = {DAG.getEntryNode(), CP};
9985       MachinePointerInfo MPI =
9986           MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9987       return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9988                                      MPI, Alignment, MachineMemOperand::MOLoad);
9989     }
9990   }
9991
9992   // Handle AVX2 in-register broadcasts.
9993   if (!IsLoad && Subtarget.hasInt256() &&
9994       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9995     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9996
9997   // The scalar source must be a normal load.
9998   if (!IsLoad)
9999     return SDValue();
10000
10001   // Make sure the non-chain result is only used by this build vector.
10002   if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
10003     return SDValue();
10004
10005   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
10006       (Subtarget.hasVLX() && ScalarSize == 64)) {
10007     auto *LN = cast<LoadSDNode>(Ld);
10008     SDVTList Tys = DAG.getVTList(VT, MVT::Other);
10009     SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
10010     SDValue BCast =
10011         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
10012                                 LN->getMemoryVT(), LN->getMemOperand());
10013     DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
10014     return BCast;
10015   }
10016
10017   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
10018   // double since there is no vbroadcastsd xmm
10019   if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
10020       (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
10021     auto *LN = cast<LoadSDNode>(Ld);
10022     SDVTList Tys = DAG.getVTList(VT, MVT::Other);
10023     SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
10024     SDValue BCast =
10025         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
10026                                 LN->getMemoryVT(), LN->getMemOperand());
10027     DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
10028     return BCast;
10029   }
10030
10031   if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
10032     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
10033
10034   // Unsupported broadcast.
10035   return SDValue();
10036 }
10037
10038 /// For an EXTRACT_VECTOR_ELT with a constant index return the real
10039 /// underlying vector and index.
10040 ///
10041 /// Modifies \p ExtractedFromVec to the real vector and returns the real
10042 /// index.
10043 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
10044                                          SDValue ExtIdx) {
10045   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
10046   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
10047     return Idx;
10048
10049   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
10050   // lowered this:
10051   //   (extract_vector_elt (v8f32 %1), Constant<6>)
10052   // to:
10053   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
10054   //                           (extract_subvector (v8f32 %0), Constant<4>),
10055   //                           undef)
10056   //                       Constant<0>)
10057   // In this case the vector is the extract_subvector expression and the index
10058   // is 2, as specified by the shuffle.
10059   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
10060   SDValue ShuffleVec = SVOp->getOperand(0);
10061   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
10062   assert(ShuffleVecVT.getVectorElementType() ==
10063          ExtractedFromVec.getSimpleValueType().getVectorElementType());
10064
10065   int ShuffleIdx = SVOp->getMaskElt(Idx);
10066   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
10067     ExtractedFromVec = ShuffleVec;
10068     return ShuffleIdx;
10069   }
10070   return Idx;
10071 }
10072
10073 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
10074   MVT VT = Op.getSimpleValueType();
10075
10076   // Skip if insert_vec_elt is not supported.
10077   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10078   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
10079     return SDValue();
10080
10081   SDLoc DL(Op);
10082   unsigned NumElems = Op.getNumOperands();
10083
10084   SDValue VecIn1;
10085   SDValue VecIn2;
10086   SmallVector<unsigned, 4> InsertIndices;
10087   SmallVector<int, 8> Mask(NumElems, -1);
10088
10089   for (unsigned i = 0; i != NumElems; ++i) {
10090     unsigned Opc = Op.getOperand(i).getOpcode();
10091
10092     if (Opc == ISD::UNDEF)
10093       continue;
10094
10095     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
10096       // Quit if more than 1 elements need inserting.
10097       if (InsertIndices.size() > 1)
10098         return SDValue();
10099
10100       InsertIndices.push_back(i);
10101       continue;
10102     }
10103
10104     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
10105     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
10106
10107     // Quit if non-constant index.
10108     if (!isa<ConstantSDNode>(ExtIdx))
10109       return SDValue();
10110     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
10111
10112     // Quit if extracted from vector of different type.
10113     if (ExtractedFromVec.getValueType() != VT)
10114       return SDValue();
10115
10116     if (!VecIn1.getNode())
10117       VecIn1 = ExtractedFromVec;
10118     else if (VecIn1 != ExtractedFromVec) {
10119       if (!VecIn2.getNode())
10120         VecIn2 = ExtractedFromVec;
10121       else if (VecIn2 != ExtractedFromVec)
10122         // Quit if more than 2 vectors to shuffle
10123         return SDValue();
10124     }
10125
10126     if (ExtractedFromVec == VecIn1)
10127       Mask[i] = Idx;
10128     else if (ExtractedFromVec == VecIn2)
10129       Mask[i] = Idx + NumElems;
10130   }
10131
10132   if (!VecIn1.getNode())
10133     return SDValue();
10134
10135   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
10136   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
10137
10138   for (unsigned Idx : InsertIndices)
10139     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
10140                      DAG.getIntPtrConstant(Idx, DL));
10141
10142   return NV;
10143 }
10144
10145 // Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
10146 static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
10147                                        const X86Subtarget &Subtarget) {
10148   MVT VT = Op.getSimpleValueType();
10149   MVT IVT = VT.changeVectorElementTypeToInteger();
10150   SmallVector<SDValue, 16> NewOps;
10151   for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
10152     NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
10153   SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
10154   return DAG.getBitcast(VT, Res);
10155 }
10156
10157 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
10158 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
10159                                      const X86Subtarget &Subtarget) {
10160
10161   MVT VT = Op.getSimpleValueType();
10162   assert((VT.getVectorElementType() == MVT::i1) &&
10163          "Unexpected type in LowerBUILD_VECTORvXi1!");
10164
10165   SDLoc dl(Op);
10166   if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
10167       ISD::isBuildVectorAllOnes(Op.getNode()))
10168     return Op;
10169
10170   uint64_t Immediate = 0;
10171   SmallVector<unsigned, 16> NonConstIdx;
10172   bool IsSplat = true;
10173   bool HasConstElts = false;
10174   int SplatIdx = -1;
10175   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
10176     SDValue In = Op.getOperand(idx);
10177     if (In.isUndef())
10178       continue;
10179     if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
10180       Immediate |= (InC->getZExtValue() & 0x1) << idx;
10181       HasConstElts = true;
10182     } else {
10183       NonConstIdx.push_back(idx);
10184     }
10185     if (SplatIdx < 0)
10186       SplatIdx = idx;
10187     else if (In != Op.getOperand(SplatIdx))
10188       IsSplat = false;
10189   }
10190
10191   // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
10192   if (IsSplat) {
10193     // The build_vector allows the scalar element to be larger than the vector
10194     // element type. We need to mask it to use as a condition unless we know
10195     // the upper bits are zero.
10196     // FIXME: Use computeKnownBits instead of checking specific opcode?
10197     SDValue Cond = Op.getOperand(SplatIdx);
10198     assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
10199     if (Cond.getOpcode() != ISD::SETCC)
10200       Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
10201                          DAG.getConstant(1, dl, MVT::i8));
10202
10203     // Perform the select in the scalar domain so we can use cmov.
10204     if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10205       SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
10206                                      DAG.getAllOnesConstant(dl, MVT::i32),
10207                                      DAG.getConstant(0, dl, MVT::i32));
10208       Select = DAG.getBitcast(MVT::v32i1, Select);
10209       return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
10210     } else {
10211       MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10212       SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
10213                                      DAG.getAllOnesConstant(dl, ImmVT),
10214                                      DAG.getConstant(0, dl, ImmVT));
10215       MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10216       Select = DAG.getBitcast(VecVT, Select);
10217       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
10218                          DAG.getIntPtrConstant(0, dl));
10219     }
10220   }
10221
10222   // insert elements one by one
10223   SDValue DstVec;
10224   if (HasConstElts) {
10225     if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10226       SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
10227       SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
10228       ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
10229       ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
10230       DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
10231     } else {
10232       MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10233       SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
10234       MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10235       DstVec = DAG.getBitcast(VecVT, Imm);
10236       DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
10237                            DAG.getIntPtrConstant(0, dl));
10238     }
10239   } else
10240     DstVec = DAG.getUNDEF(VT);
10241
10242   for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
10243     unsigned InsertIdx = NonConstIdx[i];
10244     DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
10245                          Op.getOperand(InsertIdx),
10246                          DAG.getIntPtrConstant(InsertIdx, dl));
10247   }
10248   return DstVec;
10249 }
10250
10251 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
10252   switch (Opcode) {
10253   case X86ISD::PACKSS:
10254   case X86ISD::PACKUS:
10255   case X86ISD::FHADD:
10256   case X86ISD::FHSUB:
10257   case X86ISD::HADD:
10258   case X86ISD::HSUB:
10259     return true;
10260   }
10261   return false;
10262 }
10263
10264 /// This is a helper function of LowerToHorizontalOp().
10265 /// This function checks that the build_vector \p N in input implements a
10266 /// 128-bit partial horizontal operation on a 256-bit vector, but that operation
10267 /// may not match the layout of an x86 256-bit horizontal instruction.
10268 /// In other words, if this returns true, then some extraction/insertion will
10269 /// be required to produce a valid horizontal instruction.
10270 ///
10271 /// Parameter \p Opcode defines the kind of horizontal operation to match.
10272 /// For example, if \p Opcode is equal to ISD::ADD, then this function
10273 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
10274 /// is equal to ISD::SUB, then this function checks if this is a horizontal
10275 /// arithmetic sub.
10276 ///
10277 /// This function only analyzes elements of \p N whose indices are
10278 /// in range [BaseIdx, LastIdx).
10279 ///
10280 /// TODO: This function was originally used to match both real and fake partial
10281 /// horizontal operations, but the index-matching logic is incorrect for that.
10282 /// See the corrected implementation in isHopBuildVector(). Can we reduce this
10283 /// code because it is only used for partial h-op matching now?
10284 static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
10285                                   SelectionDAG &DAG,
10286                                   unsigned BaseIdx, unsigned LastIdx,
10287                                   SDValue &V0, SDValue &V1) {
10288   EVT VT = N->getValueType(0);
10289   assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
10290   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
10291   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
10292          "Invalid Vector in input!");
10293
10294   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
10295   bool CanFold = true;
10296   unsigned ExpectedVExtractIdx = BaseIdx;
10297   unsigned NumElts = LastIdx - BaseIdx;
10298   V0 = DAG.getUNDEF(VT);
10299   V1 = DAG.getUNDEF(VT);
10300
10301   // Check if N implements a horizontal binop.
10302   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
10303     SDValue Op = N->getOperand(i + BaseIdx);
10304
10305     // Skip UNDEFs.
10306     if (Op->isUndef()) {
10307       // Update the expected vector extract index.
10308       if (i * 2 == NumElts)
10309         ExpectedVExtractIdx = BaseIdx;
10310       ExpectedVExtractIdx += 2;
10311       continue;
10312     }
10313
10314     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
10315
10316     if (!CanFold)
10317       break;
10318
10319     SDValue Op0 = Op.getOperand(0);
10320     SDValue Op1 = Op.getOperand(1);
10321
10322     // Try to match the following pattern:
10323     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
10324     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10325         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10326         Op0.getOperand(0) == Op1.getOperand(0) &&
10327         isa<ConstantSDNode>(Op0.getOperand(1)) &&
10328         isa<ConstantSDNode>(Op1.getOperand(1)));
10329     if (!CanFold)
10330       break;
10331
10332     unsigned I0 = Op0.getConstantOperandVal(1);
10333     unsigned I1 = Op1.getConstantOperandVal(1);
10334
10335     if (i * 2 < NumElts) {
10336       if (V0.isUndef()) {
10337         V0 = Op0.getOperand(0);
10338         if (V0.getValueType() != VT)
10339           return false;
10340       }
10341     } else {
10342       if (V1.isUndef()) {
10343         V1 = Op0.getOperand(0);
10344         if (V1.getValueType() != VT)
10345           return false;
10346       }
10347       if (i * 2 == NumElts)
10348         ExpectedVExtractIdx = BaseIdx;
10349     }
10350
10351     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
10352     if (I0 == ExpectedVExtractIdx)
10353       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
10354     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
10355       // Try to match the following dag sequence:
10356       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
10357       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
10358     } else
10359       CanFold = false;
10360
10361     ExpectedVExtractIdx += 2;
10362   }
10363
10364   return CanFold;
10365 }
10366
10367 /// Emit a sequence of two 128-bit horizontal add/sub followed by
10368 /// a concat_vector.
10369 ///
10370 /// This is a helper function of LowerToHorizontalOp().
10371 /// This function expects two 256-bit vectors called V0 and V1.
10372 /// At first, each vector is split into two separate 128-bit vectors.
10373 /// Then, the resulting 128-bit vectors are used to implement two
10374 /// horizontal binary operations.
10375 ///
10376 /// The kind of horizontal binary operation is defined by \p X86Opcode.
10377 ///
10378 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
10379 /// the two new horizontal binop.
10380 /// When Mode is set, the first horizontal binop dag node would take as input
10381 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
10382 /// horizontal binop dag node would take as input the lower 128-bit of V1
10383 /// and the upper 128-bit of V1.
10384 ///   Example:
10385 ///     HADD V0_LO, V0_HI
10386 ///     HADD V1_LO, V1_HI
10387 ///
10388 /// Otherwise, the first horizontal binop dag node takes as input the lower
10389 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
10390 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
10391 ///   Example:
10392 ///     HADD V0_LO, V1_LO
10393 ///     HADD V0_HI, V1_HI
10394 ///
10395 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
10396 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
10397 /// the upper 128-bits of the result.
10398 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
10399                                      const SDLoc &DL, SelectionDAG &DAG,
10400                                      unsigned X86Opcode, bool Mode,
10401                                      bool isUndefLO, bool isUndefHI) {
10402   MVT VT = V0.getSimpleValueType();
10403   assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
10404          "Invalid nodes in input!");
10405
10406   unsigned NumElts = VT.getVectorNumElements();
10407   SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
10408   SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
10409   SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
10410   SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
10411   MVT NewVT = V0_LO.getSimpleValueType();
10412
10413   SDValue LO = DAG.getUNDEF(NewVT);
10414   SDValue HI = DAG.getUNDEF(NewVT);
10415
10416   if (Mode) {
10417     // Don't emit a horizontal binop if the result is expected to be UNDEF.
10418     if (!isUndefLO && !V0->isUndef())
10419       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
10420     if (!isUndefHI && !V1->isUndef())
10421       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
10422   } else {
10423     // Don't emit a horizontal binop if the result is expected to be UNDEF.
10424     if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
10425       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
10426
10427     if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
10428       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
10429   }
10430
10431   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
10432 }
10433
10434 /// Returns true iff \p BV builds a vector with the result equivalent to
10435 /// the result of ADDSUB/SUBADD operation.
10436 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
10437 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
10438 /// \p Opnd0 and \p Opnd1.
10439 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
10440                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
10441                              SDValue &Opnd0, SDValue &Opnd1,
10442                              unsigned &NumExtracts,
10443                              bool &IsSubAdd) {
10444
10445   MVT VT = BV->getSimpleValueType(0);
10446   if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
10447     return false;
10448
10449   unsigned NumElts = VT.getVectorNumElements();
10450   SDValue InVec0 = DAG.getUNDEF(VT);
10451   SDValue InVec1 = DAG.getUNDEF(VT);
10452
10453   NumExtracts = 0;
10454
10455   // Odd-numbered elements in the input build vector are obtained from
10456   // adding/subtracting two integer/float elements.
10457   // Even-numbered elements in the input build vector are obtained from
10458   // subtracting/adding two integer/float elements.
10459   unsigned Opc[2] = {0, 0};
10460   for (unsigned i = 0, e = NumElts; i != e; ++i) {
10461     SDValue Op = BV->getOperand(i);
10462
10463     // Skip 'undef' values.
10464     unsigned Opcode = Op.getOpcode();
10465     if (Opcode == ISD::UNDEF)
10466       continue;
10467
10468     // Early exit if we found an unexpected opcode.
10469     if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
10470       return false;
10471
10472     SDValue Op0 = Op.getOperand(0);
10473     SDValue Op1 = Op.getOperand(1);
10474
10475     // Try to match the following pattern:
10476     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
10477     // Early exit if we cannot match that sequence.
10478     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10479         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10480         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10481         Op0.getOperand(1) != Op1.getOperand(1))
10482       return false;
10483
10484     unsigned I0 = Op0.getConstantOperandVal(1);
10485     if (I0 != i)
10486       return false;
10487
10488     // We found a valid add/sub node, make sure its the same opcode as previous
10489     // elements for this parity.
10490     if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
10491       return false;
10492     Opc[i % 2] = Opcode;
10493
10494     // Update InVec0 and InVec1.
10495     if (InVec0.isUndef()) {
10496       InVec0 = Op0.getOperand(0);
10497       if (InVec0.getSimpleValueType() != VT)
10498         return false;
10499     }
10500     if (InVec1.isUndef()) {
10501       InVec1 = Op1.getOperand(0);
10502       if (InVec1.getSimpleValueType() != VT)
10503         return false;
10504     }
10505
10506     // Make sure that operands in input to each add/sub node always
10507     // come from a same pair of vectors.
10508     if (InVec0 != Op0.getOperand(0)) {
10509       if (Opcode == ISD::FSUB)
10510         return false;
10511
10512       // FADD is commutable. Try to commute the operands
10513       // and then test again.
10514       std::swap(Op0, Op1);
10515       if (InVec0 != Op0.getOperand(0))
10516         return false;
10517     }
10518
10519     if (InVec1 != Op1.getOperand(0))
10520       return false;
10521
10522     // Increment the number of extractions done.
10523     ++NumExtracts;
10524   }
10525
10526   // Ensure we have found an opcode for both parities and that they are
10527   // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
10528   // inputs are undef.
10529   if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
10530       InVec0.isUndef() || InVec1.isUndef())
10531     return false;
10532
10533   IsSubAdd = Opc[0] == ISD::FADD;
10534
10535   Opnd0 = InVec0;
10536   Opnd1 = InVec1;
10537   return true;
10538 }
10539
10540 /// Returns true if is possible to fold MUL and an idiom that has already been
10541 /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
10542 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
10543 /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
10544 ///
10545 /// Prior to calling this function it should be known that there is some
10546 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
10547 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
10548 /// before replacement of such SDNode with ADDSUB operation. Thus the number
10549 /// of \p Opnd0 uses is expected to be equal to 2.
10550 /// For example, this function may be called for the following IR:
10551 ///    %AB = fmul fast <2 x double> %A, %B
10552 ///    %Sub = fsub fast <2 x double> %AB, %C
10553 ///    %Add = fadd fast <2 x double> %AB, %C
10554 ///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
10555 ///                            <2 x i32> <i32 0, i32 3>
10556 /// There is a def for %Addsub here, which potentially can be replaced by
10557 /// X86ISD::ADDSUB operation:
10558 ///    %Addsub = X86ISD::ADDSUB %AB, %C
10559 /// and such ADDSUB can further be replaced with FMADDSUB:
10560 ///    %Addsub = FMADDSUB %A, %B, %C.
10561 ///
10562 /// The main reason why this method is called before the replacement of the
10563 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
10564 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
10565 /// FMADDSUB is.
10566 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
10567                                  SelectionDAG &DAG,
10568                                  SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
10569                                  unsigned ExpectedUses) {
10570   if (Opnd0.getOpcode() != ISD::FMUL ||
10571       !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
10572     return false;
10573
10574   // FIXME: These checks must match the similar ones in
10575   // DAGCombiner::visitFADDForFMACombine. It would be good to have one
10576   // function that would answer if it is Ok to fuse MUL + ADD to FMADD
10577   // or MUL + ADDSUB to FMADDSUB.
10578   const TargetOptions &Options = DAG.getTarget().Options;
10579   bool AllowFusion =
10580       (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
10581   if (!AllowFusion)
10582     return false;
10583
10584   Opnd2 = Opnd1;
10585   Opnd1 = Opnd0.getOperand(1);
10586   Opnd0 = Opnd0.getOperand(0);
10587
10588   return true;
10589 }
10590
10591 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
10592 /// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
10593 /// X86ISD::FMSUBADD node.
10594 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
10595                                        const X86Subtarget &Subtarget,
10596                                        SelectionDAG &DAG) {
10597   SDValue Opnd0, Opnd1;
10598   unsigned NumExtracts;
10599   bool IsSubAdd;
10600   if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
10601                         IsSubAdd))
10602     return SDValue();
10603
10604   MVT VT = BV->getSimpleValueType(0);
10605   SDLoc DL(BV);
10606
10607   // Try to generate X86ISD::FMADDSUB node here.
10608   SDValue Opnd2;
10609   if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
10610     unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
10611     return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
10612   }
10613
10614   // We only support ADDSUB.
10615   if (IsSubAdd)
10616     return SDValue();
10617
10618   // There are no known X86 targets with 512-bit ADDSUB instructions!
10619   // Convert to blend(fsub,fadd).
10620   if (VT.is512BitVector()) {
10621     SmallVector<int> Mask;
10622     for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
10623         Mask.push_back(I);
10624         Mask.push_back(I + E + 1);
10625     }
10626     SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
10627     SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
10628     return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
10629   }
10630
10631   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
10632 }
10633
10634 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
10635                              unsigned &HOpcode, SDValue &V0, SDValue &V1) {
10636   // Initialize outputs to known values.
10637   MVT VT = BV->getSimpleValueType(0);
10638   HOpcode = ISD::DELETED_NODE;
10639   V0 = DAG.getUNDEF(VT);
10640   V1 = DAG.getUNDEF(VT);
10641
10642   // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
10643   // half of the result is calculated independently from the 128-bit halves of
10644   // the inputs, so that makes the index-checking logic below more complicated.
10645   unsigned NumElts = VT.getVectorNumElements();
10646   unsigned GenericOpcode = ISD::DELETED_NODE;
10647   unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
10648   unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
10649   unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
10650   for (unsigned i = 0; i != Num128BitChunks; ++i) {
10651     for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
10652       // Ignore undef elements.
10653       SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
10654       if (Op.isUndef())
10655         continue;
10656
10657       // If there's an opcode mismatch, we're done.
10658       if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
10659         return false;
10660
10661       // Initialize horizontal opcode.
10662       if (HOpcode == ISD::DELETED_NODE) {
10663         GenericOpcode = Op.getOpcode();
10664         switch (GenericOpcode) {
10665         case ISD::ADD: HOpcode = X86ISD::HADD; break;
10666         case ISD::SUB: HOpcode = X86ISD::HSUB; break;
10667         case ISD::FADD: HOpcode = X86ISD::FHADD; break;
10668         case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
10669         default: return false;
10670         }
10671       }
10672
10673       SDValue Op0 = Op.getOperand(0);
10674       SDValue Op1 = Op.getOperand(1);
10675       if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10676           Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10677           Op0.getOperand(0) != Op1.getOperand(0) ||
10678           !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10679           !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
10680         return false;
10681
10682       // The source vector is chosen based on which 64-bit half of the
10683       // destination vector is being calculated.
10684       if (j < NumEltsIn64Bits) {
10685         if (V0.isUndef())
10686           V0 = Op0.getOperand(0);
10687       } else {
10688         if (V1.isUndef())
10689           V1 = Op0.getOperand(0);
10690       }
10691
10692       SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
10693       if (SourceVec != Op0.getOperand(0))
10694         return false;
10695
10696       // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
10697       unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
10698       unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
10699       unsigned ExpectedIndex = i * NumEltsIn128Bits +
10700                                (j % NumEltsIn64Bits) * 2;
10701       if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
10702         continue;
10703
10704       // If this is not a commutative op, this does not match.
10705       if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
10706         return false;
10707
10708       // Addition is commutative, so try swapping the extract indexes.
10709       // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
10710       if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
10711         continue;
10712
10713       // Extract indexes do not match horizontal requirement.
10714       return false;
10715     }
10716   }
10717   // We matched. Opcode and operands are returned by reference as arguments.
10718   return true;
10719 }
10720
10721 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
10722                                     SelectionDAG &DAG, unsigned HOpcode,
10723                                     SDValue V0, SDValue V1) {
10724   // If either input vector is not the same size as the build vector,
10725   // extract/insert the low bits to the correct size.
10726   // This is free (examples: zmm --> xmm, xmm --> ymm).
10727   MVT VT = BV->getSimpleValueType(0);
10728   unsigned Width = VT.getSizeInBits();
10729   if (V0.getValueSizeInBits() > Width)
10730     V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
10731   else if (V0.getValueSizeInBits() < Width)
10732     V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
10733
10734   if (V1.getValueSizeInBits() > Width)
10735     V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
10736   else if (V1.getValueSizeInBits() < Width)
10737     V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
10738
10739   unsigned NumElts = VT.getVectorNumElements();
10740   APInt DemandedElts = APInt::getAllOnes(NumElts);
10741   for (unsigned i = 0; i != NumElts; ++i)
10742     if (BV->getOperand(i).isUndef())
10743       DemandedElts.clearBit(i);
10744
10745   // If we don't need the upper xmm, then perform as a xmm hop.
10746   unsigned HalfNumElts = NumElts / 2;
10747   if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10748     MVT HalfVT = VT.getHalfNumVectorElementsVT();
10749     V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10750     V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10751     SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10752     return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10753   }
10754
10755   return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10756 }
10757
10758 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10759 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10760                                    const X86Subtarget &Subtarget,
10761                                    SelectionDAG &DAG) {
10762   // We need at least 2 non-undef elements to make this worthwhile by default.
10763   unsigned NumNonUndefs =
10764       count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10765   if (NumNonUndefs < 2)
10766     return SDValue();
10767
10768   // There are 4 sets of horizontal math operations distinguished by type:
10769   // int/FP at 128-bit/256-bit. Each type was introduced with a different
10770   // subtarget feature. Try to match those "native" patterns first.
10771   MVT VT = BV->getSimpleValueType(0);
10772   if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10773       ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10774       ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10775       ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10776     unsigned HOpcode;
10777     SDValue V0, V1;
10778     if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10779       return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10780   }
10781
10782   // Try harder to match 256-bit ops by using extract/concat.
10783   if (!Subtarget.hasAVX() || !VT.is256BitVector())
10784     return SDValue();
10785
10786   // Count the number of UNDEF operands in the build_vector in input.
10787   unsigned NumElts = VT.getVectorNumElements();
10788   unsigned Half = NumElts / 2;
10789   unsigned NumUndefsLO = 0;
10790   unsigned NumUndefsHI = 0;
10791   for (unsigned i = 0, e = Half; i != e; ++i)
10792     if (BV->getOperand(i)->isUndef())
10793       NumUndefsLO++;
10794
10795   for (unsigned i = Half, e = NumElts; i != e; ++i)
10796     if (BV->getOperand(i)->isUndef())
10797       NumUndefsHI++;
10798
10799   SDLoc DL(BV);
10800   SDValue InVec0, InVec1;
10801   if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10802     SDValue InVec2, InVec3;
10803     unsigned X86Opcode;
10804     bool CanFold = true;
10805
10806     if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10807         isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10808                               InVec3) &&
10809         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10810         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10811       X86Opcode = X86ISD::HADD;
10812     else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10813                                    InVec1) &&
10814              isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10815                                    InVec3) &&
10816              ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10817              ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10818       X86Opcode = X86ISD::HSUB;
10819     else
10820       CanFold = false;
10821
10822     if (CanFold) {
10823       // Do not try to expand this build_vector into a pair of horizontal
10824       // add/sub if we can emit a pair of scalar add/sub.
10825       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10826         return SDValue();
10827
10828       // Convert this build_vector into a pair of horizontal binops followed by
10829       // a concat vector. We must adjust the outputs from the partial horizontal
10830       // matching calls above to account for undefined vector halves.
10831       SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10832       SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10833       assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10834       bool isUndefLO = NumUndefsLO == Half;
10835       bool isUndefHI = NumUndefsHI == Half;
10836       return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10837                                    isUndefHI);
10838     }
10839   }
10840
10841   if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10842       VT == MVT::v16i16) {
10843     unsigned X86Opcode;
10844     if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10845       X86Opcode = X86ISD::HADD;
10846     else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10847                                    InVec1))
10848       X86Opcode = X86ISD::HSUB;
10849     else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10850                                    InVec1))
10851       X86Opcode = X86ISD::FHADD;
10852     else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10853                                    InVec1))
10854       X86Opcode = X86ISD::FHSUB;
10855     else
10856       return SDValue();
10857
10858     // Don't try to expand this build_vector into a pair of horizontal add/sub
10859     // if we can simply emit a pair of scalar add/sub.
10860     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10861       return SDValue();
10862
10863     // Convert this build_vector into two horizontal add/sub followed by
10864     // a concat vector.
10865     bool isUndefLO = NumUndefsLO == Half;
10866     bool isUndefHI = NumUndefsHI == Half;
10867     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10868                                  isUndefLO, isUndefHI);
10869   }
10870
10871   return SDValue();
10872 }
10873
10874 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10875                           SelectionDAG &DAG);
10876
10877 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
10878 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
10879 /// just apply the bit to the vectors.
10880 /// NOTE: Its not in our interest to start make a general purpose vectorizer
10881 /// from this, but enough scalar bit operations are created from the later
10882 /// legalization + scalarization stages to need basic support.
10883 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10884                                        const X86Subtarget &Subtarget,
10885                                        SelectionDAG &DAG) {
10886   SDLoc DL(Op);
10887   MVT VT = Op->getSimpleValueType(0);
10888   unsigned NumElems = VT.getVectorNumElements();
10889   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10890
10891   // Check that all elements have the same opcode.
10892   // TODO: Should we allow UNDEFS and if so how many?
10893   unsigned Opcode = Op->getOperand(0).getOpcode();
10894   for (unsigned i = 1; i < NumElems; ++i)
10895     if (Opcode != Op->getOperand(i).getOpcode())
10896       return SDValue();
10897
10898   // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10899   bool IsShift = false;
10900   switch (Opcode) {
10901   default:
10902     return SDValue();
10903   case ISD::SHL:
10904   case ISD::SRL:
10905   case ISD::SRA:
10906     IsShift = true;
10907     break;
10908   case ISD::AND:
10909   case ISD::XOR:
10910   case ISD::OR:
10911     // Don't do this if the buildvector is a splat - we'd replace one
10912     // constant with an entire vector.
10913     if (Op->getSplatValue())
10914       return SDValue();
10915     if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10916       return SDValue();
10917     break;
10918   }
10919
10920   SmallVector<SDValue, 4> LHSElts, RHSElts;
10921   for (SDValue Elt : Op->ops()) {
10922     SDValue LHS = Elt.getOperand(0);
10923     SDValue RHS = Elt.getOperand(1);
10924
10925     // We expect the canonicalized RHS operand to be the constant.
10926     if (!isa<ConstantSDNode>(RHS))
10927       return SDValue();
10928
10929     // Extend shift amounts.
10930     if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10931       if (!IsShift)
10932         return SDValue();
10933       RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10934     }
10935
10936     LHSElts.push_back(LHS);
10937     RHSElts.push_back(RHS);
10938   }
10939
10940   // Limit to shifts by uniform immediates.
10941   // TODO: Only accept vXi8/vXi64 special cases?
10942   // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10943   if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10944     return SDValue();
10945
10946   SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10947   SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10948   SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10949
10950   if (!IsShift)
10951     return Res;
10952
10953   // Immediately lower the shift to ensure the constant build vector doesn't
10954   // get converted to a constant pool before the shift is lowered.
10955   return LowerShift(Res, Subtarget, DAG);
10956 }
10957
10958 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
10959 /// functionality to do this, so it's all zeros, all ones, or some derivation
10960 /// that is cheap to calculate.
10961 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10962                                          const X86Subtarget &Subtarget) {
10963   SDLoc DL(Op);
10964   MVT VT = Op.getSimpleValueType();
10965
10966   // Vectors containing all zeros can be matched by pxor and xorps.
10967   if (ISD::isBuildVectorAllZeros(Op.getNode()))
10968     return Op;
10969
10970   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10971   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10972   // vpcmpeqd on 256-bit vectors.
10973   if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10974     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10975       return Op;
10976
10977     return getOnesVector(VT, DAG, DL);
10978   }
10979
10980   return SDValue();
10981 }
10982
10983 /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10984 /// from a vector of source values and a vector of extraction indices.
10985 /// The vectors might be manipulated to match the type of the permute op.
10986 static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10987                                      SDLoc &DL, SelectionDAG &DAG,
10988                                      const X86Subtarget &Subtarget) {
10989   MVT ShuffleVT = VT;
10990   EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10991   unsigned NumElts = VT.getVectorNumElements();
10992   unsigned SizeInBits = VT.getSizeInBits();
10993
10994   // Adjust IndicesVec to match VT size.
10995   assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10996          "Illegal variable permute mask size");
10997   if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10998     // Narrow/widen the indices vector to the correct size.
10999     if (IndicesVec.getValueSizeInBits() > SizeInBits)
11000       IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
11001                                     NumElts * VT.getScalarSizeInBits());
11002     else if (IndicesVec.getValueSizeInBits() < SizeInBits)
11003       IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
11004                                   SDLoc(IndicesVec), SizeInBits);
11005     // Zero-extend the index elements within the vector.
11006     if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
11007       IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
11008                                IndicesVT, IndicesVec);
11009   }
11010   IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
11011
11012   // Handle SrcVec that don't match VT type.
11013   if (SrcVec.getValueSizeInBits() != SizeInBits) {
11014     if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
11015       // Handle larger SrcVec by treating it as a larger permute.
11016       unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
11017       VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
11018       IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
11019       IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
11020                                   Subtarget, DAG, SDLoc(IndicesVec));
11021       SDValue NewSrcVec =
11022           createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
11023       if (NewSrcVec)
11024         return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
11025       return SDValue();
11026     } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
11027       // Widen smaller SrcVec to match VT.
11028       SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
11029     } else
11030       return SDValue();
11031   }
11032
11033   auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
11034     assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
11035     EVT SrcVT = Idx.getValueType();
11036     unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
11037     uint64_t IndexScale = 0;
11038     uint64_t IndexOffset = 0;
11039
11040     // If we're scaling a smaller permute op, then we need to repeat the
11041     // indices, scaling and offsetting them as well.
11042     // e.g. v4i32 -> v16i8 (Scale = 4)
11043     // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
11044     // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
11045     for (uint64_t i = 0; i != Scale; ++i) {
11046       IndexScale |= Scale << (i * NumDstBits);
11047       IndexOffset |= i << (i * NumDstBits);
11048     }
11049
11050     Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
11051                       DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
11052     Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
11053                       DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
11054     return Idx;
11055   };
11056
11057   unsigned Opcode = 0;
11058   switch (VT.SimpleTy) {
11059   default:
11060     break;
11061   case MVT::v16i8:
11062     if (Subtarget.hasSSSE3())
11063       Opcode = X86ISD::PSHUFB;
11064     break;
11065   case MVT::v8i16:
11066     if (Subtarget.hasVLX() && Subtarget.hasBWI())
11067       Opcode = X86ISD::VPERMV;
11068     else if (Subtarget.hasSSSE3()) {
11069       Opcode = X86ISD::PSHUFB;
11070       ShuffleVT = MVT::v16i8;
11071     }
11072     break;
11073   case MVT::v4f32:
11074   case MVT::v4i32:
11075     if (Subtarget.hasAVX()) {
11076       Opcode = X86ISD::VPERMILPV;
11077       ShuffleVT = MVT::v4f32;
11078     } else if (Subtarget.hasSSSE3()) {
11079       Opcode = X86ISD::PSHUFB;
11080       ShuffleVT = MVT::v16i8;
11081     }
11082     break;
11083   case MVT::v2f64:
11084   case MVT::v2i64:
11085     if (Subtarget.hasAVX()) {
11086       // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
11087       IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11088       Opcode = X86ISD::VPERMILPV;
11089       ShuffleVT = MVT::v2f64;
11090     } else if (Subtarget.hasSSE41()) {
11091       // SSE41 can compare v2i64 - select between indices 0 and 1.
11092       return DAG.getSelectCC(
11093           DL, IndicesVec,
11094           getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
11095           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
11096           DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
11097           ISD::CondCode::SETEQ);
11098     }
11099     break;
11100   case MVT::v32i8:
11101     if (Subtarget.hasVLX() && Subtarget.hasVBMI())
11102       Opcode = X86ISD::VPERMV;
11103     else if (Subtarget.hasXOP()) {
11104       SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
11105       SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
11106       SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
11107       SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
11108       return DAG.getNode(
11109           ISD::CONCAT_VECTORS, DL, VT,
11110           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
11111           DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
11112     } else if (Subtarget.hasAVX()) {
11113       SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
11114       SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
11115       SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
11116       SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
11117       auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
11118                               ArrayRef<SDValue> Ops) {
11119         // Permute Lo and Hi and then select based on index range.
11120         // This works as SHUFB uses bits[3:0] to permute elements and we don't
11121         // care about the bit[7] as its just an index vector.
11122         SDValue Idx = Ops[2];
11123         EVT VT = Idx.getValueType();
11124         return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
11125                                DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
11126                                DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
11127                                ISD::CondCode::SETGT);
11128       };
11129       SDValue Ops[] = {LoLo, HiHi, IndicesVec};
11130       return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
11131                               PSHUFBBuilder);
11132     }
11133     break;
11134   case MVT::v16i16:
11135     if (Subtarget.hasVLX() && Subtarget.hasBWI())
11136       Opcode = X86ISD::VPERMV;
11137     else if (Subtarget.hasAVX()) {
11138       // Scale to v32i8 and perform as v32i8.
11139       IndicesVec = ScaleIndices(IndicesVec, 2);
11140       return DAG.getBitcast(
11141           VT, createVariablePermute(
11142                   MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
11143                   DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
11144     }
11145     break;
11146   case MVT::v8f32:
11147   case MVT::v8i32:
11148     if (Subtarget.hasAVX2())
11149       Opcode = X86ISD::VPERMV;
11150     else if (Subtarget.hasAVX()) {
11151       SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
11152       SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11153                                           {0, 1, 2, 3, 0, 1, 2, 3});
11154       SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11155                                           {4, 5, 6, 7, 4, 5, 6, 7});
11156       if (Subtarget.hasXOP())
11157         return DAG.getBitcast(
11158             VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
11159                             IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11160       // Permute Lo and Hi and then select based on index range.
11161       // This works as VPERMILPS only uses index bits[0:1] to permute elements.
11162       SDValue Res = DAG.getSelectCC(
11163           DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
11164           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
11165           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
11166           ISD::CondCode::SETGT);
11167       return DAG.getBitcast(VT, Res);
11168     }
11169     break;
11170   case MVT::v4i64:
11171   case MVT::v4f64:
11172     if (Subtarget.hasAVX512()) {
11173       if (!Subtarget.hasVLX()) {
11174         MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
11175         SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
11176                                 SDLoc(SrcVec));
11177         IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
11178                                     DAG, SDLoc(IndicesVec));
11179         SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
11180                                             DAG, Subtarget);
11181         return extract256BitVector(Res, 0, DAG, DL);
11182       }
11183       Opcode = X86ISD::VPERMV;
11184     } else if (Subtarget.hasAVX()) {
11185       SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
11186       SDValue LoLo =
11187           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
11188       SDValue HiHi =
11189           DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
11190       // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
11191       IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11192       if (Subtarget.hasXOP())
11193         return DAG.getBitcast(
11194             VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
11195                             IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11196       // Permute Lo and Hi and then select based on index range.
11197       // This works as VPERMILPD only uses index bit[1] to permute elements.
11198       SDValue Res = DAG.getSelectCC(
11199           DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
11200           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
11201           DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
11202           ISD::CondCode::SETGT);
11203       return DAG.getBitcast(VT, Res);
11204     }
11205     break;
11206   case MVT::v64i8:
11207     if (Subtarget.hasVBMI())
11208       Opcode = X86ISD::VPERMV;
11209     break;
11210   case MVT::v32i16:
11211     if (Subtarget.hasBWI())
11212       Opcode = X86ISD::VPERMV;
11213     break;
11214   case MVT::v16f32:
11215   case MVT::v16i32:
11216   case MVT::v8f64:
11217   case MVT::v8i64:
11218     if (Subtarget.hasAVX512())
11219       Opcode = X86ISD::VPERMV;
11220     break;
11221   }
11222   if (!Opcode)
11223     return SDValue();
11224
11225   assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
11226          (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
11227          "Illegal variable permute shuffle type");
11228
11229   uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
11230   if (Scale > 1)
11231     IndicesVec = ScaleIndices(IndicesVec, Scale);
11232
11233   EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
11234   IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
11235
11236   SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
11237   SDValue Res = Opcode == X86ISD::VPERMV
11238                     ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
11239                     : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
11240   return DAG.getBitcast(VT, Res);
11241 }
11242
11243 // Attempts to lower a BUILD_VECTOR made of extract-of-extract chains, i.e. a
11244 // permutation of one vector by the elements of a non-constant index vector:
11245 // (build_vector (extract_elt V, (extract_elt I, 0)),
11246 //               (extract_elt V, (extract_elt I, 1)),
11247 //                    ...
11248 // ->
11249 // (vpermv I, V)
11250 //
11251 // TODO: Handle undefs
11252 // TODO: Utilize pshufb and zero mask blending to support more efficient
11253 // construction of vectors with constant-0 elements.
11254 static SDValue
11255 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
11256                                    const X86Subtarget &Subtarget) {
11257   SDValue SrcVec, IndicesVec;
11258   const unsigned NumLanes = V.getNumOperands();
11259   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
11260     // Expect (extract_elt SrcVec, (extract_elt IndicesVec, Lane)) here.
11261     SDValue Elt = V.getOperand(Lane);
11262     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11263       return SDValue();
11264
11265     // The outer extract must read from the one common source vector.
11266     SDValue EltSrc = Elt.getOperand(0);
11267     if (!SrcVec)
11268       SrcVec = EltSrc;
11269     if (SrcVec != EltSrc)
11270       return SDValue();
11271
11272     // The element index may be wrapped in a zero/sign extension; look at
11273     // the value being extended instead.
11274     SDValue LaneIdx = Elt.getOperand(1);
11275     const unsigned LaneIdxOpc = LaneIdx.getOpcode();
11276     if (LaneIdxOpc == ISD::ZERO_EXTEND || LaneIdxOpc == ISD::SIGN_EXTEND)
11277       LaneIdx = LaneIdx.getOperand(0);
11278     if (LaneIdx.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11279       return SDValue();
11280
11281     // The inner extract must read from the one common indices vector.
11282     SDValue IdxSrc = LaneIdx.getOperand(0);
11283     if (!IndicesVec)
11284       IndicesVec = IdxSrc;
11285     if (IndicesVec != IdxSrc)
11286       return SDValue();
11287
11288     // ... and it must read the constant position that matches this lane.
11289     auto *Pos = dyn_cast<ConstantSDNode>(LaneIdx.getOperand(1));
11290     if (!(Pos && Pos->getAPIntValue() == Lane))
11291       return SDValue();
11292   }
11293
11294   SDLoc DL(V);
11295   return createVariablePermute(V.getSimpleValueType(), SrcVec, IndicesVec, DL,
11296                                DAG, Subtarget);
11297 }
11298
11299 SDValue
11300 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
        // Lower the BUILD_VECTOR node Op by trying a sequence of strategies in
        // order and returning the first result produced. Several paths return an
        // empty SDValue() instead of a replacement node (presumably so that the
        // caller falls back to generic handling - confirm against the caller).
11301   SDLoc dl(Op);
11302
        // VT/EltVT describe the result vector. OpEltVT is the type of the first
        // scalar operand; it is used below when creating undef / freeze(undef)
        // scalar operands.
11303   MVT VT = Op.getSimpleValueType();
11304   MVT EltVT = VT.getVectorElementType();
11305   MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
11306   unsigned NumElems = Op.getNumOperands();
11307
11308   // Generate vectors for predicate vectors.
11309   if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
11310     return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
11311
        // bf16 element vectors get their own lowering when BF16 is available.
11312   if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16())
11313     return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
11314
11315   if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
11316     return VectorConstant;
11317
        // Classify the operands. Each APInt below holds one bit per element:
        //   UndefMask       - operand is undef
        //   FrozenUndefMask - operand is freeze(undef)
        //   ZeroMask        - operand satisfies X86::isZeroNode
        //   NonZeroMask     - any other operand
        // Undef and freeze(undef) operands are skipped before the constant check,
        // so they are not added to Values and do not reduce NumConstants.
        // OneUseFrozenUndefs stays true only while every freeze(undef) operand
        // seen has a single use.
11318   unsigned EVTBits = EltVT.getSizeInBits();
11319   APInt UndefMask = APInt::getZero(NumElems);
11320   APInt FrozenUndefMask = APInt::getZero(NumElems);
11321   APInt ZeroMask = APInt::getZero(NumElems);
11322   APInt NonZeroMask = APInt::getZero(NumElems);
11323   bool IsAllConstants = true;
11324   bool OneUseFrozenUndefs = true;
11325   SmallSet<SDValue, 8> Values;
11326   unsigned NumConstants = NumElems;
11327   for (unsigned i = 0; i < NumElems; ++i) {
11328     SDValue Elt = Op.getOperand(i);
11329     if (Elt.isUndef()) {
11330       UndefMask.setBit(i);
11331       continue;
11332     }
11333     if (ISD::isFreezeUndef(Elt.getNode())) {
11334       OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
11335       FrozenUndefMask.setBit(i);
11336       continue;
11337     }
11338     Values.insert(Elt);
11339     if (!isIntOrFPConstant(Elt)) {
11340       IsAllConstants = false;
11341       NumConstants--;
11342     }
11343     if (X86::isZeroNode(Elt)) {
11344       ZeroMask.setBit(i);
11345     } else {
11346       NonZeroMask.setBit(i);
11347     }
11348   }
11349
11350   // All undef vector. Return an UNDEF.
11351   if (UndefMask.isAllOnes())
11352     return DAG.getUNDEF(VT);
11353
11354   // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
11355   if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
11356     return DAG.getFreeze(DAG.getUNDEF(VT));
11357
11358   // All undef/freeze(undef)/zero vector. Return a zero vector.
11359   if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
11360     return getZeroVector(VT, Subtarget, DAG, dl);
11361
11362   // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
11363   // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
11364   // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
11365   // and blend the FREEZE-UNDEF operands back in.
11366   // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
11367   if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
11368       NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
11369     SmallVector<int, 16> BlendMask(NumElems, -1);
11370     SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
11371     for (unsigned i = 0; i < NumElems; ++i) {
            // Undef lanes keep the -1 mask entry (the value BlendMask was
            // initialised with) and the undef operand in Elts.
11372       if (UndefMask[i]) {
11373         BlendMask[i] = -1;
11374         continue;
11375       }
            // Take lane i from EltsBV by default; freeze(undef) lanes are
            // redirected to lane i of FrozenUndefBV (the second shuffle input)
            // by adding NumElems, and their slot in Elts is left undef.
11376       BlendMask[i] = i;
11377       if (!FrozenUndefMask[i])
11378         Elts[i] = Op.getOperand(i);
11379       else
11380         BlendMask[i] += NumElems;
11381     }
11382     SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
11383     SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
11384     SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
11385     return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
11386   }
11387
11388   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
11389
11390   // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
11391   // be better off lowering to a smaller build vector and padding with
11392   // undef/zero.
11393   if ((VT.is256BitVector() || VT.is512BitVector()) &&
11394       !isFoldableUseOfShuffle(BV)) {
          // Start with the upper half as the candidate range to drop.
11395     unsigned UpperElems = NumElems / 2;
11396     APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
          // countl_one() counts set bits from the most significant end, i.e. the
          // run of undef/zero elements counting down from the last element.
11397     unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
11398     if (NumUpperUndefsOrZeros >= UpperElems) {
            // For 512-bit vectors, drop the upper three quarters instead when
            // all of them qualify.
11399       if (VT.is512BitVector() &&
11400           NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
11401         UpperElems = NumElems - (NumElems / 4);
11402       // If freeze(undef) is in any upper elements, force to zero.
            // UndefUpper is true only when every dropped element is plain undef;
            // its negation is passed to widenSubVector (presumably the zero-fill
            // flag - confirm against widenSubVector's declaration).
11403       bool UndefUpper = UndefMask.countl_one() >= UpperElems;
11404       MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
11405       SDValue NewBV =
11406           DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
11407       return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
11408     }
11409   }
11410
        // Try a series of specialised BUILD_VECTOR lowerings; a result is only
        // returned when the helper produced a non-empty SDValue.
11411   if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
11412     return AddSub;
11413   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
11414     return HorizontalOp;
11415   if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
11416     return Broadcast;
11417   if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
11418     return BitOp;
11419
        // Element counts derived from the classification masks above.
11420   unsigned NumZero = ZeroMask.popcount();
11421   unsigned NumNonZero = NonZeroMask.popcount();
11422
11423   // If we are inserting one variable into a vector of non-zero constants, try
11424   // to avoid loading each constant element as a scalar. Load the constants as a
11425   // vector and then insert the variable scalar element. If insertion is not
11426   // supported, fall back to a shuffle to get the scalar blended with the
11427   // constants. Insertion into a zero vector is handled as a special-case
11428   // somewhere below here.
11429   if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
11430       (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
11431        isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
11432     // Create an all-constant vector. The variable element in the old
11433     // build vector is replaced by undef in the constant vector. Save the
11434     // variable scalar element and its index for use in the insertelement.
11435     LLVMContext &Context = *DAG.getContext();
11436     Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
11437     SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
11438     SDValue VarElt;
11439     SDValue InsIndex;
11440     for (unsigned i = 0; i != NumElems; ++i) {
11441       SDValue Elt = Op.getOperand(i);
11442       if (auto *C = dyn_cast<ConstantSDNode>(Elt))
11443         ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
11444       else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
11445         ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
11446       else if (!Elt.isUndef()) {
11447         assert(!VarElt.getNode() && !InsIndex.getNode() &&
11448                "Expected one variable element in this vector");
11449         VarElt = Elt;
11450         InsIndex = DAG.getVectorIdxConstant(i, dl);
11451       }
11452     }
11453     Constant *CV = ConstantVector::get(ConstVecOps);
11454     SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
11455
11456     // The constants we just created may not be legal (eg, floating point). We
11457     // must lower the vector right here because we can not guarantee that we'll
11458     // legalize it before loading it. This is also why we could not just create
11459     // a new build vector here. If the build vector contains illegal constants,
11460     // it could get split back up into a series of insert elements.
11461     // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
11462     SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
11463     MachineFunction &MF = DAG.getMachineFunction();
11464     MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
11465     SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
          // A plain INSERT_VECTOR_ELT is emitted when the variable element lies
          // within the low 128 bits of the vector.
11466     unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
11467     unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
11468     if (InsertC < NumEltsInLow128Bits)
11469       return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
11470
11471     // There's no good way to insert into the high elements of a >128-bit
11472     // vector, so use shuffles to avoid an extract/insert sequence.
11473     assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
11474     assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
          // Identity mask over Ld, except lane InsertC which takes index NumElts,
          // i.e. element 0 of the second shuffle input S2V.
11475     SmallVector<int, 8> ShuffleMask;
11476     unsigned NumElts = VT.getVectorNumElements();
11477     for (unsigned i = 0; i != NumElts; ++i)
11478       ShuffleMask.push_back(i == InsertC ? NumElts : i);
11479     SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
11480     return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
11481   }
11482
11483   // Special case for single non-zero, non-undef, element.
11484   if (NumNonZero == 1) {
          // Idx is the position of the only set bit in NonZeroMask.
11485     unsigned Idx = NonZeroMask.countr_zero();
11486     SDValue Item = Op.getOperand(Idx);
11487
11488     // If we have a constant or non-constant insertion into the low element of
11489     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
11490     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
11491     // depending on what the source datatype is.
11492     if (Idx == 0) {
            // With no zero elements, every other element is undef or
            // freeze(undef), so a bare SCALAR_TO_VECTOR is returned.
11493       if (NumZero == 0)
11494         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11495
11496       if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
11497           EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
11498           (EltVT == MVT::i16 && Subtarget.hasFP16())) {
11499         assert((VT.is128BitVector() || VT.is256BitVector() ||
11500                 VT.is512BitVector()) &&
11501                "Expected an SSE value type!");
11502         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11503         // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
11504         // zero vector.
11505         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11506       }
11507
11508       // We can't directly insert an i8 or i16 into a vector, so zero extend
11509       // it to i32 first.
11510       if (EltVT == MVT::i16 || EltVT == MVT::i8) {
11511         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
11512         MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
11513         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
11514         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11515         return DAG.getBitcast(VT, Item);
11516       }
11517     }
11518
11519     // Is it a vector logical left shift?
          // Matches a 2-element vector <zero, X>: X is placed in a vector via
          // SCALAR_TO_VECTOR and shifted by half of the vector width.
11520     if (NumElems == 2 && Idx == 1 &&
11521         X86::isZeroNode(Op.getOperand(0)) &&
11522         !X86::isZeroNode(Op.getOperand(1))) {
11523       unsigned NumBits = VT.getSizeInBits();
11524       return getVShift(true, VT,
11525                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11526                                    VT, Op.getOperand(1)),
11527                        NumBits/2, DAG, *this, dl);
11528     }
11529
11530     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
11531       return SDValue();
11532
11533     // Otherwise, if this is a vector with i32 or f32 elements, and the element
11534     // is a non-constant being inserted into an element other than the low one,
11535     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
11536     // movd/movss) to move this into the low element, then shuffle it into
11537     // place.
11538     if (EVTBits == 32) {
11539       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11540       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
11541     }
11542   }
11543
11544   // Splat is obviously ok. Let legalizer expand it to a shuffle.
        // Values holds the distinct operands that are neither undef nor
        // freeze(undef), so a size of 1 means all such operands are the same.
11545   if (Values.size() == 1) {
11546     if (EVTBits == 32) {
11547       // Instead of a shuffle like this:
11548       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
11549       // Check if it's possible to issue this instead.
11550       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
11551       unsigned Idx = NonZeroMask.countr_zero();
11552       SDValue Item = Op.getOperand(Idx);
11553       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
11554         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
11555     }
11556     return SDValue();
11557   }
11558
11559   // A vector full of immediates; various special cases are already
11560   // handled, so this is best done with a single constant-pool load.
11561   if (IsAllConstants)
11562     return SDValue();
11563
        // Try to match a permute of one vector by a vector of variable indices.
11564   if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
11565       return V;
11566
11567   // See if we can use a vector load to get all of the elements.
11568   {
11569     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
11570     if (SDValue LD =
11571             EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
11572       return LD;
11573   }
11574
11575   // If this is a splat of pairs of 32-bit elements, we can use a narrower
11576   // build_vector and broadcast it.
11577   // TODO: We could probably generalize this more.
11578   if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
          // Ops is the candidate repeating pair (operands 0 and 1) padded with
          // undef to four elements.
11579     SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
11580                        DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
11581     auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
11582       // Make sure all the even/odd operands match.
11583       for (unsigned i = 2; i != NumElems; ++i)
11584         if (Ops[i % 2] != Op.getOperand(i))
11585           return false;
11586       return true;
11587     };
11588     if (CanSplat(Op, NumElems, Ops)) {
11589       MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
11590       MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
11591       // Create a new build vector and cast to v2i64/v2f64.
11592       SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
11593                                      DAG.getBuildVector(NarrowVT, dl, Ops));
11594       // Broadcast from v2i64/v2f64 and cast to final VT.
11595       MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
11596       return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
11597                                             NewBV));
11598     }
11599   }
11600
11601   // For AVX-length vectors, build the individual 128-bit pieces and use
11602   // shuffles to put them in place.
        // Each step below splits the operand list into two half-width
        // BUILD_VECTORs and concatenates them.
11603   if (VT.getSizeInBits() > 128) {
11604     MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
11605
11606     // Build both the lower and upper subvector.
11607     SDValue Lower =
11608         DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
11609     SDValue Upper = DAG.getBuildVector(
11610         HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
11611
11612     // Recreate the wider vector with the lower and upper part.
11613     return concatSubVectors(Lower, Upper, DAG, dl);
11614   }
11615
11616   // Let legalizer expand 2-wide build_vectors.
11617   if (EVTBits == 64) {
11618     if (NumNonZero == 1) {
11619       // One half is zero or undef.
11620       unsigned Idx = NonZeroMask.countr_zero();
11621       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
11622                                Op.getOperand(Idx));
11623       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
11624     }
11625     return SDValue();
11626   }
11627
11628   // If element VT is < 32 bits, convert it to inserts into a zero vector.
11629   if (EVTBits == 8 && NumElems == 16)
11630     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
11631                                           DAG, Subtarget))
11632       return V;
11633
11634   if (EltVT == MVT::i16 && NumElems == 8)
11635     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
11636                                           DAG, Subtarget))
11637       return V;
11638
11639   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
11640   if (EVTBits == 32 && NumElems == 4)
11641     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
11642       return V;
11643
11644   // With 4 elements and at least one zero, turn it into a number of shuffles.
11645   if (NumElems == 4 && NumZero > 0) {
          // Start with one full vector per element: a zero vector where the
          // NonZeroMask bit is clear, otherwise the scalar placed in a vector.
11646     SmallVector<SDValue, 8> Ops(NumElems);
11647     for (unsigned i = 0; i < 4; ++i) {
11648       bool isZero = !NonZeroMask[i];
11649       if (isZero)
11650         Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
11651       else
11652         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11653     }
11654
          // Merge each adjacent pair (2*i, 2*i+1) into Ops[i], keyed on the two
          // NonZeroMask bits of the pair: bit 0 is the even element, bit 1 the
          // odd element.
11655     for (unsigned i = 0; i < 2; ++i) {
11656       switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
11657         default: llvm_unreachable("Unexpected NonZero count");
11658         case 0:
11659           Ops[i] = Ops[i*2];  // Must be a zero vector.
11660           break;
11661         case 1:
11662           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
11663           break;
11664         case 2:
11665           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11666           break;
11667         case 3:
11668           Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11669           break;
11670       }
11671     }
11672
          // For a pair whose only non-zero element is the odd one (pattern 2),
          // the two lanes taken from that pair are swapped in the final mask.
          // Lanes 0-1 come from Ops[0], lanes 2-3 from Ops[1].
11673     bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
11674     bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
11675     int MaskVec[] = {
11676       Reverse1 ? 1 : 0,
11677       Reverse1 ? 0 : 1,
11678       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
11679       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
11680     };
11681     return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
11682   }
11683
11684   assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
11685
11686   // Check for a build vector from mostly shuffle plus few inserting.
11687   if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
11688     return Sh;
11689
11690   // For SSE 4.1, use insertps to put the high elements into the low element.
        // Element 0 seeds the result (or undef when it is undef); every other
        // non-undef element is added with INSERT_VECTOR_ELT.
11691   if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
11692     SDValue Result;
11693     if (!Op.getOperand(0).isUndef())
11694       Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
11695     else
11696       Result = DAG.getUNDEF(VT);
11697
11698     for (unsigned i = 1; i < NumElems; ++i) {
11699       if (Op.getOperand(i).isUndef()) continue;
11700       Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
11701                            Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
11702     }
11703     return Result;
11704   }
11705
11706   // Otherwise, expand into a number of unpckl*, start by extending each of
11707   // our (non-undef) elements to the full vector width with the element in the
11708   // bottom slot of the vector (which generates no code for SSE).
11709   SmallVector<SDValue, 8> Ops(NumElems);
11710   for (unsigned i = 0; i < NumElems; ++i) {
11711     if (!Op.getOperand(i).isUndef())
11712       Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11713     else
11714       Ops[i] = DAG.getUNDEF(VT);
11715   }
11716
11717   // Next, we iteratively mix elements, e.g. for v4f32:
11718   //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
11719   //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
11720   //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
        // Scale is the number of already-merged elements per input; it doubles on
        // each pass while the number of live entries at the front of Ops halves.
11721   for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
11722     // Generate scaled UNPCKL shuffle mask.
          // The mask takes the low Scale lanes of the first input, then the low
          // Scale lanes of the second input, and leaves the rest undefined.
11723     SmallVector<int, 16> Mask;
11724     for(unsigned i = 0; i != Scale; ++i)
11725       Mask.push_back(i);
11726     for (unsigned i = 0; i != Scale; ++i)
11727       Mask.push_back(NumElems+i);
11728     Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
11729
11730     for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
11731       Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
11732   }
11733   return Ops[0];
11734 }
11735
11736 // 256-bit AVX can use the vinsertf128 instruction
11737 // to create 256-bit vectors from two other 128-bit ones.
11738 // TODO: Detect subvector broadcast here instead of DAG combine?
11739 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
11740                                       const X86Subtarget &Subtarget) {
11741   SDLoc dl(Op);
11742   MVT ResVT = Op.getSimpleValueType();
11743
11744   assert((ResVT.is256BitVector() ||
11745           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
11746
11747   unsigned NumOperands = Op.getNumOperands();
11748   unsigned NumFreezeUndef = 0;
11749   unsigned NumZero = 0;
11750   unsigned NumNonZero = 0;
11751   unsigned NonZeros = 0;
11752   for (unsigned i = 0; i != NumOperands; ++i) {
11753     SDValue SubVec = Op.getOperand(i);
11754     if (SubVec.isUndef())
11755       continue;
11756     if (ISD::isFreezeUndef(SubVec.getNode())) {
11757         // If the freeze(undef) has multiple uses then we must fold to zero.
11758         if (SubVec.hasOneUse())
11759           ++NumFreezeUndef;
11760         else
11761           ++NumZero;
11762     }
11763     else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11764       ++NumZero;
11765     else {
11766       assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11767       NonZeros |= 1 << i;
11768       ++NumNonZero;
11769     }
11770   }
11771
11772   // If we have more than 2 non-zeros, build each half separately.
11773   if (NumNonZero > 2) {
11774     MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11775     ArrayRef<SDUse> Ops = Op->ops();
11776     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11777                              Ops.slice(0, NumOperands/2));
11778     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11779                              Ops.slice(NumOperands/2));
11780     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11781   }
11782
11783   // Otherwise, build it up through insert_subvectors.
11784   SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
11785                         : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
11786                                           : DAG.getUNDEF(ResVT));
11787
11788   MVT SubVT = Op.getOperand(0).getSimpleValueType();
11789   unsigned NumSubElems = SubVT.getVectorNumElements();
11790   for (unsigned i = 0; i != NumOperands; ++i) {
11791     if ((NonZeros & (1 << i)) == 0)
11792       continue;
11793
11794     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
11795                       Op.getOperand(i),
11796                       DAG.getIntPtrConstant(i * NumSubElems, dl));
11797   }
11798
11799   return Vec;
11800 }
11801
// Lower a CONCAT_VECTORS node that produces a vXi1 mask vector. Depending on
// which operands are undef, all-zero or non-zero, this emits a KSHIFTL, an
// INSERT_SUBVECTOR, a split into half-width concats, or keeps the node as is.
11805 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
11806 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11807                                        const X86Subtarget &Subtarget,
11808                                        SelectionDAG & DAG) {
11809   SDLoc dl(Op);
11810   MVT ResVT = Op.getSimpleValueType();
11811   unsigned NumOperands = Op.getNumOperands();
11812
11813   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11814          "Unexpected number of operands in CONCAT_VECTORS");
11815
11816   uint64_t Zeros = 0;
11817   uint64_t NonZeros = 0;
11818   for (unsigned i = 0; i != NumOperands; ++i) {
11819     SDValue SubVec = Op.getOperand(i);
11820     if (SubVec.isUndef())
11821       continue;
11822     assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11823     if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11824       Zeros |= (uint64_t)1 << i;
11825     else
11826       NonZeros |= (uint64_t)1 << i;
11827   }
11828
11829   unsigned NumElems = ResVT.getVectorNumElements();
11830
11831   // If we are inserting non-zero vector and there are zeros in LSBs and undef
11832   // in the MSBs we need to emit a KSHIFTL. The generic lowering to
11833   // insert_subvector will give us two kshifts.
11834   if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11835       Log2_64(NonZeros) != NumOperands - 1) {
11836     MVT ShiftVT = ResVT;
11837     if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11838       ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11839     unsigned Idx = Log2_64(NonZeros);
11840     SDValue SubVec = Op.getOperand(Idx);
11841     unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11842     SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11843                          DAG.getUNDEF(ShiftVT), SubVec,
11844                          DAG.getIntPtrConstant(0, dl));
11845     Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11846                      DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11847     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11848                        DAG.getIntPtrConstant(0, dl));
11849   }
11850
11851   // If there are zero or one non-zeros we can handle this very simply.
11852   if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11853     SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11854     if (!NonZeros)
11855       return Vec;
11856     unsigned Idx = Log2_64(NonZeros);
11857     SDValue SubVec = Op.getOperand(Idx);
11858     unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11859     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11860                        DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11861   }
11862
11863   if (NumOperands > 2) {
11864     MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11865     ArrayRef<SDUse> Ops = Op->ops();
11866     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11867                              Ops.slice(0, NumOperands/2));
11868     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11869                              Ops.slice(NumOperands/2));
11870     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11871   }
11872
11873   assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
11874
11875   if (ResVT.getVectorNumElements() >= 16)
11876     return Op; // The operation is legal with KUNPCK
11877
11878   SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11879                             DAG.getUNDEF(ResVT), Op.getOperand(0),
11880                             DAG.getIntPtrConstant(0, dl));
11881   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11882                      DAG.getIntPtrConstant(NumElems/2, dl));
11883 }
11884
11885 static SDValue LowerCONCAT_VECTORS(SDValue Op,
11886                                    const X86Subtarget &Subtarget,
11887                                    SelectionDAG &DAG) {
11888   MVT VT = Op.getSimpleValueType();
11889   if (VT.getVectorElementType() == MVT::i1)
11890     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11891
11892   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11893          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11894           Op.getNumOperands() == 4)));
11895
11896   // AVX can use the vinsertf128 instruction to create 256-bit vectors
11897   // from two other 128-bit ones.
11898
11899   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
11900   return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11901 }
11902
11903 //===----------------------------------------------------------------------===//
11904 // Vector shuffle lowering
11905 //
11906 // This is an experimental code path for lowering vector shuffles on x86. It is
11907 // designed to handle arbitrary vector shuffles and blends, gracefully
11908 // degrading performance as necessary. It works hard to recognize idiomatic
11909 // shuffles and lower them to optimal instruction patterns without leaving
11910 // a framework that allows reasonably efficient handling of all vector shuffle
11911 // patterns.
11912 //===----------------------------------------------------------------------===//
11913
11914 /// Tiny helper function to identify a no-op mask.
11915 ///
11916 /// This is a somewhat boring predicate function. It checks whether the mask
11917 /// array input, which is assumed to be a single-input shuffle mask of the kind
11918 /// used by the X86 shuffle instructions (not a fully general
11919 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
11920 /// in-place shuffle are 'no-op's.
11921 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11922   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11923     assert(Mask[i] >= -1 && "Out of bound mask element!");
11924     if (Mask[i] >= 0 && Mask[i] != i)
11925       return false;
11926   }
11927   return true;
11928 }
11929
11930 /// Test whether there are elements crossing LaneSizeInBits lanes in this
11931 /// shuffle mask.
11932 ///
11933 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11934 /// and we routinely test for these.
11935 static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11936                                       unsigned ScalarSizeInBits,
11937                                       ArrayRef<int> Mask) {
11938   assert(LaneSizeInBits && ScalarSizeInBits &&
11939          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11940          "Illegal shuffle lane size");
11941   int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11942   int Size = Mask.size();
11943   for (int i = 0; i < Size; ++i)
11944     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11945       return true;
11946   return false;
11947 }
11948
11949 /// Test whether there are elements crossing 128-bit lanes in this
11950 /// shuffle mask.
11951 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11952   return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11953 }
11954
11955 /// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11956 /// from multiple lanes - this is different to isLaneCrossingShuffleMask to
11957 /// better support 'repeated mask + lane permute' style shuffles.
11958 static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11959                                    unsigned ScalarSizeInBits,
11960                                    ArrayRef<int> Mask) {
11961   assert(LaneSizeInBits && ScalarSizeInBits &&
11962          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11963          "Illegal shuffle lane size");
11964   int NumElts = Mask.size();
11965   int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11966   int NumLanes = NumElts / NumEltsPerLane;
11967   if (NumLanes > 1) {
11968     for (int i = 0; i != NumLanes; ++i) {
11969       int SrcLane = -1;
11970       for (int j = 0; j != NumEltsPerLane; ++j) {
11971         int M = Mask[(i * NumEltsPerLane) + j];
11972         if (M < 0)
11973           continue;
11974         int Lane = (M % NumElts) / NumEltsPerLane;
11975         if (SrcLane >= 0 && SrcLane != Lane)
11976           return true;
11977         SrcLane = Lane;
11978       }
11979     }
11980   }
11981   return false;
11982 }
11983
11984 /// Test whether a shuffle mask is equivalent within each sub-lane.
11985 ///
11986 /// This checks a shuffle mask to see if it is performing the same
11987 /// lane-relative shuffle in each sub-lane. This trivially implies
11988 /// that it is also not lane-crossing. It may however involve a blend from the
11989 /// same lane of a second vector.
11990 ///
11991 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11992 /// non-trivial to compute in the face of undef lanes. The representation is
11993 /// suitable for use with existing 128-bit shuffles as entries from the second
11994 /// vector have been remapped to [LaneSize, 2*LaneSize).
11995 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11996                                   ArrayRef<int> Mask,
11997                                   SmallVectorImpl<int> &RepeatedMask) {
11998   auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11999   RepeatedMask.assign(LaneSize, -1);
12000   int Size = Mask.size();
12001   for (int i = 0; i < Size; ++i) {
12002     assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
12003     if (Mask[i] < 0)
12004       continue;
12005     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
12006       // This entry crosses lanes, so there is no way to model this shuffle.
12007       return false;
12008
12009     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
12010     // Adjust second vector indices to start at LaneSize instead of Size.
12011     int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
12012                                 : Mask[i] % LaneSize + LaneSize;
12013     if (RepeatedMask[i % LaneSize] < 0)
12014       // This is the first non-undef entry in this slot of a 128-bit lane.
12015       RepeatedMask[i % LaneSize] = LocalM;
12016     else if (RepeatedMask[i % LaneSize] != LocalM)
12017       // Found a mismatch with the repeated mask.
12018       return false;
12019   }
12020   return true;
12021 }
12022
12023 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
12024 static bool
12025 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
12026                                 SmallVectorImpl<int> &RepeatedMask) {
12027   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
12028 }
12029
12030 static bool
12031 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
12032   SmallVector<int, 32> RepeatedMask;
12033   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
12034 }
12035
12036 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
12037 static bool
12038 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
12039                                 SmallVectorImpl<int> &RepeatedMask) {
12040   return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
12041 }
12042
12043 /// Test whether a target shuffle mask is equivalent within each sub-lane.
12044 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
12045 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
12046                                         unsigned EltSizeInBits,
12047                                         ArrayRef<int> Mask,
12048                                         SmallVectorImpl<int> &RepeatedMask) {
12049   int LaneSize = LaneSizeInBits / EltSizeInBits;
12050   RepeatedMask.assign(LaneSize, SM_SentinelUndef);
12051   int Size = Mask.size();
12052   for (int i = 0; i < Size; ++i) {
12053     assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
12054     if (Mask[i] == SM_SentinelUndef)
12055       continue;
12056     if (Mask[i] == SM_SentinelZero) {
12057       if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
12058         return false;
12059       RepeatedMask[i % LaneSize] = SM_SentinelZero;
12060       continue;
12061     }
12062     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
12063       // This entry crosses lanes, so there is no way to model this shuffle.
12064       return false;
12065
12066     // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
12067     // later vector indices to start at multiples of LaneSize instead of Size.
12068     int LaneM = Mask[i] / Size;
12069     int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
12070     if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
12071       // This is the first non-undef entry in this slot of a 128-bit lane.
12072       RepeatedMask[i % LaneSize] = LocalM;
12073     else if (RepeatedMask[i % LaneSize] != LocalM)
12074       // Found a mismatch with the repeated mask.
12075       return false;
12076   }
12077   return true;
12078 }
12079
12080 /// Test whether a target shuffle mask is equivalent within each sub-lane.
12081 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
12082 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
12083                                         ArrayRef<int> Mask,
12084                                         SmallVectorImpl<int> &RepeatedMask) {
12085   return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
12086                                      Mask, RepeatedMask);
12087 }
12088
12089 /// Checks whether the vector elements referenced by two shuffle masks are
12090 /// equivalent.
12091 static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
12092                                 int Idx, int ExpectedIdx) {
12093   assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
12094          ExpectedIdx < MaskSize && "Out of range element index");
12095   if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
12096     return false;
12097
12098   switch (Op.getOpcode()) {
12099   case ISD::BUILD_VECTOR:
12100     // If the values are build vectors, we can look through them to find
12101     // equivalent inputs that make the shuffles equivalent.
12102     // TODO: Handle MaskSize != Op.getNumOperands()?
12103     if (MaskSize == (int)Op.getNumOperands() &&
12104         MaskSize == (int)ExpectedOp.getNumOperands())
12105       return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
12106     break;
12107   case X86ISD::VBROADCAST:
12108   case X86ISD::VBROADCAST_LOAD:
12109     // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
12110     return (Op == ExpectedOp &&
12111             (int)Op.getValueType().getVectorNumElements() == MaskSize);
12112   case X86ISD::HADD:
12113   case X86ISD::HSUB:
12114   case X86ISD::FHADD:
12115   case X86ISD::FHSUB:
12116   case X86ISD::PACKSS:
12117   case X86ISD::PACKUS:
12118     // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
12119     // TODO: Handle MaskSize != NumElts?
12120     // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
12121     if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
12122       MVT VT = Op.getSimpleValueType();
12123       int NumElts = VT.getVectorNumElements();
12124       if (MaskSize == NumElts) {
12125         int NumLanes = VT.getSizeInBits() / 128;
12126         int NumEltsPerLane = NumElts / NumLanes;
12127         int NumHalfEltsPerLane = NumEltsPerLane / 2;
12128         bool SameLane =
12129             (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
12130         bool SameElt =
12131             (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
12132         return SameLane && SameElt;
12133       }
12134     }
12135     break;
12136   }
12137
12138   return false;
12139 }
12140
12141 /// Checks whether a shuffle mask is equivalent to an explicit list of
12142 /// arguments.
12143 ///
12144 /// This is a fast way to test a shuffle mask against a fixed pattern:
12145 ///
///   if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
12147 ///
12148 /// It returns true if the mask is exactly as wide as the argument list, and
12149 /// each element of the mask is either -1 (signifying undef) or the value given
12150 /// in the argument.
12151 static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
12152                                 SDValue V1 = SDValue(),
12153                                 SDValue V2 = SDValue()) {
12154   int Size = Mask.size();
12155   if (Size != (int)ExpectedMask.size())
12156     return false;
12157
12158   for (int i = 0; i < Size; ++i) {
12159     assert(Mask[i] >= -1 && "Out of bound mask element!");
12160     int MaskIdx = Mask[i];
12161     int ExpectedIdx = ExpectedMask[i];
12162     if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
12163       SDValue MaskV = MaskIdx < Size ? V1 : V2;
12164       SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12165       MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12166       ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12167       if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12168         return false;
12169     }
12170   }
12171   return true;
12172 }
12173
12174 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
12175 ///
12176 /// The masks must be exactly the same width.
12177 ///
12178 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
12179 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
12180 ///
12181 /// SM_SentinelZero is accepted as a valid negative index but must match in
12182 /// both, or via a known bits test.
12183 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
12184                                       ArrayRef<int> ExpectedMask,
12185                                       const SelectionDAG &DAG,
12186                                       SDValue V1 = SDValue(),
12187                                       SDValue V2 = SDValue()) {
12188   int Size = Mask.size();
12189   if (Size != (int)ExpectedMask.size())
12190     return false;
12191   assert(llvm::all_of(ExpectedMask,
12192                       [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
12193          "Illegal target shuffle mask");
12194
12195   // Check for out-of-range target shuffle mask indices.
12196   if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
12197     return false;
12198
12199   // Don't use V1/V2 if they're not the same size as the shuffle mask type.
12200   if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
12201              !V1.getValueType().isVector()))
12202     V1 = SDValue();
12203   if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
12204              !V2.getValueType().isVector()))
12205     V2 = SDValue();
12206
12207   APInt ZeroV1 = APInt::getZero(Size);
12208   APInt ZeroV2 = APInt::getZero(Size);
12209
12210   for (int i = 0; i < Size; ++i) {
12211     int MaskIdx = Mask[i];
12212     int ExpectedIdx = ExpectedMask[i];
12213     if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
12214       continue;
12215     if (MaskIdx == SM_SentinelZero) {
12216       // If we need this expected index to be a zero element, then update the
12217       // relevant zero mask and perform the known bits at the end to minimize
12218       // repeated computes.
12219       SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12220       if (ExpectedV &&
12221           Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
12222         int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12223         APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
12224         ZeroMask.setBit(BitIdx);
12225         continue;
12226       }
12227     }
12228     if (MaskIdx >= 0) {
12229       SDValue MaskV = MaskIdx < Size ? V1 : V2;
12230       SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12231       MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12232       ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12233       if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12234         continue;
12235     }
12236     return false;
12237   }
12238   return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
12239          (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
12240 }
12241
12242 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
12243 // instructions.
12244 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
12245                                   const SelectionDAG &DAG) {
12246   if (VT != MVT::v8i32 && VT != MVT::v8f32)
12247     return false;
12248
12249   SmallVector<int, 8> Unpcklwd;
12250   createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
12251                           /* Unary = */ false);
12252   SmallVector<int, 8> Unpckhwd;
12253   createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
12254                           /* Unary = */ false);
12255   bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
12256                          isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
12257   return IsUnpackwdMask;
12258 }
12259
12260 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
12261                                       const SelectionDAG &DAG) {
12262   // Create 128-bit vector type based on mask size.
12263   MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
12264   MVT VT = MVT::getVectorVT(EltVT, Mask.size());
12265
12266   // We can't assume a canonical shuffle mask, so try the commuted version too.
12267   SmallVector<int, 4> CommutedMask(Mask);
12268   ShuffleVectorSDNode::commuteMask(CommutedMask);
12269
12270   // Match any of unary/binary or low/high.
12271   for (unsigned i = 0; i != 4; ++i) {
12272     SmallVector<int, 16> UnpackMask;
12273     createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
12274     if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
12275         isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
12276       return true;
12277   }
12278   return false;
12279 }
12280
12281 /// Return true if a shuffle mask chooses elements identically in its top and
12282 /// bottom halves. For example, any splat mask has the same top and bottom
12283 /// halves. If an element is undefined in only one half of the mask, the halves
12284 /// are not considered identical.
12285 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
12286   assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
12287   unsigned HalfSize = Mask.size() / 2;
12288   for (unsigned i = 0; i != HalfSize; ++i) {
12289     if (Mask[i] != Mask[i + HalfSize])
12290       return false;
12291   }
12292   return true;
12293 }
12294
12295 /// Get a 4-lane 8-bit shuffle immediate for a mask.
12296 ///
12297 /// This helper function produces an 8-bit shuffle immediate corresponding to
12298 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
12299 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
12300 /// example.
12301 ///
12302 /// NB: We rely heavily on "undef" masks preserving the input lane.
12303 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
12304   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
12305   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
12306   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
12307   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
12308   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
12309
12310   // If the mask only uses one non-undef element, then fully 'splat' it to
12311   // improve later broadcast matching.
12312   int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
12313   assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
12314
12315   int FirstElt = Mask[FirstIndex];
12316   if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
12317     return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
12318
12319   unsigned Imm = 0;
12320   Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
12321   Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
12322   Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
12323   Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
12324   return Imm;
12325 }
12326
12327 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
12328                                           SelectionDAG &DAG) {
12329   return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
12330 }
12331
// The shuffle result is as follows:
// 0*a[0]0*a[1]...0*a[n], n >= 0, where the a[] elements are in ascending
// order.
// Each element of Zeroable corresponds to a particular element of Mask,
// as described in the computeZeroableShuffleElements function.
//
// The function looks for a sub-mask whose nonzero elements are in
// increasing order. If such a sub-mask exists, the function returns true.
12339 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
12340                                      ArrayRef<int> Mask, const EVT &VectorType,
12341                                      bool &IsZeroSideLeft) {
12342   int NextElement = -1;
12343   // Check if the Mask's nonzero elements are in increasing order.
12344   for (int i = 0, e = Mask.size(); i < e; i++) {
12345     // Checks if the mask's zeros elements are built from only zeros.
12346     assert(Mask[i] >= -1 && "Out of bound mask element!");
12347     if (Mask[i] < 0)
12348       return false;
12349     if (Zeroable[i])
12350       continue;
12351     // Find the lowest non zero element
12352     if (NextElement < 0) {
12353       NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
12354       IsZeroSideLeft = NextElement != 0;
12355     }
12356     // Exit if the mask's non zero elements are not in increasing order.
12357     if (NextElement != Mask[i])
12358       return false;
12359     NextElement++;
12360   }
12361   return true;
12362 }
12363
12364 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
12365 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
12366                                       ArrayRef<int> Mask, SDValue V1,
12367                                       SDValue V2, const APInt &Zeroable,
12368                                       const X86Subtarget &Subtarget,
12369                                       SelectionDAG &DAG) {
12370   int Size = Mask.size();
12371   int LaneSize = 128 / VT.getScalarSizeInBits();
12372   const int NumBytes = VT.getSizeInBits() / 8;
12373   const int NumEltBytes = VT.getScalarSizeInBits() / 8;
12374
12375   assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
12376          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
12377          (Subtarget.hasBWI() && VT.is512BitVector()));
12378
12379   SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
12380   // Sign bit set in i8 mask means zero element.
12381   SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
12382
12383   SDValue V;
12384   for (int i = 0; i < NumBytes; ++i) {
12385     int M = Mask[i / NumEltBytes];
12386     if (M < 0) {
12387       PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
12388       continue;
12389     }
12390     if (Zeroable[i / NumEltBytes]) {
12391       PSHUFBMask[i] = ZeroMask;
12392       continue;
12393     }
12394
12395     // We can only use a single input of V1 or V2.
12396     SDValue SrcV = (M >= Size ? V2 : V1);
12397     if (V && V != SrcV)
12398       return SDValue();
12399     V = SrcV;
12400     M %= Size;
12401
12402     // PSHUFB can't cross lanes, ensure this doesn't happen.
12403     if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
12404       return SDValue();
12405
12406     M = M % LaneSize;
12407     M = M * NumEltBytes + (i % NumEltBytes);
12408     PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
12409   }
12410   assert(V && "Failed to find a source input");
12411
12412   MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
12413   return DAG.getBitcast(
12414       VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
12415                       DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
12416 }
12417
// Forward declaration: lowerShuffleToEXPAND below calls getMaskNode before its
// definition. The function is static, so the definition is in this same file.
12418 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
12419                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
12420                            const SDLoc &dl);
12421
12422 // X86 has dedicated shuffle that can be lowered to VEXPAND
12423 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
12424                                     const APInt &Zeroable,
12425                                     ArrayRef<int> Mask, SDValue &V1,
12426                                     SDValue &V2, SelectionDAG &DAG,
12427                                     const X86Subtarget &Subtarget) {
12428   bool IsLeftZeroSide = true;
12429   if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
12430                                 IsLeftZeroSide))
12431     return SDValue();
12432   unsigned VEXPANDMask = (~Zeroable).getZExtValue();
12433   MVT IntegerType =
12434       MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12435   SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
12436   unsigned NumElts = VT.getVectorNumElements();
12437   assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
12438          "Unexpected number of vector elements");
12439   SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
12440                               Subtarget, DAG, DL);
12441   SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
12442   SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
12443   return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
12444 }
12445
12446 static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
12447                                   unsigned &UnpackOpcode, bool IsUnary,
12448                                   ArrayRef<int> TargetMask, const SDLoc &DL,
12449                                   SelectionDAG &DAG,
12450                                   const X86Subtarget &Subtarget) {
12451   int NumElts = VT.getVectorNumElements();
12452
12453   bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
12454   for (int i = 0; i != NumElts; i += 2) {
12455     int M1 = TargetMask[i + 0];
12456     int M2 = TargetMask[i + 1];
12457     Undef1 &= (SM_SentinelUndef == M1);
12458     Undef2 &= (SM_SentinelUndef == M2);
12459     Zero1 &= isUndefOrZero(M1);
12460     Zero2 &= isUndefOrZero(M2);
12461   }
12462   assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
12463          "Zeroable shuffle detected");
12464
12465   // Attempt to match the target mask against the unpack lo/hi mask patterns.
12466   SmallVector<int, 64> Unpckl, Unpckh;
12467   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
12468   if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
12469                                 (IsUnary ? V1 : V2))) {
12470     UnpackOpcode = X86ISD::UNPCKL;
12471     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12472     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12473     return true;
12474   }
12475
12476   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
12477   if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
12478                                 (IsUnary ? V1 : V2))) {
12479     UnpackOpcode = X86ISD::UNPCKH;
12480     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12481     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12482     return true;
12483   }
12484
12485   // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
12486   if (IsUnary && (Zero1 || Zero2)) {
12487     // Don't bother if we can blend instead.
12488     if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
12489         isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
12490       return false;
12491
12492     bool MatchLo = true, MatchHi = true;
12493     for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
12494       int M = TargetMask[i];
12495
12496       // Ignore if the input is known to be zero or the index is undef.
12497       if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
12498           (M == SM_SentinelUndef))
12499         continue;
12500
12501       MatchLo &= (M == Unpckl[i]);
12502       MatchHi &= (M == Unpckh[i]);
12503     }
12504
12505     if (MatchLo || MatchHi) {
12506       UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12507       V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12508       V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12509       return true;
12510     }
12511   }
12512
12513   // If a binary shuffle, commute and try again.
12514   if (!IsUnary) {
12515     ShuffleVectorSDNode::commuteMask(Unpckl);
12516     if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
12517       UnpackOpcode = X86ISD::UNPCKL;
12518       std::swap(V1, V2);
12519       return true;
12520     }
12521
12522     ShuffleVectorSDNode::commuteMask(Unpckh);
12523     if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
12524       UnpackOpcode = X86ISD::UNPCKH;
12525       std::swap(V1, V2);
12526       return true;
12527     }
12528   }
12529
12530   return false;
12531 }
12532
12533 // X86 has dedicated unpack instructions that can handle specific blend
12534 // operations: UNPCKH and UNPCKL.
12535 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
12536                                      ArrayRef<int> Mask, SDValue V1, SDValue V2,
12537                                      SelectionDAG &DAG) {
12538   SmallVector<int, 8> Unpckl;
12539   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
12540   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12541     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
12542
12543   SmallVector<int, 8> Unpckh;
12544   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
12545   if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12546     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
12547
12548   // Commute and try again.
12549   ShuffleVectorSDNode::commuteMask(Unpckl);
12550   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12551     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
12552
12553   ShuffleVectorSDNode::commuteMask(Unpckh);
12554   if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12555     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
12556
12557   return SDValue();
12558 }
12559
12560 /// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
12561 /// followed by unpack 256-bit.
12562 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
12563                                         ArrayRef<int> Mask, SDValue V1,
12564                                         SDValue V2, SelectionDAG &DAG) {
12565   SmallVector<int, 32> Unpckl, Unpckh;
12566   createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
12567   createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
12568
12569   unsigned UnpackOpcode;
12570   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12571     UnpackOpcode = X86ISD::UNPCKL;
12572   else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12573     UnpackOpcode = X86ISD::UNPCKH;
12574   else
12575     return SDValue();
12576
12577   // This is a "natural" unpack operation (rather than the 128-bit sectored
12578   // operation implemented by AVX). We need to rearrange 64-bit chunks of the
12579   // input in order to use the x86 instruction.
12580   V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
12581                             DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
12582   V1 = DAG.getBitcast(VT, V1);
12583   return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
12584 }
12585
12586 // Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
12587 // source into the lower elements and zeroing the upper elements.
12588 static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
12589                                  ArrayRef<int> Mask, const APInt &Zeroable,
12590                                  const X86Subtarget &Subtarget) {
12591   if (!VT.is512BitVector() && !Subtarget.hasVLX())
12592     return false;
12593
12594   unsigned NumElts = Mask.size();
12595   unsigned EltSizeInBits = VT.getScalarSizeInBits();
12596   unsigned MaxScale = 64 / EltSizeInBits;
12597
12598   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12599     unsigned SrcEltBits = EltSizeInBits * Scale;
12600     if (SrcEltBits < 32 && !Subtarget.hasBWI())
12601       continue;
12602     unsigned NumSrcElts = NumElts / Scale;
12603     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
12604       continue;
12605     unsigned UpperElts = NumElts - NumSrcElts;
12606     if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12607       continue;
12608     SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
12609     SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
12610     DstVT = MVT::getIntegerVT(EltSizeInBits);
12611     if ((NumSrcElts * EltSizeInBits) >= 128) {
12612       // ISD::TRUNCATE
12613       DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
12614     } else {
12615       // X86ISD::VTRUNC
12616       DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
12617     }
12618     return true;
12619   }
12620
12621   return false;
12622 }
12623
12624 // Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
12625 // element padding to the final DstVT.
12626 static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
12627                                   const X86Subtarget &Subtarget,
12628                                   SelectionDAG &DAG, bool ZeroUppers) {
12629   MVT SrcVT = Src.getSimpleValueType();
12630   MVT DstSVT = DstVT.getScalarType();
12631   unsigned NumDstElts = DstVT.getVectorNumElements();
12632   unsigned NumSrcElts = SrcVT.getVectorNumElements();
12633   unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
12634
12635   if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
12636     return SDValue();
12637
12638   // Perform a direct ISD::TRUNCATE if possible.
12639   if (NumSrcElts == NumDstElts)
12640     return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
12641
12642   if (NumSrcElts > NumDstElts) {
12643     MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12644     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12645     return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
12646   }
12647
12648   if ((NumSrcElts * DstEltSizeInBits) >= 128) {
12649     MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12650     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12651     return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12652                           DstVT.getSizeInBits());
12653   }
12654
12655   // Non-VLX targets must truncate from a 512-bit type, so we need to
12656   // widen, truncate and then possibly extract the original subvector.
12657   if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
12658     SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
12659     return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
12660   }
12661
12662   // Fallback to a X86ISD::VTRUNC, padding if necessary.
12663   MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
12664   SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
12665   if (DstVT != TruncVT)
12666     Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12667                            DstVT.getSizeInBits());
12668   return Trunc;
12669 }
12670
12671 // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
12672 //
12673 // An example is the following:
12674 //
12675 // t0: ch = EntryToken
12676 //           t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
12677 //         t25: v4i32 = truncate t2
12678 //       t41: v8i16 = bitcast t25
12679 //       t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
12680 //       Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
12681 //     t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
12682 //   t18: v2i64 = bitcast t51
12683 //
12684 // One can just use a single vpmovdw instruction, without avx512vl we need to
12685 // use the zmm variant and extract the lower subvector, padding with zeroes.
12686 // TODO: Merge with lowerShuffleAsVTRUNC.
12687 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
12688                                      SDValue V2, ArrayRef<int> Mask,
12689                                      const APInt &Zeroable,
12690                                      const X86Subtarget &Subtarget,
12691                                      SelectionDAG &DAG) {
12692   assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
12693   if (!Subtarget.hasAVX512())
12694     return SDValue();
12695
12696   unsigned NumElts = VT.getVectorNumElements();
12697   unsigned EltSizeInBits = VT.getScalarSizeInBits();
12698   unsigned MaxScale = 64 / EltSizeInBits;
12699   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12700     unsigned SrcEltBits = EltSizeInBits * Scale;
12701     unsigned NumSrcElts = NumElts / Scale;
12702     unsigned UpperElts = NumElts - NumSrcElts;
12703     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12704         !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12705       continue;
12706
12707     // Attempt to find a matching source truncation, but as a fall back VLX
12708     // cases can use the VPMOV directly.
12709     SDValue Src = peekThroughBitcasts(V1);
12710     if (Src.getOpcode() == ISD::TRUNCATE &&
12711         Src.getScalarValueSizeInBits() == SrcEltBits) {
12712       Src = Src.getOperand(0);
12713     } else if (Subtarget.hasVLX()) {
12714       MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12715       MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12716       Src = DAG.getBitcast(SrcVT, Src);
12717       // Don't do this if PACKSS/PACKUS could perform it cheaper.
12718       if (Scale == 2 &&
12719           ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
12720            (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
12721         return SDValue();
12722     } else
12723       return SDValue();
12724
12725     // VPMOVWB is only available with avx512bw.
12726     if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
12727       return SDValue();
12728
12729     bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
12730     return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12731   }
12732
12733   return SDValue();
12734 }
12735
12736 // Attempt to match binary shuffle patterns as a truncate.
12737 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
12738                                     SDValue V2, ArrayRef<int> Mask,
12739                                     const APInt &Zeroable,
12740                                     const X86Subtarget &Subtarget,
12741                                     SelectionDAG &DAG) {
12742   assert((VT.is128BitVector() || VT.is256BitVector()) &&
12743          "Unexpected VTRUNC type");
12744   if (!Subtarget.hasAVX512())
12745     return SDValue();
12746
12747   unsigned NumElts = VT.getVectorNumElements();
12748   unsigned EltSizeInBits = VT.getScalarSizeInBits();
12749   unsigned MaxScale = 64 / EltSizeInBits;
12750   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12751     // TODO: Support non-BWI VPMOVWB truncations?
12752     unsigned SrcEltBits = EltSizeInBits * Scale;
12753     if (SrcEltBits < 32 && !Subtarget.hasBWI())
12754       continue;
12755
12756     // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
12757     // Bail if the V2 elements are undef.
12758     unsigned NumHalfSrcElts = NumElts / Scale;
12759     unsigned NumSrcElts = 2 * NumHalfSrcElts;
12760     for (unsigned Offset = 0; Offset != Scale; ++Offset) {
12761       if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
12762           isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
12763         continue;
12764
12765       // The elements beyond the truncation must be undef/zero.
12766       unsigned UpperElts = NumElts - NumSrcElts;
12767       if (UpperElts > 0 &&
12768           !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12769         continue;
12770       bool UndefUppers =
12771           UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
12772
12773       // For offset truncations, ensure that the concat is cheap.
12774       if (Offset) {
12775         auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
12776           if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12777               Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
12778             return Lo.getOperand(0) == Hi.getOperand(0);
12779           if (ISD::isNormalLoad(Lo.getNode()) &&
12780               ISD::isNormalLoad(Hi.getNode())) {
12781             auto *LDLo = cast<LoadSDNode>(Lo);
12782             auto *LDHi = cast<LoadSDNode>(Hi);
12783             return DAG.areNonVolatileConsecutiveLoads(
12784                 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
12785           }
12786           return false;
12787         };
12788         if (!IsCheapConcat(V1, V2))
12789           continue;
12790       }
12791
12792       // As we're using both sources then we need to concat them together
12793       // and truncate from the double-sized src.
12794       MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
12795       SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
12796
12797       MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12798       MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12799       Src = DAG.getBitcast(SrcVT, Src);
12800
12801       // Shift the offset'd elements into place for the truncation.
12802       // TODO: Use getTargetVShiftByConstNode.
12803       if (Offset)
12804         Src = DAG.getNode(
12805             X86ISD::VSRLI, DL, SrcVT, Src,
12806             DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
12807
12808       return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12809     }
12810   }
12811
12812   return SDValue();
12813 }
12814
12815 /// Check whether a compaction lowering can be done by dropping even/odd
12816 /// elements and compute how many times even/odd elements must be dropped.
12817 ///
12818 /// This handles shuffles which take every Nth element where N is a power of
12819 /// two. Example shuffle masks:
12820 ///
12821 /// (even)
12822 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
12823 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12824 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
12825 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
12826 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
12827 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
12828 ///
12829 /// (odd)
12830 ///  N = 1:  1,  3,  5,  7,  9, 11, 13, 15,  0,  2,  4,  6,  8, 10, 12, 14
12831 ///  N = 1:  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
12832 ///
12833 /// Any of these lanes can of course be undef.
12834 ///
12835 /// This routine only supports N <= 3.
12836 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12837 /// for larger N.
12838 ///
12839 /// \returns N above, or the number of times even/odd elements must be dropped
12840 /// if there is such a number. Otherwise returns zero.
12841 static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
12842                                       bool IsSingleInput) {
12843   // The modulus for the shuffle vector entries is based on whether this is
12844   // a single input or not.
12845   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12846   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12847          "We should only be called with masks with a power-of-2 size!");
12848
12849   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12850   int Offset = MatchEven ? 0 : 1;
12851
12852   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12853   // and 2^3 simultaneously. This is because we may have ambiguity with
12854   // partially undef inputs.
12855   bool ViableForN[3] = {true, true, true};
12856
12857   for (int i = 0, e = Mask.size(); i < e; ++i) {
12858     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12859     // want.
12860     if (Mask[i] < 0)
12861       continue;
12862
12863     bool IsAnyViable = false;
12864     for (unsigned j = 0; j != std::size(ViableForN); ++j)
12865       if (ViableForN[j]) {
12866         uint64_t N = j + 1;
12867
12868         // The shuffle mask must be equal to (i * 2^N) % M.
12869         if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
12870           IsAnyViable = true;
12871         else
12872           ViableForN[j] = false;
12873       }
12874     // Early exit if we exhaust the possible powers of two.
12875     if (!IsAnyViable)
12876       break;
12877   }
12878
12879   for (unsigned j = 0; j != std::size(ViableForN); ++j)
12880     if (ViableForN[j])
12881       return j + 1;
12882
12883   // Return 0 as there is no viable power of two.
12884   return 0;
12885 }
12886
12887 // X86 has dedicated pack instructions that can handle specific truncation
12888 // operations: PACKSS and PACKUS.
12889 // Checks for compaction shuffle masks if MaxStages > 1.
12890 // TODO: Add support for matching multiple PACKSS/PACKUS stages.
12891 static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12892                                  unsigned &PackOpcode, ArrayRef<int> TargetMask,
12893                                  const SelectionDAG &DAG,
12894                                  const X86Subtarget &Subtarget,
12895                                  unsigned MaxStages = 1) {
12896   unsigned NumElts = VT.getVectorNumElements();
12897   unsigned BitSize = VT.getScalarSizeInBits();
12898   assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12899          "Illegal maximum compaction");
12900
12901   auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12902     unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12903     unsigned NumPackedBits = NumSrcBits - BitSize;
12904     N1 = peekThroughBitcasts(N1);
12905     N2 = peekThroughBitcasts(N2);
12906     unsigned NumBits1 = N1.getScalarValueSizeInBits();
12907     unsigned NumBits2 = N2.getScalarValueSizeInBits();
12908     bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12909     bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12910     if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12911         (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12912       return false;
12913     if (Subtarget.hasSSE41() || BitSize == 8) {
12914       APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12915       if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12916           (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12917         V1 = N1;
12918         V2 = N2;
12919         SrcVT = PackVT;
12920         PackOpcode = X86ISD::PACKUS;
12921         return true;
12922       }
12923     }
12924     bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12925     bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12926     if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12927          DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12928         (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12929          DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12930       V1 = N1;
12931       V2 = N2;
12932       SrcVT = PackVT;
12933       PackOpcode = X86ISD::PACKSS;
12934       return true;
12935     }
12936     return false;
12937   };
12938
12939   // Attempt to match against wider and wider compaction patterns.
12940   for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12941     MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12942     MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12943
12944     // Try binary shuffle.
12945     SmallVector<int, 32> BinaryMask;
12946     createPackShuffleMask(VT, BinaryMask, false, NumStages);
12947     if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
12948       if (MatchPACK(V1, V2, PackVT))
12949         return true;
12950
12951     // Try unary shuffle.
12952     SmallVector<int, 32> UnaryMask;
12953     createPackShuffleMask(VT, UnaryMask, true, NumStages);
12954     if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
12955       if (MatchPACK(V1, V1, PackVT))
12956         return true;
12957   }
12958
12959   return false;
12960 }
12961
12962 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12963                                     SDValue V1, SDValue V2, SelectionDAG &DAG,
12964                                     const X86Subtarget &Subtarget) {
12965   MVT PackVT;
12966   unsigned PackOpcode;
12967   unsigned SizeBits = VT.getSizeInBits();
12968   unsigned EltBits = VT.getScalarSizeInBits();
12969   unsigned MaxStages = Log2_32(64 / EltBits);
12970   if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12971                             Subtarget, MaxStages))
12972     return SDValue();
12973
12974   unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12975   unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12976
12977   // Don't lower multi-stage packs on AVX512, truncation is better.
12978   if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12979     return SDValue();
12980
12981   // Pack to the largest type possible:
12982   // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12983   unsigned MaxPackBits = 16;
12984   if (CurrentEltBits > 16 &&
12985       (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12986     MaxPackBits = 32;
12987
12988   // Repeatedly pack down to the target size.
12989   SDValue Res;
12990   for (unsigned i = 0; i != NumStages; ++i) {
12991     unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12992     unsigned NumSrcElts = SizeBits / SrcEltBits;
12993     MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12994     MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12995     MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12996     MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12997     Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12998                       DAG.getBitcast(SrcVT, V2));
12999     V1 = V2 = Res;
13000     CurrentEltBits /= 2;
13001   }
13002   assert(Res && Res.getValueType() == VT &&
13003          "Failed to lower compaction shuffle");
13004   return Res;
13005 }
13006
13007 /// Try to emit a bitmask instruction for a shuffle.
13008 ///
13009 /// This handles cases where we can model a blend exactly as a bitmask due to
13010 /// one of the inputs being zeroable.
13011 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
13012                                      SDValue V2, ArrayRef<int> Mask,
13013                                      const APInt &Zeroable,
13014                                      const X86Subtarget &Subtarget,
13015                                      SelectionDAG &DAG) {
13016   MVT MaskVT = VT;
13017   MVT EltVT = VT.getVectorElementType();
13018   SDValue Zero, AllOnes;
13019   // Use f64 if i64 isn't legal.
13020   if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
13021     EltVT = MVT::f64;
13022     MaskVT = MVT::getVectorVT(EltVT, Mask.size());
13023   }
13024
13025   MVT LogicVT = VT;
13026   if (EltVT == MVT::f32 || EltVT == MVT::f64) {
13027     Zero = DAG.getConstantFP(0.0, DL, EltVT);
13028     APFloat AllOnesValue =
13029         APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
13030     AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
13031     LogicVT =
13032         MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
13033   } else {
13034     Zero = DAG.getConstant(0, DL, EltVT);
13035     AllOnes = DAG.getAllOnesConstant(DL, EltVT);
13036   }
13037
13038   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
13039   SDValue V;
13040   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13041     if (Zeroable[i])
13042       continue;
13043     if (Mask[i] % Size != i)
13044       return SDValue(); // Not a blend.
13045     if (!V)
13046       V = Mask[i] < Size ? V1 : V2;
13047     else if (V != (Mask[i] < Size ? V1 : V2))
13048       return SDValue(); // Can only let one input through the mask.
13049
13050     VMaskOps[i] = AllOnes;
13051   }
13052   if (!V)
13053     return SDValue(); // No non-zeroable elements!
13054
13055   SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
13056   VMask = DAG.getBitcast(LogicVT, VMask);
13057   V = DAG.getBitcast(LogicVT, V);
13058   SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
13059   return DAG.getBitcast(VT, And);
13060 }
13061
13062 /// Try to emit a blend instruction for a shuffle using bit math.
13063 ///
13064 /// This is used as a fallback approach when first class blend instructions are
13065 /// unavailable. Currently it is only suitable for integer vectors, but could
13066 /// be generalized for floating point vectors if desirable.
13067 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
13068                                       SDValue V2, ArrayRef<int> Mask,
13069                                       SelectionDAG &DAG) {
13070   assert(VT.isInteger() && "Only supports integer vector types!");
13071   MVT EltVT = VT.getVectorElementType();
13072   SDValue Zero = DAG.getConstant(0, DL, EltVT);
13073   SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
13074   SmallVector<SDValue, 16> MaskOps;
13075   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13076     if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
13077       return SDValue(); // Shuffled input!
13078     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
13079   }
13080
13081   SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
13082   return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
13083 }
13084
// Forward declaration. The function is static, so its definition is in this
// same file.
13085 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
13086                                     SDValue PreservedSrc,
13087                                     const X86Subtarget &Subtarget,
13088                                     SelectionDAG &DAG);
13089
13090 static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
13091                                 MutableArrayRef<int> Mask,
13092                                 const APInt &Zeroable, bool &ForceV1Zero,
13093                                 bool &ForceV2Zero, uint64_t &BlendMask) {
13094   bool V1IsZeroOrUndef =
13095       V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
13096   bool V2IsZeroOrUndef =
13097       V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
13098
13099   BlendMask = 0;
13100   ForceV1Zero = false, ForceV2Zero = false;
13101   assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
13102
13103   int NumElts = Mask.size();
13104   int NumLanes = VT.getSizeInBits() / 128;
13105   int NumEltsPerLane = NumElts / NumLanes;
13106   assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
13107
13108   // For 32/64-bit elements, if we only reference one input (plus any undefs),
13109   // then ensure the blend mask part for that lane just references that input.
13110   bool ForceWholeLaneMasks =
13111       VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
13112
13113   // Attempt to generate the binary blend mask. If an input is zero then
13114   // we can use any lane.
13115   for (int Lane = 0; Lane != NumLanes; ++Lane) {
13116     // Keep track of the inputs used per lane.
13117     bool LaneV1InUse = false;
13118     bool LaneV2InUse = false;
13119     uint64_t LaneBlendMask = 0;
13120     for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
13121       int Elt = (Lane * NumEltsPerLane) + LaneElt;
13122       int M = Mask[Elt];
13123       if (M == SM_SentinelUndef)
13124         continue;
13125       if (M == Elt || (0 <= M && M < NumElts &&
13126                      IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
13127         Mask[Elt] = Elt;
13128         LaneV1InUse = true;
13129         continue;
13130       }
13131       if (M == (Elt + NumElts) ||
13132           (NumElts <= M &&
13133            IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
13134         LaneBlendMask |= 1ull << LaneElt;
13135         Mask[Elt] = Elt + NumElts;
13136         LaneV2InUse = true;
13137         continue;
13138       }
13139       if (Zeroable[Elt]) {
13140         if (V1IsZeroOrUndef) {
13141           ForceV1Zero = true;
13142           Mask[Elt] = Elt;
13143           LaneV1InUse = true;
13144           continue;
13145         }
13146         if (V2IsZeroOrUndef) {
13147           ForceV2Zero = true;
13148           LaneBlendMask |= 1ull << LaneElt;
13149           Mask[Elt] = Elt + NumElts;
13150           LaneV2InUse = true;
13151           continue;
13152         }
13153       }
13154       return false;
13155     }
13156
13157     // If we only used V2 then splat the lane blend mask to avoid any demanded
13158     // elts from V1 in this lane (the V1 equivalent is implicit with a zero
13159     // blend mask bit).
13160     if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
13161       LaneBlendMask = (1ull << NumEltsPerLane) - 1;
13162
13163     BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
13164   }
13165   return true;
13166 }
13167
/// Widen a blend bit mask by a factor of \p Scale.
///
/// Each of the low \p Size bits of \p BlendMask is expanded into \p Scale
/// consecutive bits of the result: bit i of the input controls result bits
/// [i * Scale, (i + 1) * Scale). Input bits at or above \p Size are ignored.
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
                                            int Scale) {
  uint64_t Scaled = 0;
  for (int Idx = 0; Idx != Size; ++Idx) {
    const bool IsSet = ((BlendMask >> Idx) & 1u) != 0;
    if (!IsSet)
      continue;
    const uint64_t EltBits = (1ull << Scale) - 1;
    Scaled |= EltBits << (Idx * Scale);
  }
  return Scaled;
}
13176
13177 /// Try to emit a blend instruction for a shuffle.
13178 ///
13179 /// This doesn't do any checks for the availability of instructions for blending
13180 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
13181 /// be matched in the backend with the type given. What it does check for is
13182 /// that the shuffle mask is a blend, or convertible into a blend with zero.
13183 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
13184                                    SDValue V2, ArrayRef<int> Original,
13185                                    const APInt &Zeroable,
13186                                    const X86Subtarget &Subtarget,
13187                                    SelectionDAG &DAG) {
13188   uint64_t BlendMask = 0;
13189   bool ForceV1Zero = false, ForceV2Zero = false;
13190   SmallVector<int, 64> Mask(Original);
13191   if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
13192                            BlendMask))
13193     return SDValue();
13194
13195   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
13196   if (ForceV1Zero)
13197     V1 = getZeroVector(VT, Subtarget, DAG, DL);
13198   if (ForceV2Zero)
13199     V2 = getZeroVector(VT, Subtarget, DAG, DL);
13200
13201   unsigned NumElts = VT.getVectorNumElements();
13202
13203   switch (VT.SimpleTy) {
13204   case MVT::v4i64:
13205   case MVT::v8i32:
13206     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
13207     [[fallthrough]];
13208   case MVT::v4f64:
13209   case MVT::v8f32:
13210     assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
13211     [[fallthrough]];
13212   case MVT::v2f64:
13213   case MVT::v2i64:
13214   case MVT::v4f32:
13215   case MVT::v4i32:
13216   case MVT::v8i16:
13217     assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
13218     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
13219                        DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13220   case MVT::v16i16: {
13221     assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
13222     SmallVector<int, 8> RepeatedMask;
13223     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13224       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
13225       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
13226       BlendMask = 0;
13227       for (int i = 0; i < 8; ++i)
13228         if (RepeatedMask[i] >= 8)
13229           BlendMask |= 1ull << i;
13230       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13231                          DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13232     }
13233     // Use PBLENDW for lower/upper lanes and then blend lanes.
13234     // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
13235     // merge to VSELECT where useful.
13236     uint64_t LoMask = BlendMask & 0xFF;
13237     uint64_t HiMask = (BlendMask >> 8) & 0xFF;
13238     if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
13239       SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13240                                DAG.getTargetConstant(LoMask, DL, MVT::i8));
13241       SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13242                                DAG.getTargetConstant(HiMask, DL, MVT::i8));
13243       return DAG.getVectorShuffle(
13244           MVT::v16i16, DL, Lo, Hi,
13245           {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
13246     }
13247     [[fallthrough]];
13248   }
13249   case MVT::v32i8:
13250     assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
13251     [[fallthrough]];
13252   case MVT::v16i8: {
13253     assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
13254
13255     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
13256     if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13257                                                Subtarget, DAG))
13258       return Masked;
13259
13260     if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
13261       MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13262       SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13263       return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13264     }
13265
13266     // If we have VPTERNLOG, we can use that as a bit blend.
13267     if (Subtarget.hasVLX())
13268       if (SDValue BitBlend =
13269               lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13270         return BitBlend;
13271
13272     // Scale the blend by the number of bytes per element.
13273     int Scale = VT.getScalarSizeInBits() / 8;
13274
13275     // This form of blend is always done on bytes. Compute the byte vector
13276     // type.
13277     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13278
13279     // x86 allows load folding with blendvb from the 2nd source operand. But
13280     // we are still using LLVM select here (see comment below), so that's V1.
13281     // If V2 can be load-folded and V1 cannot be load-folded, then commute to
13282     // allow that load-folding possibility.
13283     if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
13284       ShuffleVectorSDNode::commuteMask(Mask);
13285       std::swap(V1, V2);
13286     }
13287
13288     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
13289     // mix of LLVM's code generator and the x86 backend. We tell the code
13290     // generator that boolean values in the elements of an x86 vector register
13291     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
13292     // mapping a select to operand #1, and 'false' mapping to operand #2. The
13293     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
13294     // of the element (the remaining are ignored) and 0 in that high bit would
13295     // mean operand #1 while 1 in the high bit would mean operand #2. So while
13296     // the LLVM model for boolean values in vector elements gets the relevant
13297     // bit set, it is set backwards and over constrained relative to x86's
13298     // actual model.
13299     SmallVector<SDValue, 32> VSELECTMask;
13300     for (int i = 0, Size = Mask.size(); i < Size; ++i)
13301       for (int j = 0; j < Scale; ++j)
13302         VSELECTMask.push_back(
13303             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
13304                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
13305                                           MVT::i8));
13306
13307     V1 = DAG.getBitcast(BlendVT, V1);
13308     V2 = DAG.getBitcast(BlendVT, V2);
13309     return DAG.getBitcast(
13310         VT,
13311         DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
13312                       V1, V2));
13313   }
13314   case MVT::v16f32:
13315   case MVT::v8f64:
13316   case MVT::v8i64:
13317   case MVT::v16i32:
13318   case MVT::v32i16:
13319   case MVT::v64i8: {
13320     // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
13321     bool OptForSize = DAG.shouldOptForSize();
13322     if (!OptForSize) {
13323       if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13324                                                  Subtarget, DAG))
13325         return Masked;
13326     }
13327
13328     // Otherwise load an immediate into a GPR, cast to k-register, and use a
13329     // masked move.
13330     MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13331     SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13332     return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13333   }
13334   default:
13335     llvm_unreachable("Not a supported integer vector type!");
13336   }
13337 }
13338
13339 /// Try to lower as a blend of elements from two inputs followed by
13340 /// a single-input permutation.
13341 ///
13342 /// This matches the pattern where we can blend elements from two inputs and
13343 /// then reduce the shuffle to a single-input permutation.
13344 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
13345                                              SDValue V1, SDValue V2,
13346                                              ArrayRef<int> Mask,
13347                                              SelectionDAG &DAG,
13348                                              bool ImmBlends = false) {
13349   // We build up the blend mask while checking whether a blend is a viable way
13350   // to reduce the shuffle.
13351   SmallVector<int, 32> BlendMask(Mask.size(), -1);
13352   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
13353
13354   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13355     if (Mask[i] < 0)
13356       continue;
13357
13358     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
13359
13360     if (BlendMask[Mask[i] % Size] < 0)
13361       BlendMask[Mask[i] % Size] = Mask[i];
13362     else if (BlendMask[Mask[i] % Size] != Mask[i])
13363       return SDValue(); // Can't blend in the needed input!
13364
13365     PermuteMask[i] = Mask[i] % Size;
13366   }
13367
13368   // If only immediate blends, then bail if the blend mask can't be widened to
13369   // i16.
13370   unsigned EltSize = VT.getScalarSizeInBits();
13371   if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
13372     return SDValue();
13373
13374   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
13375   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
13376 }
13377
13378 /// Try to lower as an unpack of elements from two inputs followed by
13379 /// a single-input permutation.
13380 ///
13381 /// This matches the pattern where we can unpack elements from two inputs and
13382 /// then reduce the shuffle to a single-input (wider) permutation.
13383 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
13384                                              SDValue V1, SDValue V2,
13385                                              ArrayRef<int> Mask,
13386                                              SelectionDAG &DAG) {
13387   int NumElts = Mask.size();
13388   int NumLanes = VT.getSizeInBits() / 128;
13389   int NumLaneElts = NumElts / NumLanes;
13390   int NumHalfLaneElts = NumLaneElts / 2;
13391
13392   bool MatchLo = true, MatchHi = true;
13393   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13394
13395   // Determine UNPCKL/UNPCKH type and operand order.
13396   for (int Elt = 0; Elt != NumElts; ++Elt) {
13397     int M = Mask[Elt];
13398     if (M < 0)
13399       continue;
13400
13401     // Normalize the mask value depending on whether it's V1 or V2.
13402     int NormM = M;
13403     SDValue &Op = Ops[Elt & 1];
13404     if (M < NumElts && (Op.isUndef() || Op == V1))
13405       Op = V1;
13406     else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
13407       Op = V2;
13408       NormM -= NumElts;
13409     } else
13410       return SDValue();
13411
13412     bool MatchLoAnyLane = false, MatchHiAnyLane = false;
13413     for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
13414       int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
13415       MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
13416       MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
13417       if (MatchLoAnyLane || MatchHiAnyLane) {
13418         assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
13419                "Failed to match UNPCKLO/UNPCKHI");
13420         break;
13421       }
13422     }
13423     MatchLo &= MatchLoAnyLane;
13424     MatchHi &= MatchHiAnyLane;
13425     if (!MatchLo && !MatchHi)
13426       return SDValue();
13427   }
13428   assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
13429
13430   // Element indices have changed after unpacking. Calculate permute mask
13431   // so that they will be put back to the position as dictated by the
13432   // original shuffle mask indices.
13433   SmallVector<int, 32> PermuteMask(NumElts, -1);
13434   for (int Elt = 0; Elt != NumElts; ++Elt) {
13435     int M = Mask[Elt];
13436     if (M < 0)
13437       continue;
13438     int NormM = M;
13439     if (NumElts <= M)
13440       NormM -= NumElts;
13441     bool IsFirstOp = M < NumElts;
13442     int BaseMaskElt =
13443         NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
13444     if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
13445       PermuteMask[Elt] = BaseMaskElt;
13446     else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
13447       PermuteMask[Elt] = BaseMaskElt + 1;
13448     assert(PermuteMask[Elt] != -1 &&
13449            "Input mask element is defined but failed to assign permute mask");
13450   }
13451
13452   unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
13453   SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
13454   return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
13455 }
13456
13457 /// Try to lower a shuffle as a permute of the inputs followed by an
13458 /// UNPCK instruction.
13459 ///
13460 /// This specifically targets cases where we end up with alternating between
13461 /// the two inputs, and so can permute them into something that feeds a single
13462 /// UNPCK instruction. Note that this routine only targets integer vectors
13463 /// because for floating point vectors we have a generalized SHUFPS lowering
13464 /// strategy that handles everything that doesn't *exactly* match an unpack,
13465 /// making this clever lowering unnecessary.
13466 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
13467                                               SDValue V1, SDValue V2,
13468                                               ArrayRef<int> Mask,
13469                                               const X86Subtarget &Subtarget,
13470                                               SelectionDAG &DAG) {
13471   int Size = Mask.size();
13472   assert(Mask.size() >= 2 && "Single element masks are invalid.");
13473
13474   // This routine only supports 128-bit integer dual input vectors.
13475   if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
13476     return SDValue();
13477
13478   int NumLoInputs =
13479       count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13480   int NumHiInputs =
13481       count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13482
13483   bool UnpackLo = NumLoInputs >= NumHiInputs;
13484
13485   auto TryUnpack = [&](int ScalarSize, int Scale) {
13486     SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13487     SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13488
13489     for (int i = 0; i < Size; ++i) {
13490       if (Mask[i] < 0)
13491         continue;
13492
13493       // Each element of the unpack contains Scale elements from this mask.
13494       int UnpackIdx = i / Scale;
13495
13496       // We only handle the case where V1 feeds the first slots of the unpack.
13497       // We rely on canonicalization to ensure this is the case.
13498       if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13499         return SDValue();
13500
13501       // Setup the mask for this input. The indexing is tricky as we have to
13502       // handle the unpack stride.
13503       SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13504       VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13505           Mask[i] % Size;
13506     }
13507
13508     // If we will have to shuffle both inputs to use the unpack, check whether
13509     // we can just unpack first and shuffle the result. If so, skip this unpack.
13510     if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13511         !isNoopShuffleMask(V2Mask))
13512       return SDValue();
13513
13514     // Shuffle the inputs into place.
13515     V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13516     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13517
13518     // Cast the inputs to the type we will use to unpack them.
13519     MVT UnpackVT =
13520         MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13521     V1 = DAG.getBitcast(UnpackVT, V1);
13522     V2 = DAG.getBitcast(UnpackVT, V2);
13523
13524     // Unpack the inputs and cast the result back to the desired type.
13525     return DAG.getBitcast(
13526         VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13527                         UnpackVT, V1, V2));
13528   };
13529
13530   // We try each unpack from the largest to the smallest to try and find one
13531   // that fits this mask.
13532   int OrigScalarSize = VT.getScalarSizeInBits();
13533   for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
13534     if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
13535       return Unpack;
13536
13537   // If we're shuffling with a zero vector then we're better off not doing
13538   // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
13539   if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
13540       ISD::isBuildVectorAllZeros(V2.getNode()))
13541     return SDValue();
13542
13543   // If none of the unpack-rooted lowerings worked (or were profitable) try an
13544   // initial unpack.
13545   if (NumLoInputs == 0 || NumHiInputs == 0) {
13546     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
13547            "We have to have *some* inputs!");
13548     int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
13549
13550     // FIXME: We could consider the total complexity of the permute of each
13551     // possible unpacking. Or at the least we should consider how many
13552     // half-crossings are created.
13553     // FIXME: We could consider commuting the unpacks.
13554
13555     SmallVector<int, 32> PermMask((unsigned)Size, -1);
13556     for (int i = 0; i < Size; ++i) {
13557       if (Mask[i] < 0)
13558         continue;
13559
13560       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
13561
13562       PermMask[i] =
13563           2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
13564     }
13565     return DAG.getVectorShuffle(
13566         VT, DL,
13567         DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
13568                     V1, V2),
13569         DAG.getUNDEF(VT), PermMask);
13570   }
13571
13572   return SDValue();
13573 }
13574
13575 /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
13576 /// permuting the elements of the result in place.
13577 static SDValue lowerShuffleAsByteRotateAndPermute(
13578     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13579     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13580   if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
13581       (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
13582       (VT.is512BitVector() && !Subtarget.hasBWI()))
13583     return SDValue();
13584
13585   // We don't currently support lane crossing permutes.
13586   if (is128BitLaneCrossingShuffleMask(VT, Mask))
13587     return SDValue();
13588
13589   int Scale = VT.getScalarSizeInBits() / 8;
13590   int NumLanes = VT.getSizeInBits() / 128;
13591   int NumElts = VT.getVectorNumElements();
13592   int NumEltsPerLane = NumElts / NumLanes;
13593
13594   // Determine range of mask elts.
13595   bool Blend1 = true;
13596   bool Blend2 = true;
13597   std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
13598   std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
13599   for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13600     for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13601       int M = Mask[Lane + Elt];
13602       if (M < 0)
13603         continue;
13604       if (M < NumElts) {
13605         Blend1 &= (M == (Lane + Elt));
13606         assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13607         M = M % NumEltsPerLane;
13608         Range1.first = std::min(Range1.first, M);
13609         Range1.second = std::max(Range1.second, M);
13610       } else {
13611         M -= NumElts;
13612         Blend2 &= (M == (Lane + Elt));
13613         assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13614         M = M % NumEltsPerLane;
13615         Range2.first = std::min(Range2.first, M);
13616         Range2.second = std::max(Range2.second, M);
13617       }
13618     }
13619   }
13620
13621   // Bail if we don't need both elements.
13622   // TODO - it might be worth doing this for unary shuffles if the permute
13623   // can be widened.
13624   if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
13625       !(0 <= Range2.first && Range2.second < NumEltsPerLane))
13626     return SDValue();
13627
13628   if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
13629     return SDValue();
13630
13631   // Rotate the 2 ops so we can access both ranges, then permute the result.
13632   auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
13633     MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13634     SDValue Rotate = DAG.getBitcast(
13635         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
13636                         DAG.getBitcast(ByteVT, Lo),
13637                         DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
13638     SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
13639     for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13640       for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13641         int M = Mask[Lane + Elt];
13642         if (M < 0)
13643           continue;
13644         if (M < NumElts)
13645           PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
13646         else
13647           PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
13648       }
13649     }
13650     return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
13651   };
13652
13653   // Check if the ranges are small enough to rotate from either direction.
13654   if (Range2.second < Range1.first)
13655     return RotateAndPermute(V1, V2, Range1.first, 0);
13656   if (Range1.second < Range2.first)
13657     return RotateAndPermute(V2, V1, Range2.first, NumElts);
13658   return SDValue();
13659 }
13660
13661 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
13662   return isUndefOrEqual(Mask, 0);
13663 }
13664
13665 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
13666   return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
13667 }
13668
13669 /// Check if the Mask consists of the same element repeated multiple times.
13670 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
13671   size_t NumUndefs = 0;
13672   std::optional<int> UniqueElt;
13673   for (int Elt : Mask) {
13674     if (Elt == SM_SentinelUndef) {
13675       NumUndefs++;
13676       continue;
13677     }
13678     if (UniqueElt.has_value() && UniqueElt.value() != Elt)
13679       return false;
13680     UniqueElt = Elt;
13681   }
13682   // Make sure the element is repeated enough times by checking the number of
13683   // undefs is small.
13684   return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
13685 }
13686
13687 /// Generic routine to decompose a shuffle and blend into independent
13688 /// blends and permutes.
13689 ///
13690 /// This matches the extremely common pattern for handling combined
13691 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
13692 /// operations. It will try to pick the best arrangement of shuffles and
13693 /// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
13694 static SDValue lowerShuffleAsDecomposedShuffleMerge(
13695     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13696     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13697   int NumElts = Mask.size();
13698   int NumLanes = VT.getSizeInBits() / 128;
13699   int NumEltsPerLane = NumElts / NumLanes;
13700
13701   // Shuffle the input elements into the desired positions in V1 and V2 and
13702   // unpack/blend them together.
13703   bool IsAlternating = true;
13704   SmallVector<int, 32> V1Mask(NumElts, -1);
13705   SmallVector<int, 32> V2Mask(NumElts, -1);
13706   SmallVector<int, 32> FinalMask(NumElts, -1);
13707   for (int i = 0; i < NumElts; ++i) {
13708     int M = Mask[i];
13709     if (M >= 0 && M < NumElts) {
13710       V1Mask[i] = M;
13711       FinalMask[i] = i;
13712       IsAlternating &= (i & 1) == 0;
13713     } else if (M >= NumElts) {
13714       V2Mask[i] = M - NumElts;
13715       FinalMask[i] = i + NumElts;
13716       IsAlternating &= (i & 1) == 1;
13717     }
13718   }
13719
13720   // If we effectively only demand the 0'th element of \p Input, and not only
13721   // as 0'th element, then broadcast said input,
13722   // and change \p InputMask to be a no-op (identity) mask.
13723   auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
13724                                          &DAG](SDValue &Input,
13725                                                MutableArrayRef<int> InputMask) {
13726     unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
13727     if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
13728                                  !X86::mayFoldLoad(Input, Subtarget)))
13729       return;
13730     if (isNoopShuffleMask(InputMask))
13731       return;
13732     assert(isBroadcastShuffleMask(InputMask) &&
13733            "Expected to demand only the 0'th element.");
13734     Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
13735     for (auto I : enumerate(InputMask)) {
13736       int &InputMaskElt = I.value();
13737       if (InputMaskElt >= 0)
13738         InputMaskElt = I.index();
13739     }
13740   };
13741
13742   // Currently, we may need to produce one shuffle per input, and blend results.
13743   // It is possible that the shuffle for one of the inputs is already a no-op.
13744   // See if we can simplify non-no-op shuffles into broadcasts,
13745   // which we consider to be strictly better than an arbitrary shuffle.
13746   if (isNoopOrBroadcastShuffleMask(V1Mask) &&
13747       isNoopOrBroadcastShuffleMask(V2Mask)) {
13748     canonicalizeBroadcastableInput(V1, V1Mask);
13749     canonicalizeBroadcastableInput(V2, V2Mask);
13750   }
13751
13752   // Try to lower with the simpler initial blend/unpack/rotate strategies unless
13753   // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
13754   // the shuffle may be able to fold with a load or other benefit. However, when
13755   // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
13756   // pre-shuffle first is a better strategy.
13757   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
13758     // Only prefer immediate blends to unpack/rotate.
13759     if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13760                                                           DAG, true))
13761       return BlendPerm;
13762     // If either input vector provides only a single element which is repeated
13763     // multiple times, unpacking from both input vectors would generate worse
13764     // code. e.g. for
13765     // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
13766     // it is better to process t4 first to create a vector of t4[0], then unpack
13767     // that vector with t2.
13768     if (!isSingleElementRepeatedMask(V1Mask) &&
13769         !isSingleElementRepeatedMask(V2Mask))
13770       if (SDValue UnpackPerm =
13771               lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
13772         return UnpackPerm;
13773     if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
13774             DL, VT, V1, V2, Mask, Subtarget, DAG))
13775       return RotatePerm;
13776     // Unpack/rotate failed - try again with variable blends.
13777     if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13778                                                           DAG))
13779       return BlendPerm;
13780     if (VT.getScalarSizeInBits() >= 32)
13781       if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
13782               DL, VT, V1, V2, Mask, Subtarget, DAG))
13783         return PermUnpack;
13784   }
13785
13786   // If the final mask is an alternating blend of vXi8/vXi16, convert to an
13787   // UNPCKL(SHUFFLE, SHUFFLE) pattern.
13788   // TODO: It doesn't have to be alternating - but each lane mustn't have more
13789   // than half the elements coming from each source.
13790   if (IsAlternating && VT.getScalarSizeInBits() < 32) {
13791     V1Mask.assign(NumElts, -1);
13792     V2Mask.assign(NumElts, -1);
13793     FinalMask.assign(NumElts, -1);
13794     for (int i = 0; i != NumElts; i += NumEltsPerLane)
13795       for (int j = 0; j != NumEltsPerLane; ++j) {
13796         int M = Mask[i + j];
13797         if (M >= 0 && M < NumElts) {
13798           V1Mask[i + (j / 2)] = M;
13799           FinalMask[i + j] = i + (j / 2);
13800         } else if (M >= NumElts) {
13801           V2Mask[i + (j / 2)] = M - NumElts;
13802           FinalMask[i + j] = i + (j / 2) + NumElts;
13803         }
13804       }
13805   }
13806
13807   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13808   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13809   return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
13810 }
13811
13812 /// Try to lower a vector shuffle as a bit rotation.
13813 ///
13814 /// Look for a repeated rotation pattern in each sub group.
13815 /// Returns a ISD::ROTL element rotation amount or -1 if failed.
13816 static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
13817   int NumElts = Mask.size();
13818   assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
13819
13820   int RotateAmt = -1;
13821   for (int i = 0; i != NumElts; i += NumSubElts) {
13822     for (int j = 0; j != NumSubElts; ++j) {
13823       int M = Mask[i + j];
13824       if (M < 0)
13825         continue;
13826       if (!isInRange(M, i, i + NumSubElts))
13827         return -1;
13828       int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
13829       if (0 <= RotateAmt && Offset != RotateAmt)
13830         return -1;
13831       RotateAmt = Offset;
13832     }
13833   }
13834   return RotateAmt;
13835 }
13836
13837 static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
13838                                    const X86Subtarget &Subtarget,
13839                                    ArrayRef<int> Mask) {
13840   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13841   assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
13842
13843   // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
13844   int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
13845   int MaxSubElts = 64 / EltSizeInBits;
13846   for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
13847     int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
13848     if (RotateAmt < 0)
13849       continue;
13850
13851     int NumElts = Mask.size();
13852     MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
13853     RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
13854     return RotateAmt * EltSizeInBits;
13855   }
13856
13857   return -1;
13858 }
13859
13860 /// Lower shuffle using X86ISD::VROTLI rotations.
13861 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
13862                                        ArrayRef<int> Mask,
13863                                        const X86Subtarget &Subtarget,
13864                                        SelectionDAG &DAG) {
13865   // Only XOP + AVX512 targets have bit rotation instructions.
13866   // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
13867   bool IsLegal =
13868       (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
13869   if (!IsLegal && Subtarget.hasSSE3())
13870     return SDValue();
13871
13872   MVT RotateVT;
13873   int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
13874                                           Subtarget, Mask);
13875   if (RotateAmt < 0)
13876     return SDValue();
13877
13878   // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
13879   // expanded to OR(SRL,SHL), will be more efficient, but if they can
13880   // widen to vXi16 or more then existing lowering should will be better.
13881   if (!IsLegal) {
13882     if ((RotateAmt % 16) == 0)
13883       return SDValue();
13884     // TODO: Use getTargetVShiftByConstNode.
13885     unsigned ShlAmt = RotateAmt;
13886     unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
13887     V1 = DAG.getBitcast(RotateVT, V1);
13888     SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
13889                               DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
13890     SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
13891                               DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
13892     SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
13893     return DAG.getBitcast(VT, Rot);
13894   }
13895
13896   SDValue Rot =
13897       DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
13898                   DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
13899   return DAG.getBitcast(VT, Rot);
13900 }
13901
13902 /// Try to match a vector shuffle as an element rotation.
13903 ///
13904 /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
13905 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
13906                                        ArrayRef<int> Mask) {
13907   int NumElts = Mask.size();
13908
13909   // We need to detect various ways of spelling a rotation:
13910   //   [11, 12, 13, 14, 15,  0,  1,  2]
13911   //   [-1, 12, 13, 14, -1, -1,  1, -1]
13912   //   [-1, -1, -1, -1, -1, -1,  1,  2]
13913   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
13914   //   [-1,  4,  5,  6, -1, -1,  9, -1]
13915   //   [-1,  4,  5,  6, -1, -1, -1, -1]
13916   int Rotation = 0;
13917   SDValue Lo, Hi;
13918   for (int i = 0; i < NumElts; ++i) {
13919     int M = Mask[i];
13920     assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
13921            "Unexpected mask index.");
13922     if (M < 0)
13923       continue;
13924
13925     // Determine where a rotated vector would have started.
13926     int StartIdx = i - (M % NumElts);
13927     if (StartIdx == 0)
13928       // The identity rotation isn't interesting, stop.
13929       return -1;
13930
13931     // If we found the tail of a vector the rotation must be the missing
13932     // front. If we found the head of a vector, it must be how much of the
13933     // head.
13934     int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
13935
13936     if (Rotation == 0)
13937       Rotation = CandidateRotation;
13938     else if (Rotation != CandidateRotation)
13939       // The rotations don't match, so we can't match this mask.
13940       return -1;
13941
13942     // Compute which value this mask is pointing at.
13943     SDValue MaskV = M < NumElts ? V1 : V2;
13944
13945     // Compute which of the two target values this index should be assigned
13946     // to. This reflects whether the high elements are remaining or the low
13947     // elements are remaining.
13948     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
13949
13950     // Either set up this value if we've not encountered it before, or check
13951     // that it remains consistent.
13952     if (!TargetV)
13953       TargetV = MaskV;
13954     else if (TargetV != MaskV)
13955       // This may be a rotation, but it pulls from the inputs in some
13956       // unsupported interleaving.
13957       return -1;
13958   }
13959
13960   // Check that we successfully analyzed the mask, and normalize the results.
13961   assert(Rotation != 0 && "Failed to locate a viable rotation!");
13962   assert((Lo || Hi) && "Failed to find a rotated input vector!");
13963   if (!Lo)
13964     Lo = Hi;
13965   else if (!Hi)
13966     Hi = Lo;
13967
13968   V1 = Lo;
13969   V2 = Hi;
13970
13971   return Rotation;
13972 }
13973
13974 /// Try to lower a vector shuffle as a byte rotation.
13975 ///
13976 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
13977 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
13978 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
13979 /// try to generically lower a vector shuffle through such an pattern. It
13980 /// does not check for the profitability of lowering either as PALIGNR or
13981 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
13982 /// This matches shuffle vectors that look like:
13983 ///
13984 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
13985 ///
13986 /// Essentially it concatenates V1 and V2, shifts right by some number of
13987 /// elements, and takes the low elements as the result. Note that while this is
13988 /// specified as a *right shift* because x86 is little-endian, it is a *left
13989 /// rotate* of the vector lanes.
13990 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
13991                                     ArrayRef<int> Mask) {
13992   // Don't accept any shuffles with zero elements.
13993   if (isAnyZero(Mask))
13994     return -1;
13995
13996   // PALIGNR works on 128-bit lanes.
13997   SmallVector<int, 16> RepeatedMask;
13998   if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
13999     return -1;
14000
14001   int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
14002   if (Rotation <= 0)
14003     return -1;
14004
14005   // PALIGNR rotates bytes, so we need to scale the
14006   // rotation based on how many bytes are in the vector lane.
14007   int NumElts = RepeatedMask.size();
14008   int Scale = 16 / NumElts;
14009   return Rotation * Scale;
14010 }
14011
14012 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
14013                                         SDValue V2, ArrayRef<int> Mask,
14014                                         const X86Subtarget &Subtarget,
14015                                         SelectionDAG &DAG) {
14016   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
14017
14018   SDValue Lo = V1, Hi = V2;
14019   int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
14020   if (ByteRotation <= 0)
14021     return SDValue();
14022
14023   // Cast the inputs to i8 vector of correct length to match PALIGNR or
14024   // PSLLDQ/PSRLDQ.
14025   MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
14026   Lo = DAG.getBitcast(ByteVT, Lo);
14027   Hi = DAG.getBitcast(ByteVT, Hi);
14028
14029   // SSSE3 targets can use the palignr instruction.
14030   if (Subtarget.hasSSSE3()) {
14031     assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
14032            "512-bit PALIGNR requires BWI instructions");
14033     return DAG.getBitcast(
14034         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
14035                         DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
14036   }
14037
14038   assert(VT.is128BitVector() &&
14039          "Rotate-based lowering only supports 128-bit lowering!");
14040   assert(Mask.size() <= 16 &&
14041          "Can shuffle at most 16 bytes in a 128-bit vector!");
14042   assert(ByteVT == MVT::v16i8 &&
14043          "SSE2 rotate lowering only needed for v16i8!");
14044
14045   // Default SSE2 implementation
14046   int LoByteShift = 16 - ByteRotation;
14047   int HiByteShift = ByteRotation;
14048
14049   SDValue LoShift =
14050       DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
14051                   DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
14052   SDValue HiShift =
14053       DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
14054                   DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
14055   return DAG.getBitcast(VT,
14056                         DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
14057 }
14058
14059 /// Try to lower a vector shuffle as a dword/qword rotation.
14060 ///
14061 /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
14062 /// rotation of the concatenation of two vectors; This routine will
14063 /// try to generically lower a vector shuffle through such an pattern.
14064 ///
14065 /// Essentially it concatenates V1 and V2, shifts right by some number of
14066 /// elements, and takes the low elements as the result. Note that while this is
14067 /// specified as a *right shift* because x86 is little-endian, it is a *left
14068 /// rotate* of the vector lanes.
14069 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
14070                                     SDValue V2, ArrayRef<int> Mask,
14071                                     const X86Subtarget &Subtarget,
14072                                     SelectionDAG &DAG) {
14073   assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
14074          "Only 32-bit and 64-bit elements are supported!");
14075
14076   // 128/256-bit vectors are only supported with VLX.
14077   assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
14078          && "VLX required for 128/256-bit vectors");
14079
14080   SDValue Lo = V1, Hi = V2;
14081   int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
14082   if (Rotation <= 0)
14083     return SDValue();
14084
14085   return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
14086                      DAG.getTargetConstant(Rotation, DL, MVT::i8));
14087 }
14088
14089 /// Try to lower a vector shuffle as a byte shift sequence.
14090 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
14091                                            SDValue V2, ArrayRef<int> Mask,
14092                                            const APInt &Zeroable,
14093                                            const X86Subtarget &Subtarget,
14094                                            SelectionDAG &DAG) {
14095   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
14096   assert(VT.is128BitVector() && "Only 128-bit vectors supported");
14097
14098   // We need a shuffle that has zeros at one/both ends and a sequential
14099   // shuffle from one source within.
14100   unsigned ZeroLo = Zeroable.countr_one();
14101   unsigned ZeroHi = Zeroable.countl_one();
14102   if (!ZeroLo && !ZeroHi)
14103     return SDValue();
14104
14105   unsigned NumElts = Mask.size();
14106   unsigned Len = NumElts - (ZeroLo + ZeroHi);
14107   if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
14108     return SDValue();
14109
14110   unsigned Scale = VT.getScalarSizeInBits() / 8;
14111   ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
14112   if (!isUndefOrInRange(StubMask, 0, NumElts) &&
14113       !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
14114     return SDValue();
14115
14116   SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
14117   Res = DAG.getBitcast(MVT::v16i8, Res);
14118
14119   // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
14120   // inner sequential set of elements, possibly offset:
14121   // 01234567 --> zzzzzz01 --> 1zzzzzzz
14122   // 01234567 --> 4567zzzz --> zzzzz456
14123   // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
14124   if (ZeroLo == 0) {
14125     unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
14126     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14127                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14128     Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14129                       DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
14130   } else if (ZeroHi == 0) {
14131     unsigned Shift = Mask[ZeroLo] % NumElts;
14132     Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14133                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14134     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14135                       DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
14136   } else if (!Subtarget.hasSSSE3()) {
14137     // If we don't have PSHUFB then its worth avoiding an AND constant mask
14138     // by performing 3 byte shifts. Shuffle combining can kick in above that.
14139     // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
14140     unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
14141     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14142                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14143     Shift += Mask[ZeroLo] % NumElts;
14144     Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14145                       DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14146     Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14147                       DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
14148   } else
14149     return SDValue();
14150
14151   return DAG.getBitcast(VT, Res);
14152 }
14153
14154 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
14155 ///
14156 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
14157 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
14158 /// matches elements from one of the input vectors shuffled to the left or
14159 /// right with zeroable elements 'shifted in'. It handles both the strictly
14160 /// bit-wise element shifts and the byte shift across an entire 128-bit double
14161 /// quad word lane.
14162 ///
14163 /// PSHL : (little-endian) left bit shift.
14164 /// [ zz, 0, zz,  2 ]
14165 /// [ -1, 4, zz, -1 ]
14166 /// PSRL : (little-endian) right bit shift.
14167 /// [  1, zz,  3, zz]
14168 /// [ -1, -1,  7, zz]
14169 /// PSLLDQ : (little-endian) left byte shift
14170 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
14171 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
14172 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
14173 /// PSRLDQ : (little-endian) right byte shift
14174 /// [  5, 6,  7, zz, zz, zz, zz, zz]
14175 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
14176 /// [  1, 2, -1, -1, -1, -1, zz, zz]
14177 static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
14178                                unsigned ScalarSizeInBits, ArrayRef<int> Mask,
14179                                int MaskOffset, const APInt &Zeroable,
14180                                const X86Subtarget &Subtarget) {
14181   int Size = Mask.size();
14182   unsigned SizeInBits = Size * ScalarSizeInBits;
14183
14184   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
14185     for (int i = 0; i < Size; i += Scale)
14186       for (int j = 0; j < Shift; ++j)
14187         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
14188           return false;
14189
14190     return true;
14191   };
14192
14193   auto MatchShift = [&](int Shift, int Scale, bool Left) {
14194     for (int i = 0; i != Size; i += Scale) {
14195       unsigned Pos = Left ? i + Shift : i;
14196       unsigned Low = Left ? i : i + Shift;
14197       unsigned Len = Scale - Shift;
14198       if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
14199         return -1;
14200     }
14201
14202     int ShiftEltBits = ScalarSizeInBits * Scale;
14203     bool ByteShift = ShiftEltBits > 64;
14204     Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
14205                   : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
14206     int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
14207
14208     // Normalize the scale for byte shifts to still produce an i64 element
14209     // type.
14210     Scale = ByteShift ? Scale / 2 : Scale;
14211
14212     // We need to round trip through the appropriate type for the shift.
14213     MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
14214     ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
14215                         : MVT::getVectorVT(ShiftSVT, Size / Scale);
14216     return (int)ShiftAmt;
14217   };
14218
14219   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
14220   // keep doubling the size of the integer elements up to that. We can
14221   // then shift the elements of the integer vector by whole multiples of
14222   // their width within the elements of the larger integer vector. Test each
14223   // multiple to see if we can find a match with the moved element indices
14224   // and that the shifted in elements are all zeroable.
14225   unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
14226   for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
14227     for (int Shift = 1; Shift != Scale; ++Shift)
14228       for (bool Left : {true, false})
14229         if (CheckZeros(Shift, Scale, Left)) {
14230           int ShiftAmt = MatchShift(Shift, Scale, Left);
14231           if (0 < ShiftAmt)
14232             return ShiftAmt;
14233         }
14234
14235   // no match
14236   return -1;
14237 }
14238
14239 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
14240                                    SDValue V2, ArrayRef<int> Mask,
14241                                    const APInt &Zeroable,
14242                                    const X86Subtarget &Subtarget,
14243                                    SelectionDAG &DAG, bool BitwiseOnly) {
14244   int Size = Mask.size();
14245   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14246
14247   MVT ShiftVT;
14248   SDValue V = V1;
14249   unsigned Opcode;
14250
14251   // Try to match shuffle against V1 shift.
14252   int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14253                                      Mask, 0, Zeroable, Subtarget);
14254
14255   // If V1 failed, try to match shuffle against V2 shift.
14256   if (ShiftAmt < 0) {
14257     ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14258                                    Mask, Size, Zeroable, Subtarget);
14259     V = V2;
14260   }
14261
14262   if (ShiftAmt < 0)
14263     return SDValue();
14264
14265   if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
14266     return SDValue();
14267
14268   assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
14269          "Illegal integer vector type");
14270   V = DAG.getBitcast(ShiftVT, V);
14271   V = DAG.getNode(Opcode, DL, ShiftVT, V,
14272                   DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
14273   return DAG.getBitcast(VT, V);
14274 }
14275
14276 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
14277 // Remainder of lower half result is zero and upper half is all undef.
14278 static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
14279                                 ArrayRef<int> Mask, uint64_t &BitLen,
14280                                 uint64_t &BitIdx, const APInt &Zeroable) {
14281   int Size = Mask.size();
14282   int HalfSize = Size / 2;
14283   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14284   assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
14285
14286   // Upper half must be undefined.
14287   if (!isUndefUpperHalf(Mask))
14288     return false;
14289
14290   // Determine the extraction length from the part of the
14291   // lower half that isn't zeroable.
14292   int Len = HalfSize;
14293   for (; Len > 0; --Len)
14294     if (!Zeroable[Len - 1])
14295       break;
14296   assert(Len > 0 && "Zeroable shuffle mask");
14297
14298   // Attempt to match first Len sequential elements from the lower half.
14299   SDValue Src;
14300   int Idx = -1;
14301   for (int i = 0; i != Len; ++i) {
14302     int M = Mask[i];
14303     if (M == SM_SentinelUndef)
14304       continue;
14305     SDValue &V = (M < Size ? V1 : V2);
14306     M = M % Size;
14307
14308     // The extracted elements must start at a valid index and all mask
14309     // elements must be in the lower half.
14310     if (i > M || M >= HalfSize)
14311       return false;
14312
14313     if (Idx < 0 || (Src == V && Idx == (M - i))) {
14314       Src = V;
14315       Idx = M - i;
14316       continue;
14317     }
14318     return false;
14319   }
14320
14321   if (!Src || Idx < 0)
14322     return false;
14323
14324   assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
14325   BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14326   BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14327   V1 = Src;
14328   return true;
14329 }
14330
14331 // INSERTQ: Extract lowest Len elements from lower half of second source and
14332 // insert over first source, starting at Idx.
14333 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
14334 static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
14335                                   ArrayRef<int> Mask, uint64_t &BitLen,
14336                                   uint64_t &BitIdx) {
14337   int Size = Mask.size();
14338   int HalfSize = Size / 2;
14339   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14340
14341   // Upper half must be undefined.
14342   if (!isUndefUpperHalf(Mask))
14343     return false;
14344
14345   for (int Idx = 0; Idx != HalfSize; ++Idx) {
14346     SDValue Base;
14347
14348     // Attempt to match first source from mask before insertion point.
14349     if (isUndefInRange(Mask, 0, Idx)) {
14350       /* EMPTY */
14351     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
14352       Base = V1;
14353     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
14354       Base = V2;
14355     } else {
14356       continue;
14357     }
14358
14359     // Extend the extraction length looking to match both the insertion of
14360     // the second source and the remaining elements of the first.
14361     for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
14362       SDValue Insert;
14363       int Len = Hi - Idx;
14364
14365       // Match insertion.
14366       if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
14367         Insert = V1;
14368       } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
14369         Insert = V2;
14370       } else {
14371         continue;
14372       }
14373
14374       // Match the remaining elements of the lower half.
14375       if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
14376         /* EMPTY */
14377       } else if ((!Base || (Base == V1)) &&
14378                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
14379         Base = V1;
14380       } else if ((!Base || (Base == V2)) &&
14381                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
14382                                             Size + Hi)) {
14383         Base = V2;
14384       } else {
14385         continue;
14386       }
14387
14388       BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14389       BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14390       V1 = Base;
14391       V2 = Insert;
14392       return true;
14393     }
14394   }
14395
14396   return false;
14397 }
14398
14399 /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
14400 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
14401                                      SDValue V2, ArrayRef<int> Mask,
14402                                      const APInt &Zeroable, SelectionDAG &DAG) {
14403   uint64_t BitLen, BitIdx;
14404   if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
14405     return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
14406                        DAG.getTargetConstant(BitLen, DL, MVT::i8),
14407                        DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14408
14409   if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
14410     return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
14411                        V2 ? V2 : DAG.getUNDEF(VT),
14412                        DAG.getTargetConstant(BitLen, DL, MVT::i8),
14413                        DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14414
14415   return SDValue();
14416 }
14417
14418 /// Lower a vector shuffle as a zero or any extension.
14419 ///
14420 /// Given a specific number of elements, element bit width, and extension
14421 /// stride, produce either a zero or any extension based on the available
14422 /// features of the subtarget. The extended elements are consecutive and
14423 /// begin and can start from an offsetted element index in the input; to
14424 /// avoid excess shuffling the offset must either being in the bottom lane
14425 /// or at the start of a higher lane. All extended elements must be from
14426 /// the same lane.
14427 static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
14428     const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
14429     ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14430   assert(Scale > 1 && "Need a scale to extend.");
14431   int EltBits = VT.getScalarSizeInBits();
14432   int NumElements = VT.getVectorNumElements();
14433   int NumEltsPerLane = 128 / EltBits;
14434   int OffsetLane = Offset / NumEltsPerLane;
14435   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
14436          "Only 8, 16, and 32 bit elements can be extended.");
14437   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
14438   assert(0 <= Offset && "Extension offset must be positive.");
14439   assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
14440          "Extension offset must be in the first lane or start an upper lane.");
14441
14442   // Check that an index is in same lane as the base offset.
14443   auto SafeOffset = [&](int Idx) {
14444     return OffsetLane == (Idx / NumEltsPerLane);
14445   };
14446
14447   // Shift along an input so that the offset base moves to the first element.
14448   auto ShuffleOffset = [&](SDValue V) {
14449     if (!Offset)
14450       return V;
14451
14452     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14453     for (int i = 0; i * Scale < NumElements; ++i) {
14454       int SrcIdx = i + Offset;
14455       ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
14456     }
14457     return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
14458   };
14459
14460   // Found a valid a/zext mask! Try various lowering strategies based on the
14461   // input type and available ISA extensions.
14462   if (Subtarget.hasSSE41()) {
14463     // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
14464     // PUNPCK will catch this in a later shuffle match.
14465     if (Offset && Scale == 2 && VT.is128BitVector())
14466       return SDValue();
14467     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
14468                                  NumElements / Scale);
14469     InputV = DAG.getBitcast(VT, InputV);
14470     InputV = ShuffleOffset(InputV);
14471     InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
14472                                     DL, ExtVT, InputV, DAG);
14473     return DAG.getBitcast(VT, InputV);
14474   }
14475
14476   assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
14477   InputV = DAG.getBitcast(VT, InputV);
14478
14479   // For any extends we can cheat for larger element sizes and use shuffle
14480   // instructions that can fold with a load and/or copy.
14481   if (AnyExt && EltBits == 32) {
14482     int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
14483                          -1};
14484     return DAG.getBitcast(
14485         VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14486                         DAG.getBitcast(MVT::v4i32, InputV),
14487                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14488   }
14489   if (AnyExt && EltBits == 16 && Scale > 2) {
14490     int PSHUFDMask[4] = {Offset / 2, -1,
14491                          SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
14492     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14493                          DAG.getBitcast(MVT::v4i32, InputV),
14494                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14495     int PSHUFWMask[4] = {1, -1, -1, -1};
14496     unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
14497     return DAG.getBitcast(
14498         VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
14499                         DAG.getBitcast(MVT::v8i16, InputV),
14500                         getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
14501   }
14502
14503   // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
14504   // to 64-bits.
14505   if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
14506     assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
14507     assert(VT.is128BitVector() && "Unexpected vector width!");
14508
14509     int LoIdx = Offset * EltBits;
14510     SDValue Lo = DAG.getBitcast(
14511         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14512                                 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14513                                 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
14514
14515     if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
14516       return DAG.getBitcast(VT, Lo);
14517
14518     int HiIdx = (Offset + 1) * EltBits;
14519     SDValue Hi = DAG.getBitcast(
14520         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14521                                 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14522                                 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
14523     return DAG.getBitcast(VT,
14524                           DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
14525   }
14526
14527   // If this would require more than 2 unpack instructions to expand, use
14528   // pshufb when available. We can only use more than 2 unpack instructions
14529   // when zero extending i8 elements which also makes it easier to use pshufb.
14530   if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
14531     assert(NumElements == 16 && "Unexpected byte vector width!");
14532     SDValue PSHUFBMask[16];
14533     for (int i = 0; i < 16; ++i) {
14534       int Idx = Offset + (i / Scale);
14535       if ((i % Scale == 0 && SafeOffset(Idx))) {
14536         PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
14537         continue;
14538       }
14539       PSHUFBMask[i] =
14540           AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
14541     }
14542     InputV = DAG.getBitcast(MVT::v16i8, InputV);
14543     return DAG.getBitcast(
14544         VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
14545                         DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
14546   }
14547
14548   // If we are extending from an offset, ensure we start on a boundary that
14549   // we can unpack from.
14550   int AlignToUnpack = Offset % (NumElements / Scale);
14551   if (AlignToUnpack) {
14552     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14553     for (int i = AlignToUnpack; i < NumElements; ++i)
14554       ShMask[i - AlignToUnpack] = i;
14555     InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
14556     Offset -= AlignToUnpack;
14557   }
14558
14559   // Otherwise emit a sequence of unpacks.
14560   do {
14561     unsigned UnpackLoHi = X86ISD::UNPCKL;
14562     if (Offset >= (NumElements / 2)) {
14563       UnpackLoHi = X86ISD::UNPCKH;
14564       Offset -= (NumElements / 2);
14565     }
14566
14567     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
14568     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
14569                          : getZeroVector(InputVT, Subtarget, DAG, DL);
14570     InputV = DAG.getBitcast(InputVT, InputV);
14571     InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
14572     Scale /= 2;
14573     EltBits *= 2;
14574     NumElements /= 2;
14575   } while (Scale > 1);
14576   return DAG.getBitcast(VT, InputV);
14577 }
14578
14579 /// Try to lower a vector shuffle as a zero (or any) extension on any microarch.
14580 ///
14581 /// Zext-style shuffles are common and performance sensitive, so this matches
14582 /// them aggressively without checking profitability. Scales are tried from
14583 /// the widest (elements extended to 64 bits) down to 2. A scale matches when
14584 /// every Scale'th mask element reads consecutive elements of a single input,
14585 /// starting at an offset that passes the lane checks in the body, and every
14586 /// other defined element is zeroable; if all of those are undef the match is
14587 /// an any-extend. A match is lowered by lowerShuffleAsSpecificZeroOrAnyExtend.
14588 /// For 128-bit vectors a last fallback emits VZEXT_MOVL when the low half is
14589 /// sequential-or-undef from V1 or V2 and the whole upper half is zeroable.
14590 /// \returns the lowered value, or an empty SDValue if nothing matched.
14591 static SDValue lowerShuffleAsZeroOrAnyExtend(
14592     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14593     const APInt &Zeroable, const X86Subtarget &Subtarget,
14594     SelectionDAG &DAG) {
14595   int Bits = VT.getSizeInBits();
14596   int NumLanes = Bits / 128; // Number of 128-bit lanes in VT.
14597   int NumElements = VT.getVectorNumElements();
14598   int NumEltsPerLane = NumElements / NumLanes;
14599   assert(VT.getScalarSizeInBits() <= 32 &&
14600          "Exceeds 32-bit integer zero extension limit");
14601   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
14602
14603   // Helper that checks whether the mask is an extension by one particular
14604   // scale and, if so, hands it to the specific lowering routine.
14605   auto Lower = [&](int Scale) -> SDValue {
14606     SDValue InputV; // Input feeding the base elements; unset until one is seen.
14607     bool AnyExt = true; // Cleared once a defined non-base element is seen.
14608     int Offset = 0; // Source index that output base element 0 reads from.
14609     int Matches = 0; // Number of defined base elements that passed the checks.
14610     for (int i = 0; i < NumElements; ++i) {
14611       int M = Mask[i];
14612       if (M < 0)
14613         continue; // Valid anywhere but doesn't tell us anything.
14614       if (i % Scale != 0) {
14615         // Each of the extended (non-base) elements needs to be zeroable.
14616         if (!Zeroable[i])
14617           return SDValue();
14618
14619         // A defined zero element means we are no longer in the anyext case.
14620         AnyExt = false;
14621         continue;
14622       }
14623
14624       // Each of the base elements needs to be consecutive indices into the
14625       // same input vector. M is reduced to an index within that input.
14626       SDValue V = M < NumElements ? V1 : V2;
14627       M = M % NumElements;
14628       if (!InputV) {
14629         InputV = V;
14630         Offset = M - (i / Scale);
14631       } else if (InputV != V)
14632         return SDValue(); // Flip-flopping inputs.
14633
14634       // Offset must start in the lowest 128-bit lane or at the start of an
14635       // upper lane.
14636       // FIXME: Is it ever worth allowing a negative base offset?
14637       if (!((0 <= Offset && Offset < NumEltsPerLane) ||
14638             (Offset % NumEltsPerLane) == 0))
14639         return SDValue();
14640
14641       // If we are offsetting, all referenced entries must come from the same
14642       // 128-bit lane as the offset itself.
14643       if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
14644         return SDValue();
14645
14646       if ((M % NumElements) != (Offset + (i / Scale)))
14647         return SDValue(); // Non-consecutive strided elements.
14648       Matches++;
14649     }
14650
14651     // If we fail to find an input, we have a zero-shuffle which should always
14652     // have already been handled.
14653     // FIXME: Maybe handle this here in case during blending we end up with one?
14654     if (!InputV)
14655       return SDValue();
14656
14657     // If we are offsetting, don't extend when fewer than two base elements
14658     // matched; we can always do better by using a basic PSHUF or PUNPCK.
14659     if (Offset != 0 && Matches < 2)
14660       return SDValue();
14661
14662     return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
14663                                                  InputV, Mask, Subtarget, DAG);
14664   };
14665
14666   // The widest scale possible for extending is to a 64-bit integer.
14667   assert(Bits % 64 == 0 &&
14668          "The number of bits in a vector must be divisible by 64 on x86!");
14669   int NumExtElements = Bits / 64;
14670
14671   // Each iteration, try extending the elements half as much, but into twice as
14672   // many elements. The first scale that lowers successfully wins.
14673   for (; NumExtElements < NumElements; NumExtElements *= 2) {
14674     assert(NumElements % NumExtElements == 0 &&
14675            "The input vector size must be divisible by the extended size.");
14676     if (SDValue V = Lower(NumElements / NumExtElements))
14677       return V;
14678   }
14679
14680   // General extends failed, but 128-bit vectors may be able to use MOVQ.
14681   if (Bits != 128)
14682     return SDValue();
14683
14684   // Returns one of the source operands if the shuffle can be reduced to a
14685   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
14686   auto CanZExtLowHalf = [&]() {
14687     for (int i = NumElements / 2; i != NumElements; ++i)
14688       if (!Zeroable[i])
14689         return SDValue();
14690     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
14691       return V1;
14692     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
14693       return V2;
14694     return SDValue();
14695   };
14696
14697   if (SDValue V = CanZExtLowHalf()) {
14698     V = DAG.getBitcast(MVT::v2i64, V);
14699     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
14700     return DAG.getBitcast(VT, V);
14701   }
14702
14703   // No viable ext lowering found.
14704   return SDValue();
14705 }
14706
14707 /// Try to find the scalar that defines element \p Idx of vector \p V.
14708 /// Peeks through bitcasts, then handles BUILD_VECTOR (any element) and
14709 /// SCALAR_TO_VECTOR (element 0 only); otherwise returns an empty SDValue.
14710 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
14711                                               SelectionDAG &DAG) {
14712   MVT OrigVT = V.getSimpleValueType();
14713   MVT ScalarVT = OrigVT.getVectorElementType();
14714   SDValue Src = peekThroughBitcasts(V);
14715
14716   // A bitcast that changes the element width (or yields a non-vector) leaves
14717   // no equivalent element to extract.
14718   MVT SrcVT = Src.getSimpleValueType();
14719   if (!SrcVT.isVector() ||
14720       SrcVT.getScalarSizeInBits() != OrigVT.getScalarSizeInBits())
14721     return SDValue();
14722
14723   const unsigned Opc = Src.getOpcode();
14724   if (Opc != ISD::BUILD_VECTOR && !(Idx == 0 && Opc == ISD::SCALAR_TO_VECTOR))
14725     return SDValue();
14726   // The scalar operand must be the same size as the destination element.
14727   // FIXME: Add support for scalar truncation where possible.
14728   SDValue Elt = Src.getOperand(Idx);
14729   if (ScalarVT.getSizeInBits() != Elt.getSimpleValueType().getSizeInBits())
14730     return SDValue();
14731   return DAG.getBitcast(ScalarVT, Elt);
14732 }
14733
14734 /// Return true if \p V has a single use and, looking through one-use
14735 /// bitcasts, is a non-extending load, i.e. a load an x86 shuffle can fold.
14736 /// This matters because the set of usable instructions varies significantly
14737 /// based on whether the operand is a load or not.
14738 static bool isShuffleFoldableLoad(SDValue V) {
14739   if (!V->hasOneUse())
14740     return false;
14741   return ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
14742 }
14742
14743 template <typename T> // T must provide getScalarType().
14744 static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
14745   return !Subtarget.hasFP16() && VT.getScalarType() == MVT::f16; // f16, no FP16.
14746 }
14747
14748 template<typename T> // T: any value type accepted by ::isSoftFP16.
14749 bool X86TargetLowering::isSoftFP16(T VT) const {
14750   return ::isSoftFP16(VT, Subtarget); // Forward to the global-scope template.
14751 }
14752
14753 /// Try to lower a shuffle that inserts a single element of V2 into V1, where
14754 /// V1 is zeroable or (in a few cases handled below) kept in place. This is a
14755 /// common pattern with especially efficient lowerings across all subtarget
14756 /// feature sets. \returns the lowered value, or an empty SDValue on failure.
14757 static SDValue lowerShuffleAsElementInsertion(
14758     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14759     const APInt &Zeroable, const X86Subtarget &Subtarget,
14760     SelectionDAG &DAG) {
14761   MVT ExtVT = VT; // May be switched to an i32-element type below.
14762   MVT EltVT = VT.getVectorElementType();
14763   unsigned NumElts = VT.getVectorNumElements();
14764   unsigned EltBits = VT.getScalarSizeInBits();
14765
14766   if (isSoftFP16(EltVT, Subtarget))
14767     return SDValue();
14768
14769   int V2Index = // NOTE(review): assumes some mask element reads V2 - confirm.
14770       find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
14771       Mask.begin(); // Position of the first mask element that reads from V2.
14772   bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
14773   bool IsV1Zeroable = true; // True if every slot other than V2Index is zeroable.
14774   for (int i = 0, Size = Mask.size(); i < Size; ++i)
14775     if (i != V2Index && !Zeroable[i]) {
14776       IsV1Zeroable = false;
14777       break;
14778     }
14779
14780   // Bail if a non-zero V1 isn't used in place (ignoring the inserted slot).
14781   if (!IsV1Zeroable) {
14782     SmallVector<int, 8> V1Mask(Mask);
14783     V1Mask[V2Index] = -1;
14784     if (!isNoopShuffleMask(V1Mask))
14785       return SDValue();
14786   }
14787
14788   // Check for a single input from a SCALAR_TO_VECTOR node.
14789   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
14790   // all the smarts here sunk into that routine. However, the current
14791   // lowering of BUILD_VECTOR makes that nearly impossible until the old
14792   // vector shuffle lowering is dead.
14793   SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
14794                                                DAG);
14795   if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
14796     // We need to zext the scalar if it is smaller than an i32.
14797     V2S = DAG.getBitcast(EltVT, V2S);
14798     if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
14799       // Using zext to expand a narrow element won't work for non-zero
14800       // insertions. But we can use a masked constant vector if we're
14801       // inserting V2 into the bottom of V1.
14802       if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
14803         return SDValue();
14804
14805       // Zero-extend directly to i32; ExtVT becomes the matching i32 vector.
14806       ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
14807       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
14808
14809       // If we're inserting into a constant, mask off the inserted index
14810       // and OR with the zero-extended scalar.
14811       if (!IsV1Zeroable) {
14812         SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
14813         Bits[V2Index] = APInt::getZero(EltBits);
14814         SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
14815         V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
14816         V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14817         V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
14818         return DAG.getNode(ISD::OR, DL, VT, V1, V2);
14819       }
14820     }
14821     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14822   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
14823              EltVT == MVT::i16) {
14824     // Either not inserting from the low element of the input or the input
14825     // element size is too small to use VZEXT_MOVL to clear the high bits.
14826     return SDValue();
14827   }
14828
14829   if (!IsV1Zeroable) {
14830     // If V1 can't be treated as a zero vector we have fewer options to lower
14831     // this. We can't support integer vectors or non-zero targets cheaply.
14832     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
14833     if (!VT.isFloatingPoint() || V2Index != 0)
14834       return SDValue();
14835     if (!VT.is128BitVector())
14836       return SDValue();
14837
14838     // Otherwise, use MOVSD, MOVSS or MOVSH, chosen by the element type.
14839     unsigned MovOpc = 0;
14840     if (EltVT == MVT::f16)
14841       MovOpc = X86ISD::MOVSH;
14842     else if (EltVT == MVT::f32)
14843       MovOpc = X86ISD::MOVSS;
14844     else if (EltVT == MVT::f64)
14845       MovOpc = X86ISD::MOVSD;
14846     else
14847       llvm_unreachable("Unsupported floating point element type to handle!");
14848     return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
14849   }
14850
14851   // This lowering only works for the low element with floating point vectors.
14852   if (VT.isFloatingPoint() && V2Index != 0)
14853     return SDValue();
14854
14855   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
14856   if (ExtVT != VT)
14857     V2 = DAG.getBitcast(VT, V2);
14858
14859   if (V2Index != 0) {
14860     // If we have 4 or fewer elements we can cheaply shuffle the element into
14861     // the desired position. Otherwise it is more efficient to do a vector
14862     // shift left. We know that we can do a vector shift left because all
14863     // the inputs are zero.
14864     if (VT.isFloatingPoint() || NumElts <= 4) {
14865       SmallVector<int, 4> V2Shuffle(Mask.size(), 1); // Presumably a zero slot.
14866       V2Shuffle[V2Index] = 0;
14867       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
14868     } else {
14869       V2 = DAG.getBitcast(MVT::v16i8, V2);
14870       V2 = DAG.getNode(
14871           X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
14872           DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8)); // Bytes.
14873       V2 = DAG.getBitcast(VT, V2);
14874     }
14875   }
14876   return V2;
14877 }
14878
14879 /// Try to lower the broadcast of one integer element of \p VT that is a
14880 /// truncated piece of a wider scalar operand of the scalar_to_vector or
14881 /// build_vector node \p V0. Returns an empty SDValue when V0 does not fit.
14882 /// This assumes we have AVX2.
14883 static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
14884                                             int BroadcastIdx,
14885                                             const X86Subtarget &Subtarget,
14886                                             SelectionDAG &DAG) {
14887   assert(Subtarget.hasAVX2() &&
14888          "We can only lower integer broadcasts with AVX2!");
14889
14890   MVT TruncVT = VT.getVectorElementType();
14891   MVT V0VT = V0.getSimpleValueType();
14892
14893   assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
14894   assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
14895
14896   MVT WideEltVT = V0VT.getVectorElementType();
14897   if (!WideEltVT.isInteger())
14898     return SDValue();
14899
14900   const unsigned EltSize = TruncVT.getSizeInBits();
14901   const unsigned V0EltSize = WideEltVT.getSizeInBits();
14902
14903   // Nothing is truncated unless the source elements are strictly wider.
14904   if (EltSize >= V0EltSize)
14905     return SDValue();
14906
14907   assert(((V0EltSize % EltSize) == 0) &&
14908          "Scalar type sizes must all be powers of 2 on x86!");
14909
14910   // Narrow elements per wide element, and the wide element holding the target.
14911   const unsigned Ratio = V0EltSize / EltSize;
14912   const unsigned WideIdx = BroadcastIdx / Ratio;
14913   const unsigned Opc = V0.getOpcode();
14914   const bool LowScalarToVector = Opc == ISD::SCALAR_TO_VECTOR && WideIdx == 0;
14915   if (!LowScalarToVector && Opc != ISD::BUILD_VECTOR)
14916     return SDValue();
14917
14918   SDValue Wide = V0.getOperand(WideIdx);
14919
14920   // When the wanted bits are not the least significant ones, shift them down
14921   // so a plain truncate extracts them. The trunc/srl/load may fold into the
14922   // broadcast; if not, vpbroadcast+vmovd+shr still beats vpshufb(m)+vmovd.
14923   const int SubIdx = BroadcastIdx % Ratio;
14924   if (SubIdx != 0)
14925     Wide = DAG.getNode(ISD::SRL, DL, Wide.getValueType(), Wide,
14926                        DAG.getConstant(SubIdx * EltSize, DL, MVT::i8));
14927
14928   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Wide);
14929   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Trunc);
14930 }
14931
14932 /// Return true if \p Mask can be lowered with one SHUFPS instruction.
14933 ///
14934 /// Callers use this to skip more specialized lowerings when the plain shufps
14935 /// lowering will already be efficient.
14936 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
14937   // Only the four-element mask of a 128-bit shufps is handled.
14938   assert(Mask.size() == 4 && "Unsupported mask size!");
14939   assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
14940   assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
14941   assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
14942   assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
14943
14944   // Each half of the mask may use only one input: two defined elements must
14945   // both index the first input (0-3) or both index the second (4-7).
14946   auto SpansBothInputs = [](int A, int B) {
14947     return A >= 0 && B >= 0 && (A < 4) != (B < 4);
14948   };
14949
14950   return !SpansBothInputs(Mask[0], Mask[1]) &&
14951          !SpansBothInputs(Mask[2], Mask[3]);
14952 }
14953
14954 /// Test whether input \p Input (0 or 1) is blended in place by \p Mask.
14955 ///
14956 /// That is the case when every mask element drawn from that input already
14957 /// sits in its own slot of the input, so the input needs no permutation.
14958 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
14959   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
14960   const int NumElts = Mask.size();
14961   for (int Slot = 0; Slot != NumElts; ++Slot) {
14962     const int M = Mask[Slot];
14963     if (M >= 0 && M / NumElts == Input && M % NumElts != Slot)
14964       return false;
14965   }
14966   return true;
14967 }
14968
14969 /// If both shuffle inputs are the two 128-bit halves extracted from the same
14970 /// 256-bit vector, shuffle the wide vector instead (matching a 256-bit AVX2
14971 /// vperm* instruction) and extract the low half, avoiding a multi-shuffle.
14972 static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
14973                                              SDValue N1, ArrayRef<int> Mask,
14974                                              SelectionDAG &DAG) {
14975   MVT VT = N0.getSimpleValueType();
14976   assert((VT.is128BitVector() &&
14977           (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
14978          "VPERM* family of shuffles requires 32-bit or 64-bit elements");
14979
14980   // Both sources must be single-use extracts of one common source vector.
14981   const bool BothExtracts = N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14982                             N1.getOpcode() == ISD::EXTRACT_SUBVECTOR;
14983   if (!BothExtracts || N0.getOperand(0) != N1.getOperand(0) ||
14984       !N0.hasOneUse() || !N1.hasOneUse())
14985     return SDValue();
14986
14987   SDValue Src = N0.getOperand(0);
14988   MVT SrcVT = Src.getSimpleValueType();
14989   if (!SrcVT.is256BitVector())
14990     return SDValue();
14991
14992   // The two extracts must cover the low and high halves of the source in
14993   // either order; the mask is commuted when the low half is N1.
14994   unsigned HalfElts = VT.getVectorNumElements();
14995   const APInt &Idx0 = N0.getConstantOperandAPInt(1);
14996   const APInt &Idx1 = N1.getConstantOperandAPInt(1);
14997   const bool LowIsN0 = Idx0 == 0 && Idx1 == HalfElts;
14998   const bool LowIsN1 = Idx1 == 0 && Idx0 == HalfElts;
14999   if (!LowIsN0 && !LowIsN1)
15000     return SDValue();
15001   SmallVector<int, 4> WideMask(Mask);
15002   if (LowIsN1)
15003     ShuffleVectorSDNode::commuteMask(WideMask);
15004
15005   // Final bailout: if the mask is simple, we are better off using an extract
15006   // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
15007   // because that avoids a constant load from memory.
15008   if (HalfElts == 4 && (isSingleSHUFPSMask(WideMask) ||
15009                         is128BitUnpackShuffleMask(WideMask, DAG)))
15010     return SDValue();
15011
15012   // Pad the mask with undef elements up to the wide element count.
15013   WideMask.append(HalfElts, -1);
15014   // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
15015   SDValue WideShuf =
15016       DAG.getVectorShuffle(SrcVT, DL, Src, DAG.getUNDEF(SrcVT), WideMask);
15017   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideShuf,
15018                      DAG.getIntPtrConstant(0, DL)); // Free: ymm -> xmm.
15019 }
15020
15021 /// Try to lower broadcast of a single element.
15022 ///
15023 /// For convenience, this code also bundles all of the subtarget feature set
15024 /// filtering. While a little annoying to re-dispatch on type here, there isn't
15025 /// a convenient way to factor it out.
15026 static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
15027                                        SDValue V2, ArrayRef<int> Mask,
15028                                        const X86Subtarget &Subtarget,
15029                                        SelectionDAG &DAG) {
15030   MVT EltVT = VT.getVectorElementType();
15031   if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
15032         (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
15033         (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
15034     return SDValue();
15035
15036   // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
15037   // we can only broadcast from a register with AVX2.
15038   unsigned NumEltBits = VT.getScalarSizeInBits();
15039   unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
15040                         ? X86ISD::MOVDDUP
15041                         : X86ISD::VBROADCAST;
15042   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
15043
15044   // Check that the mask is a broadcast.
15045   int BroadcastIdx = getSplatIndex(Mask);
15046   if (BroadcastIdx < 0)
15047     return SDValue();
15048   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
15049                                             "a sorted mask where the broadcast "
15050                                             "comes from V1.");
15051
15052   // Go up the chain of (vector) values to find a scalar load that we can
15053   // combine with the broadcast.
15054   // TODO: Combine this logic with findEltLoadSrc() used by
15055   //       EltsFromConsecutiveLoads().
15056   int BitOffset = BroadcastIdx * NumEltBits;
15057   SDValue V = V1;
15058   for (;;) {
15059     switch (V.getOpcode()) {
15060     case ISD::BITCAST: {
15061       V = V.getOperand(0);
15062       continue;
15063     }
15064     case ISD::CONCAT_VECTORS: {
15065       int OpBitWidth = V.getOperand(0).getValueSizeInBits();
15066       int OpIdx = BitOffset / OpBitWidth;
15067       V = V.getOperand(OpIdx);
15068       BitOffset %= OpBitWidth;
15069       continue;
15070     }
15071     case ISD::EXTRACT_SUBVECTOR: {
15072       // The extraction index adds to the existing offset.
15073       unsigned EltBitWidth = V.getScalarValueSizeInBits();
15074       unsigned Idx = V.getConstantOperandVal(1);
15075       unsigned BeginOffset = Idx * EltBitWidth;
15076       BitOffset += BeginOffset;
15077       V = V.getOperand(0);
15078       continue;
15079     }
15080     case ISD::INSERT_SUBVECTOR: {
15081       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
15082       int EltBitWidth = VOuter.getScalarValueSizeInBits();
15083       int Idx = (int)V.getConstantOperandVal(2);
15084       int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
15085       int BeginOffset = Idx * EltBitWidth;
15086       int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
15087       if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
15088         BitOffset -= BeginOffset;
15089         V = VInner;
15090       } else {
15091         V = VOuter;
15092       }
15093       continue;
15094     }
15095     }
15096     break;
15097   }
15098   assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
15099   BroadcastIdx = BitOffset / NumEltBits;
15100
15101   // Do we need to bitcast the source to retrieve the original broadcast index?
15102   bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
15103
15104   // Check if this is a broadcast of a scalar. We special case lowering
15105   // for scalars so that we can more effectively fold with loads.
15106   // If the original value has a larger element type than the shuffle, the
15107   // broadcast element is in essence truncated. Make that explicit to ease
15108   // folding.
15109   if (BitCastSrc && VT.isInteger())
15110     if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
15111             DL, VT, V, BroadcastIdx, Subtarget, DAG))
15112       return TruncBroadcast;
15113
15114   // Also check the simpler case, where we can directly reuse the scalar.
15115   if (!BitCastSrc &&
15116       ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
15117        (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
15118     V = V.getOperand(BroadcastIdx);
15119
15120     // If we can't broadcast from a register, check that the input is a load.
15121     if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
15122       return SDValue();
15123   } else if (ISD::isNormalLoad(V.getNode()) &&
15124              cast<LoadSDNode>(V)->isSimple()) {
15125     // We do not check for one-use of the vector load because a broadcast load
15126     // is expected to be a win for code size, register pressure, and possibly
15127     // uops even if the original vector load is not eliminated.
15128
15129     // Reduce the vector load and shuffle to a broadcasted scalar load.
15130     LoadSDNode *Ld = cast<LoadSDNode>(V);
15131     SDValue BaseAddr = Ld->getOperand(1);
15132     MVT SVT = VT.getScalarType();
15133     unsigned Offset = BroadcastIdx * SVT.getStoreSize();
15134     assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
15135     SDValue NewAddr =
15136         DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
15137
15138     // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
15139     // than MOVDDUP.
15140     // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
15141     if (Opcode == X86ISD::VBROADCAST) {
15142       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
15143       SDValue Ops[] = {Ld->getChain(), NewAddr};
15144       V = DAG.getMemIntrinsicNode(
15145           X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
15146           DAG.getMachineFunction().getMachineMemOperand(
15147               Ld->getMemOperand(), Offset, SVT.getStoreSize()));
15148       DAG.makeEquivalentMemoryOrdering(Ld, V);
15149       return DAG.getBitcast(VT, V);
15150     }
15151     assert(SVT == MVT::f64 && "Unexpected VT!");
15152     V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
15153                     DAG.getMachineFunction().getMachineMemOperand(
15154                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
15155     DAG.makeEquivalentMemoryOrdering(Ld, V);
15156   } else if (!BroadcastFromReg) {
15157     // We can't broadcast from a vector register.
15158     return SDValue();
15159   } else if (BitOffset != 0) {
15160     // We can only broadcast from the zero-element of a vector register,
15161     // but it can be advantageous to broadcast from the zero-element of a
15162     // subvector.
15163     if (!VT.is256BitVector() && !VT.is512BitVector())
15164       return SDValue();
15165
15166     // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
15167     if (VT == MVT::v4f64 || VT == MVT::v4i64)
15168       return SDValue();
15169
15170     // Only broadcast the zero-element of a 128-bit subvector.
15171     if ((BitOffset % 128) != 0)
15172       return SDValue();
15173
15174     assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
15175            "Unexpected bit-offset");
15176     assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
15177            "Unexpected vector size");
15178     unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
15179     V = extract128BitVector(V, ExtractIdx, DAG, DL);
15180   }
15181
15182   // On AVX we can use VBROADCAST directly for scalar sources.
15183   if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
15184     V = DAG.getBitcast(MVT::f64, V);
15185     if (Subtarget.hasAVX()) {
15186       V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
15187       return DAG.getBitcast(VT, V);
15188     }
15189     V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
15190   }
15191
15192   // If this is a scalar, do the broadcast on this type and bitcast.
15193   if (!V.getValueType().isVector()) {
15194     assert(V.getScalarValueSizeInBits() == NumEltBits &&
15195            "Unexpected scalar size");
15196     MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
15197                                        VT.getVectorNumElements());
15198     return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
15199   }
15200
15201   // We only support broadcasting from 128-bit vectors to minimize the
15202   // number of patterns we need to deal with in isel. So extract down to
15203   // 128-bits, removing as many bitcasts as possible.
15204   if (V.getValueSizeInBits() > 128)
15205     V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
15206
15207   // Otherwise cast V to a vector with the same element type as VT, but
15208   // possibly narrower than VT. Then perform the broadcast.
15209   unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
15210   MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
15211   return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
15212 }
15213
15214 // Check for whether we can use INSERTPS to perform the shuffle. We only use
15215 // INSERTPS when the V1 elements are already in the correct locations
15216 // because otherwise we can just always use two SHUFPS instructions which
15217 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
15218 // perform INSERTPS if a single V1 element is out of place and all V2
15219 // elements are zeroable.
15220 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
15221                                    unsigned &InsertPSMask,
15222                                    const APInt &Zeroable,
15223                                    ArrayRef<int> Mask, SelectionDAG &DAG) {
15224   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
15225   assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
15226   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15227
15228   // Attempt to match INSERTPS with one element from VA or VB being
15229   // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
15230   // are updated.
15231   auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
15232                              ArrayRef<int> CandidateMask) {
15233     unsigned ZMask = 0;
15234     int VADstIndex = -1;
15235     int VBDstIndex = -1;
15236     bool VAUsedInPlace = false;
15237
15238     for (int i = 0; i < 4; ++i) {
15239       // Synthesize a zero mask from the zeroable elements (includes undefs).
15240       if (Zeroable[i]) {
15241         ZMask |= 1 << i;
15242         continue;
15243       }
15244
15245       // Flag if we use any VA inputs in place.
15246       if (i == CandidateMask[i]) {
15247         VAUsedInPlace = true;
15248         continue;
15249       }
15250
15251       // We can only insert a single non-zeroable element.
15252       if (VADstIndex >= 0 || VBDstIndex >= 0)
15253         return false;
15254
15255       if (CandidateMask[i] < 4) {
15256         // VA input out of place for insertion.
15257         VADstIndex = i;
15258       } else {
15259         // VB input for insertion.
15260         VBDstIndex = i;
15261       }
15262     }
15263
15264     // Don't bother if we have no (non-zeroable) element for insertion.
15265     if (VADstIndex < 0 && VBDstIndex < 0)
15266       return false;
15267
15268     // Determine element insertion src/dst indices. The src index is from the
15269     // start of the inserted vector, not the start of the concatenated vector.
15270     unsigned VBSrcIndex = 0;
15271     if (VADstIndex >= 0) {
15272       // If we have a VA input out of place, we use VA as the V2 element
15273       // insertion and don't use the original V2 at all.
15274       VBSrcIndex = CandidateMask[VADstIndex];
15275       VBDstIndex = VADstIndex;
15276       VB = VA;
15277     } else {
15278       VBSrcIndex = CandidateMask[VBDstIndex] - 4;
15279     }
15280
15281     // If no V1 inputs are used in place, then the result is created only from
15282     // the zero mask and the V2 insertion - so remove V1 dependency.
15283     if (!VAUsedInPlace)
15284       VA = DAG.getUNDEF(MVT::v4f32);
15285
15286     // Update V1, V2 and InsertPSMask accordingly.
15287     V1 = VA;
15288     V2 = VB;
15289
15290     // Insert the V2 element into the desired position.
15291     InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
15292     assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
15293     return true;
15294   };
15295
15296   if (matchAsInsertPS(V1, V2, Mask))
15297     return true;
15298
15299   // Commute and try again.
15300   SmallVector<int, 4> CommutedMask(Mask);
15301   ShuffleVectorSDNode::commuteMask(CommutedMask);
15302   if (matchAsInsertPS(V2, V1, CommutedMask))
15303     return true;
15304
15305   return false;
15306 }
15307
15308 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
15309                                       ArrayRef<int> Mask, const APInt &Zeroable,
15310                                       SelectionDAG &DAG) {
15311   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15312   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15313
15314   // Attempt to match the insertps pattern.
15315   unsigned InsertPSMask = 0;
15316   if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
15317     return SDValue();
15318
15319   // Insert the V2 element into the desired position.
15320   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
15321                      DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
15322 }
15323
15324 /// Handle lowering of 2-lane 64-bit floating point shuffles.
15325 ///
15326 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
15327 /// support for floating point shuffles but not integer shuffles. These
15328 /// instructions will incur a domain crossing penalty on some chips though so
15329 /// it is better to avoid lowering through this for integer vectors where
15330 /// possible.
15331 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15332                                  const APInt &Zeroable, SDValue V1, SDValue V2,
15333                                  const X86Subtarget &Subtarget,
15334                                  SelectionDAG &DAG) {
15335   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15336   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15337   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15338
15339   if (V2.isUndef()) {
15340     // Check for being able to broadcast a single element.
15341     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
15342                                                     Mask, Subtarget, DAG))
15343       return Broadcast;
15344
15345     // Straight shuffle of a single input vector. Simulate this by using the
15346     // single input as both of the "inputs" to this instruction..
15347     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
15348
15349     if (Subtarget.hasAVX()) {
15350       // If we have AVX, we can use VPERMILPS which will allow folding a load
15351       // into the shuffle.
15352       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
15353                          DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15354     }
15355
15356     return DAG.getNode(
15357         X86ISD::SHUFP, DL, MVT::v2f64,
15358         Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15359         Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15360         DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15361   }
15362   assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15363   assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15364   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15365   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15366
15367   if (Subtarget.hasAVX2())
15368     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15369       return Extract;
15370
15371   // When loading a scalar and then shuffling it into a vector we can often do
15372   // the insertion cheaply.
15373   if (SDValue Insertion = lowerShuffleAsElementInsertion(
15374           DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15375     return Insertion;
15376   // Try inverting the insertion since for v2 masks it is easy to do and we
15377   // can't reliably sort the mask one way or the other.
15378   int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
15379                         Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
15380   if (SDValue Insertion = lowerShuffleAsElementInsertion(
15381           DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15382     return Insertion;
15383
15384   // Try to use one of the special instruction patterns to handle two common
15385   // blend patterns if a zero-blend above didn't work.
15386   if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
15387       isShuffleEquivalent(Mask, {1, 3}, V1, V2))
15388     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
15389       // We can either use a special instruction to load over the low double or
15390       // to move just the low double.
15391       return DAG.getNode(
15392           X86ISD::MOVSD, DL, MVT::v2f64, V2,
15393           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
15394
15395   if (Subtarget.hasSSE41())
15396     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
15397                                             Zeroable, Subtarget, DAG))
15398       return Blend;
15399
15400   // Use dedicated unpack instructions for masks that match their pattern.
15401   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
15402     return V;
15403
15404   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
15405   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
15406                      DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15407 }
15408
15409 /// Handle lowering of 2-lane 64-bit integer shuffles.
15410 ///
15411 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
15412 /// the integer unit to minimize domain crossing penalties. However, for blends
15413 /// it falls back to the floating point shuffle operation with appropriate bit
15414 /// casting.
15415 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15416                                  const APInt &Zeroable, SDValue V1, SDValue V2,
15417                                  const X86Subtarget &Subtarget,
15418                                  SelectionDAG &DAG) {
15419   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15420   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15421   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15422
15423   if (V2.isUndef()) {
15424     // Check for being able to broadcast a single element.
15425     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
15426                                                     Mask, Subtarget, DAG))
15427       return Broadcast;
15428
15429     // Straight shuffle of a single input vector. For everything from SSE2
15430     // onward this has a single fast instruction with no scary immediates.
15431     // We have to map the mask as it is actually a v4i32 shuffle instruction.
15432     V1 = DAG.getBitcast(MVT::v4i32, V1);
15433     int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
15434                           Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
15435                           Mask[1] < 0 ? -1 : (Mask[1] * 2),
15436                           Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
15437     return DAG.getBitcast(
15438         MVT::v2i64,
15439         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15440                     getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
15441   }
15442   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
15443   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
15444   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15445   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15446
15447   if (Subtarget.hasAVX2())
15448     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15449       return Extract;
15450
15451   // Try to use shift instructions.
15452   if (SDValue Shift =
15453           lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
15454                               DAG, /*BitwiseOnly*/ false))
15455     return Shift;
15456
15457   // When loading a scalar and then shuffling it into a vector we can often do
15458   // the insertion cheaply.
15459   if (SDValue Insertion = lowerShuffleAsElementInsertion(
15460           DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15461     return Insertion;
15462   // Try inverting the insertion since for v2 masks it is easy to do and we
15463   // can't reliably sort the mask one way or the other.
15464   int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
15465   if (SDValue Insertion = lowerShuffleAsElementInsertion(
15466           DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15467     return Insertion;
15468
15469   // We have different paths for blend lowering, but they all must use the
15470   // *exact* same predicate.
15471   bool IsBlendSupported = Subtarget.hasSSE41();
15472   if (IsBlendSupported)
15473     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
15474                                             Zeroable, Subtarget, DAG))
15475       return Blend;
15476
15477   // Use dedicated unpack instructions for masks that match their pattern.
15478   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
15479     return V;
15480
15481   // Try to use byte rotation instructions.
15482   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
15483   if (Subtarget.hasSSSE3()) {
15484     if (Subtarget.hasVLX())
15485       if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
15486                                                 Subtarget, DAG))
15487         return Rotate;
15488
15489     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
15490                                                   Subtarget, DAG))
15491       return Rotate;
15492   }
15493
15494   // If we have direct support for blends, we should lower by decomposing into
15495   // a permute. That will be faster than the domain cross.
15496   if (IsBlendSupported)
15497     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
15498                                                 Subtarget, DAG);
15499
15500   // We implement this with SHUFPD which is pretty lame because it will likely
15501   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
15502   // However, all the alternatives are still more cycles and newer chips don't
15503   // have this problem. It would be really nice if x86 had better shuffles here.
15504   V1 = DAG.getBitcast(MVT::v2f64, V1);
15505   V2 = DAG.getBitcast(MVT::v2f64, V2);
15506   return DAG.getBitcast(MVT::v2i64,
15507                         DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
15508 }
15509
15510 /// Lower a vector shuffle using the SHUFPS instruction.
15511 ///
15512 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
15513 /// It makes no assumptions about whether this is the *best* lowering, it simply
15514 /// uses it.
15515 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
15516                                       ArrayRef<int> Mask, SDValue V1,
15517                                       SDValue V2, SelectionDAG &DAG) {
15518   SDValue LowV = V1, HighV = V2;
15519   SmallVector<int, 4> NewMask(Mask);
15520   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15521
15522   if (NumV2Elements == 1) {
15523     int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
15524
15525     // Compute the index adjacent to V2Index and in the same half by toggling
15526     // the low bit.
15527     int V2AdjIndex = V2Index ^ 1;
15528
15529     if (Mask[V2AdjIndex] < 0) {
15530       // Handles all the cases where we have a single V2 element and an undef.
15531       // This will only ever happen in the high lanes because we commute the
15532       // vector otherwise.
15533       if (V2Index < 2)
15534         std::swap(LowV, HighV);
15535       NewMask[V2Index] -= 4;
15536     } else {
15537       // Handle the case where the V2 element ends up adjacent to a V1 element.
15538       // To make this work, blend them together as the first step.
15539       int V1Index = V2AdjIndex;
15540       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
15541       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
15542                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15543
15544       // Now proceed to reconstruct the final blend as we have the necessary
15545       // high or low half formed.
15546       if (V2Index < 2) {
15547         LowV = V2;
15548         HighV = V1;
15549       } else {
15550         HighV = V2;
15551       }
15552       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
15553       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
15554     }
15555   } else if (NumV2Elements == 2) {
15556     if (Mask[0] < 4 && Mask[1] < 4) {
15557       // Handle the easy case where we have V1 in the low lanes and V2 in the
15558       // high lanes.
15559       NewMask[2] -= 4;
15560       NewMask[3] -= 4;
15561     } else if (Mask[2] < 4 && Mask[3] < 4) {
15562       // We also handle the reversed case because this utility may get called
15563       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
15564       // arrange things in the right direction.
15565       NewMask[0] -= 4;
15566       NewMask[1] -= 4;
15567       HighV = V1;
15568       LowV = V2;
15569     } else {
15570       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
15571       // trying to place elements directly, just blend them and set up the final
15572       // shuffle to place them.
15573
15574       // The first two blend mask elements are for V1, the second two are for
15575       // V2.
15576       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
15577                           Mask[2] < 4 ? Mask[2] : Mask[3],
15578                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
15579                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
15580       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15581                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15582
15583       // Now we do a normal shuffle of V1 by giving V1 as both operands to
15584       // a blend.
15585       LowV = HighV = V1;
15586       NewMask[0] = Mask[0] < 4 ? 0 : 2;
15587       NewMask[1] = Mask[0] < 4 ? 2 : 0;
15588       NewMask[2] = Mask[2] < 4 ? 1 : 3;
15589       NewMask[3] = Mask[2] < 4 ? 3 : 1;
15590     }
15591   } else if (NumV2Elements == 3) {
15592     // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
15593     // we can get here due to other paths (e.g repeated mask matching) that we
15594     // don't want to do another round of lowerVECTOR_SHUFFLE.
15595     ShuffleVectorSDNode::commuteMask(NewMask);
15596     return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
15597   }
15598   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
15599                      getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
15600 }
15601
15602 /// Lower 4-lane 32-bit floating point shuffles.
15603 ///
15604 /// Uses instructions exclusively from the floating point unit to minimize
15605 /// domain crossing penalties, as these are sufficient to implement all v4f32
15606 /// shuffles.
15607 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15608                                  const APInt &Zeroable, SDValue V1, SDValue V2,
15609                                  const X86Subtarget &Subtarget,
15610                                  SelectionDAG &DAG) {
15611   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15612   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15613   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15614
15615   if (Subtarget.hasSSE41())
15616     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
15617                                             Zeroable, Subtarget, DAG))
15618       return Blend;
15619
15620   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15621
15622   if (NumV2Elements == 0) {
15623     // Check for being able to broadcast a single element.
15624     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
15625                                                     Mask, Subtarget, DAG))
15626       return Broadcast;
15627
15628     // Use even/odd duplicate instructions for masks that match their pattern.
15629     if (Subtarget.hasSSE3()) {
15630       if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15631         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
15632       if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
15633         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
15634     }
15635
15636     if (Subtarget.hasAVX()) {
15637       // If we have AVX, we can use VPERMILPS which will allow folding a load
15638       // into the shuffle.
15639       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
15640                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15641     }
15642
15643     // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
15644     // in SSE1 because otherwise they are widened to v2f64 and never get here.
15645     if (!Subtarget.hasSSE2()) {
15646       if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
15647         return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
15648       if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
15649         return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
15650     }
15651
15652     // Otherwise, use a straight shuffle of a single input vector. We pass the
15653     // input vector to both operands to simulate this with a SHUFPS.
15654     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
15655                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15656   }
15657
15658   if (Subtarget.hasSSE2())
15659     if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
15660             DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
15661       ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
15662       return ZExt;
15663     }
15664
15665   if (Subtarget.hasAVX2())
15666     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15667       return Extract;
15668
15669   // There are special ways we can lower some single-element blends. However, we
15670   // have custom ways we can lower more complex single-element blends below that
15671   // we defer to if both this and BLENDPS fail to match, so restrict this to
15672   // when the V2 input is targeting element 0 of the mask -- that is the fast
15673   // case here.
15674   if (NumV2Elements == 1 && Mask[0] >= 4)
15675     if (SDValue V = lowerShuffleAsElementInsertion(
15676             DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15677       return V;
15678
15679   if (Subtarget.hasSSE41()) {
15680     // Use INSERTPS if we can complete the shuffle efficiently.
15681     if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
15682       return V;
15683
15684     if (!isSingleSHUFPSMask(Mask))
15685       if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
15686                                                             V2, Mask, DAG))
15687         return BlendPerm;
15688   }
15689
15690   // Use low/high mov instructions. These are only valid in SSE1 because
15691   // otherwise they are widened to v2f64 and never get here.
15692   if (!Subtarget.hasSSE2()) {
15693     if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
15694       return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
15695     if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
15696       return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
15697   }
15698
15699   // Use dedicated unpack instructions for masks that match their pattern.
15700   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
15701     return V;
15702
15703   // Otherwise fall back to a SHUFPS lowering strategy.
15704   return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
15705 }
15706
15707 /// Lower 4-lane i32 vector shuffles.
15708 ///
15709 /// We try to handle these with integer-domain shuffles where we can, but for
15710 /// blends we use the floating point domain blend instructions.
15711 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15712                                  const APInt &Zeroable, SDValue V1, SDValue V2,
15713                                  const X86Subtarget &Subtarget,
15714                                  SelectionDAG &DAG) {
15715   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15716   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15717   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15718
15719   // Whenever we can lower this as a zext, that instruction is strictly faster
15720   // than any alternative. It also allows us to fold memory operands into the
15721   // shuffle in many cases.
15722   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
15723                                                    Zeroable, Subtarget, DAG))
15724     return ZExt;
15725
15726   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15727
15728   // Try to use shift instructions if fast.
15729   if (Subtarget.preferLowerShuffleAsShift()) {
15730     if (SDValue Shift =
15731             lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
15732                                 Subtarget, DAG, /*BitwiseOnly*/ true))
15733       return Shift;
15734     if (NumV2Elements == 0)
15735       if (SDValue Rotate =
15736               lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
15737         return Rotate;
15738   }
15739
15740   if (NumV2Elements == 0) {
15741     // Try to use broadcast unless the mask only has one non-undef element.
15742     if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
15743       if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
15744                                                       Mask, Subtarget, DAG))
15745         return Broadcast;
15746     }
15747
15748     // Straight shuffle of a single input vector. For everything from SSE2
15749     // onward this has a single fast instruction with no scary immediates.
15750     // We coerce the shuffle pattern to be compatible with UNPCK instructions
15751     // but we aren't actually going to use the UNPCK instruction because doing
15752     // so prevents folding a load into this instruction or making a copy.
15753     const int UnpackLoMask[] = {0, 0, 1, 1};
15754     const int UnpackHiMask[] = {2, 2, 3, 3};
15755     if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
15756       Mask = UnpackLoMask;
15757     else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
15758       Mask = UnpackHiMask;
15759
15760     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15761                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15762   }
15763
15764   if (Subtarget.hasAVX2())
15765     if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15766       return Extract;
15767
15768   // Try to use shift instructions.
15769   if (SDValue Shift =
15770           lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
15771                               DAG, /*BitwiseOnly*/ false))
15772     return Shift;
15773
15774   // There are special ways we can lower some single-element blends.
15775   if (NumV2Elements == 1)
15776     if (SDValue V = lowerShuffleAsElementInsertion(
15777             DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15778       return V;
15779
15780   // We have different paths for blend lowering, but they all must use the
15781   // *exact* same predicate.
15782   bool IsBlendSupported = Subtarget.hasSSE41();
15783   if (IsBlendSupported)
15784     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
15785                                             Zeroable, Subtarget, DAG))
15786       return Blend;
15787
15788   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
15789                                              Zeroable, Subtarget, DAG))
15790     return Masked;
15791
15792   // Use dedicated unpack instructions for masks that match their pattern.
15793   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
15794     return V;
15795
15796   // Try to use byte rotation instructions.
15797   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
15798   if (Subtarget.hasSSSE3()) {
15799     if (Subtarget.hasVLX())
15800       if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
15801                                                 Subtarget, DAG))
15802         return Rotate;
15803
15804     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
15805                                                   Subtarget, DAG))
15806       return Rotate;
15807   }
15808
15809   // Assume that a single SHUFPS is faster than an alternative sequence of
15810   // multiple instructions (even if the CPU has a domain penalty).
15811   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15812   if (!isSingleSHUFPSMask(Mask)) {
15813     // If we have direct support for blends, we should lower by decomposing into
15814     // a permute. That will be faster than the domain cross.
15815     if (IsBlendSupported)
15816       return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
15817                                                   Subtarget, DAG);
15818
15819     // Try to lower by permuting the inputs into an unpack instruction.
15820     if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
15821                                                         Mask, Subtarget, DAG))
15822       return Unpack;
15823   }
15824
15825   // We implement this with SHUFPS because it can blend from two vectors.
15826   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
15827   // up the inputs, bypassing domain shift penalties that we would incur if we
15828   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
15829   // relevant.
15830   SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
15831   SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
15832   SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
15833   return DAG.getBitcast(MVT::v4i32, ShufPS);
15834 }
15835
15836 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
15837 /// shuffle lowering, and the most complex part.
15838 ///
15839 /// The lowering strategy is to try to form pairs of input lanes which are
15840 /// targeted at the same half of the final vector, and then use a dword shuffle
15841 /// to place them onto the right half, and finally unpack the paired lanes into
15842 /// their final position.
15843 ///
15844 /// The exact breakdown of how to form these dword pairs and align them on the
15845 /// correct sides is really tricky. See the comments within the function for
15846 /// more of the details.
15847 ///
15848 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
15849 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
15850 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
15851 /// vector, form the analogous 128-bit 8-element Mask.
15852 static SDValue lowerV8I16GeneralSingleInputShuffle(
15853     const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
15854     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15855   assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
15856   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
15857
15858   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
15859   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
15860   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
15861
15862   // Attempt to directly match PSHUFLW or PSHUFHW.
15863   if (isUndefOrInRange(LoMask, 0, 4) &&
15864       isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
15865     return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15866                        getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15867   }
15868   if (isUndefOrInRange(HiMask, 4, 8) &&
15869       isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
15870     for (int i = 0; i != 4; ++i)
15871       HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
15872     return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15873                        getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15874   }
15875
15876   SmallVector<int, 4> LoInputs;
15877   copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
15878   array_pod_sort(LoInputs.begin(), LoInputs.end());
15879   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
15880   SmallVector<int, 4> HiInputs;
15881   copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
15882   array_pod_sort(HiInputs.begin(), HiInputs.end());
15883   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
15884   int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
15885   int NumHToL = LoInputs.size() - NumLToL;
15886   int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
15887   int NumHToH = HiInputs.size() - NumLToH;
15888   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
15889   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
15890   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
15891   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
15892
15893   // If we are shuffling values from one half - check how many different DWORD
15894   // pairs we need to create. If only 1 or 2 then we can perform this as a
15895   // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
15896   auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
15897                                ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
15898     V = DAG.getNode(ShufWOp, DL, VT, V,
15899                     getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15900     V = DAG.getBitcast(PSHUFDVT, V);
15901     V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
15902                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
15903     return DAG.getBitcast(VT, V);
15904   };
15905
15906   if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
15907     int PSHUFDMask[4] = { -1, -1, -1, -1 };
15908     SmallVector<std::pair<int, int>, 4> DWordPairs;
15909     int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
15910
15911     // Collect the different DWORD pairs.
15912     for (int DWord = 0; DWord != 4; ++DWord) {
15913       int M0 = Mask[2 * DWord + 0];
15914       int M1 = Mask[2 * DWord + 1];
15915       M0 = (M0 >= 0 ? M0 % 4 : M0);
15916       M1 = (M1 >= 0 ? M1 % 4 : M1);
15917       if (M0 < 0 && M1 < 0)
15918         continue;
15919
15920       bool Match = false;
15921       for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
15922         auto &DWordPair = DWordPairs[j];
15923         if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
15924             (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
15925           DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
15926           DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
15927           PSHUFDMask[DWord] = DOffset + j;
15928           Match = true;
15929           break;
15930         }
15931       }
15932       if (!Match) {
15933         PSHUFDMask[DWord] = DOffset + DWordPairs.size();
15934         DWordPairs.push_back(std::make_pair(M0, M1));
15935       }
15936     }
15937
15938     if (DWordPairs.size() <= 2) {
15939       DWordPairs.resize(2, std::make_pair(-1, -1));
15940       int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
15941                               DWordPairs[1].first, DWordPairs[1].second};
15942       if ((NumHToL + NumHToH) == 0)
15943         return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
15944       if ((NumLToL + NumLToH) == 0)
15945         return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
15946     }
15947   }
15948
15949   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
15950   // such inputs we can swap two of the dwords across the half mark and end up
15951   // with <=2 inputs to each half in each half. Once there, we can fall through
15952   // to the generic code below. For example:
15953   //
15954   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15955   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
15956   //
15957   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
15958   // and an existing 2-into-2 on the other half. In this case we may have to
15959   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
15960   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
15961   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
15962   // because any other situation (including a 3-into-1 or 1-into-3 in the other
15963   // half than the one we target for fixing) will be fixed when we re-enter this
15964   // path. We will also combine away any sequence of PSHUFD instructions that
15965   // result into a single instruction. Here is an example of the tricky case:
15966   //
15967   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15968   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
15969   //
15970   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
15971   //
15972   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
15973   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
15974   //
15975   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
15976   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
15977   //
15978   // The result is fine to be handled by the generic logic.
15979   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
15980                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
15981                           int AOffset, int BOffset) {
15982     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
15983            "Must call this with A having 3 or 1 inputs from the A half.");
15984     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
15985            "Must call this with B having 1 or 3 inputs from the B half.");
15986     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
15987            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
15988
15989     bool ThreeAInputs = AToAInputs.size() == 3;
15990
15991     // Compute the index of dword with only one word among the three inputs in
15992     // a half by taking the sum of the half with three inputs and subtracting
15993     // the sum of the actual three inputs. The difference is the remaining
15994     // slot.
15995     int ADWord = 0, BDWord = 0;
15996     int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
15997     int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
15998     int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
15999     ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
16000     int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
16001     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
16002     int TripleNonInputIdx =
16003         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
16004     TripleDWord = TripleNonInputIdx / 2;
16005
16006     // We use xor with one to compute the adjacent DWord to whichever one the
16007     // OneInput is in.
16008     OneInputDWord = (OneInput / 2) ^ 1;
16009
16010     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
16011     // and BToA inputs. If there is also such a problem with the BToB and AToB
16012     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
16013     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
16014     // is essential that we don't *create* a 3<-1 as then we might oscillate.
16015     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
16016       // Compute how many inputs will be flipped by swapping these DWords. We
16017       // need
16018       // to balance this to ensure we don't form a 3-1 shuffle in the other
16019       // half.
16020       int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
16021                                  llvm::count(AToBInputs, 2 * ADWord + 1);
16022       int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
16023                                  llvm::count(BToBInputs, 2 * BDWord + 1);
16024       if ((NumFlippedAToBInputs == 1 &&
16025            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
16026           (NumFlippedBToBInputs == 1 &&
16027            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
16028         // We choose whether to fix the A half or B half based on whether that
16029         // half has zero flipped inputs. At zero, we may not be able to fix it
16030         // with that half. We also bias towards fixing the B half because that
16031         // will more commonly be the high half, and we have to bias one way.
16032         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
16033                                                        ArrayRef<int> Inputs) {
16034           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
16035           bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
16036           // Determine whether the free index is in the flipped dword or the
16037           // unflipped dword based on where the pinned index is. We use this bit
16038           // in an xor to conditionally select the adjacent dword.
16039           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
16040           bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
16041           if (IsFixIdxInput == IsFixFreeIdxInput)
16042             FixFreeIdx += 1;
16043           IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
16044           assert(IsFixIdxInput != IsFixFreeIdxInput &&
16045                  "We need to be changing the number of flipped inputs!");
16046           int PSHUFHalfMask[] = {0, 1, 2, 3};
16047           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
16048           V = DAG.getNode(
16049               FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
16050               MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
16051               getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
16052
16053           for (int &M : Mask)
16054             if (M >= 0 && M == FixIdx)
16055               M = FixFreeIdx;
16056             else if (M >= 0 && M == FixFreeIdx)
16057               M = FixIdx;
16058         };
16059         if (NumFlippedBToBInputs != 0) {
16060           int BPinnedIdx =
16061               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
16062           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
16063         } else {
16064           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
16065           int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
16066           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
16067         }
16068       }
16069     }
16070
16071     int PSHUFDMask[] = {0, 1, 2, 3};
16072     PSHUFDMask[ADWord] = BDWord;
16073     PSHUFDMask[BDWord] = ADWord;
16074     V = DAG.getBitcast(
16075         VT,
16076         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
16077                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16078
16079     // Adjust the mask to match the new locations of A and B.
16080     for (int &M : Mask)
16081       if (M >= 0 && M/2 == ADWord)
16082         M = 2 * BDWord + M % 2;
16083       else if (M >= 0 && M/2 == BDWord)
16084         M = 2 * ADWord + M % 2;
16085
16086     // Recurse back into this routine to re-compute state now that this isn't
16087     // a 3 and 1 problem.
16088     return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
16089   };
16090   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
16091     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
16092   if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
16093     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
16094
16095   // At this point there are at most two inputs to the low and high halves from
16096   // each half. That means the inputs can always be grouped into dwords and
16097   // those dwords can then be moved to the correct half with a dword shuffle.
16098   // We use at most one low and one high word shuffle to collect these paired
16099   // inputs into dwords, and finally a dword shuffle to place them.
16100   int PSHUFLMask[4] = {-1, -1, -1, -1};
16101   int PSHUFHMask[4] = {-1, -1, -1, -1};
16102   int PSHUFDMask[4] = {-1, -1, -1, -1};
16103
16104   // First fix the masks for all the inputs that are staying in their
16105   // original halves. This will then dictate the targets of the cross-half
16106   // shuffles.
16107   auto fixInPlaceInputs =
16108       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
16109                     MutableArrayRef<int> SourceHalfMask,
16110                     MutableArrayRef<int> HalfMask, int HalfOffset) {
16111     if (InPlaceInputs.empty())
16112       return;
16113     if (InPlaceInputs.size() == 1) {
16114       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
16115           InPlaceInputs[0] - HalfOffset;
16116       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
16117       return;
16118     }
16119     if (IncomingInputs.empty()) {
16120       // Just fix all of the in place inputs.
16121       for (int Input : InPlaceInputs) {
16122         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
16123         PSHUFDMask[Input / 2] = Input / 2;
16124       }
16125       return;
16126     }
16127
16128     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
16129     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
16130         InPlaceInputs[0] - HalfOffset;
16131     // Put the second input next to the first so that they are packed into
16132     // a dword. We find the adjacent index by toggling the low bit.
16133     int AdjIndex = InPlaceInputs[0] ^ 1;
16134     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
16135     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
16136     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
16137   };
16138   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
16139   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
16140
16141   // Now gather the cross-half inputs and place them into a free dword of
16142   // their target half.
16143   // FIXME: This operation could almost certainly be simplified dramatically to
16144   // look more like the 3-1 fixing operation.
16145   auto moveInputsToRightHalf = [&PSHUFDMask](
16146       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
16147       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
16148       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
16149       int DestOffset) {
16150     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
16151       return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
16152     };
16153     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
16154                                                int Word) {
16155       int LowWord = Word & ~1;
16156       int HighWord = Word | 1;
16157       return isWordClobbered(SourceHalfMask, LowWord) ||
16158              isWordClobbered(SourceHalfMask, HighWord);
16159     };
16160
16161     if (IncomingInputs.empty())
16162       return;
16163
16164     if (ExistingInputs.empty()) {
16165       // Map any dwords with inputs from them into the right half.
16166       for (int Input : IncomingInputs) {
16167         // If the source half mask maps over the inputs, turn those into
16168         // swaps and use the swapped lane.
16169         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
16170           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
16171             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
16172                 Input - SourceOffset;
16173             // We have to swap the uses in our half mask in one sweep.
16174             for (int &M : HalfMask)
16175               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
16176                 M = Input;
16177               else if (M == Input)
16178                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16179           } else {
16180             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
16181                        Input - SourceOffset &&
16182                    "Previous placement doesn't match!");
16183           }
16184           // Note that this correctly re-maps both when we do a swap and when
16185           // we observe the other side of the swap above. We rely on that to
16186           // avoid swapping the members of the input list directly.
16187           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16188         }
16189
16190         // Map the input's dword into the correct half.
16191         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
16192           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
16193         else
16194           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
16195                      Input / 2 &&
16196                  "Previous placement doesn't match!");
16197       }
16198
16199       // And just directly shift any other-half mask elements to be same-half
16200       // as we will have mirrored the dword containing the element into the
16201       // same position within that half.
16202       for (int &M : HalfMask)
16203         if (M >= SourceOffset && M < SourceOffset + 4) {
16204           M = M - SourceOffset + DestOffset;
16205           assert(M >= 0 && "This should never wrap below zero!");
16206         }
16207       return;
16208     }
16209
16210     // Ensure we have the input in a viable dword of its current half. This
16211     // is particularly tricky because the original position may be clobbered
16212     // by inputs being moved and *staying* in that half.
16213     if (IncomingInputs.size() == 1) {
16214       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16215         int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
16216                          SourceOffset;
16217         SourceHalfMask[InputFixed - SourceOffset] =
16218             IncomingInputs[0] - SourceOffset;
16219         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
16220                      InputFixed);
16221         IncomingInputs[0] = InputFixed;
16222       }
16223     } else if (IncomingInputs.size() == 2) {
16224       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
16225           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16226         // We have two non-adjacent or clobbered inputs we need to extract from
16227         // the source half. To do this, we need to map them into some adjacent
16228         // dword slot in the source mask.
16229         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
16230                               IncomingInputs[1] - SourceOffset};
16231
16232         // If there is a free slot in the source half mask adjacent to one of
16233         // the inputs, place the other input in it. We use (Index XOR 1) to
16234         // compute an adjacent index.
16235         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
16236             SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
16237           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
16238           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16239           InputsFixed[1] = InputsFixed[0] ^ 1;
16240         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
16241                    SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
16242           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
16243           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
16244           InputsFixed[0] = InputsFixed[1] ^ 1;
16245         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
16246                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
16247           // The two inputs are in the same DWord but it is clobbered and the
16248           // adjacent DWord isn't used at all. Move both inputs to the free
16249           // slot.
16250           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
16251           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
16252           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
16253           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
16254         } else {
16255           // The only way we hit this point is if there is no clobbering
16256           // (because there are no off-half inputs to this half) and there is no
16257           // free slot adjacent to one of the inputs. In this case, we have to
16258           // swap an input with a non-input.
16259           for (int i = 0; i < 4; ++i)
16260             assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
16261                    "We can't handle any clobbers here!");
16262           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
16263                  "Cannot have adjacent inputs here!");
16264
16265           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16266           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
16267
16268           // We also have to update the final source mask in this case because
16269           // it may need to undo the above swap.
16270           for (int &M : FinalSourceHalfMask)
16271             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
16272               M = InputsFixed[1] + SourceOffset;
16273             else if (M == InputsFixed[1] + SourceOffset)
16274               M = (InputsFixed[0] ^ 1) + SourceOffset;
16275
16276           InputsFixed[1] = InputsFixed[0] ^ 1;
16277         }
16278
16279         // Point everything at the fixed inputs.
16280         for (int &M : HalfMask)
16281           if (M == IncomingInputs[0])
16282             M = InputsFixed[0] + SourceOffset;
16283           else if (M == IncomingInputs[1])
16284             M = InputsFixed[1] + SourceOffset;
16285
16286         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
16287         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
16288       }
16289     } else {
16290       llvm_unreachable("Unhandled input size!");
16291     }
16292
16293     // Now hoist the DWord down to the right half.
16294     int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
16295     assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
16296     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
16297     for (int &M : HalfMask)
16298       for (int Input : IncomingInputs)
16299         if (M == Input)
16300           M = FreeDWord * 2 + Input % 2;
16301   };
16302   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
16303                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
16304   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
16305                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
16306
16307   // Now enact all the shuffles we've computed to move the inputs into their
16308   // target half.
16309   if (!isNoopShuffleMask(PSHUFLMask))
16310     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16311                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
16312   if (!isNoopShuffleMask(PSHUFHMask))
16313     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16314                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
16315   if (!isNoopShuffleMask(PSHUFDMask))
16316     V = DAG.getBitcast(
16317         VT,
16318         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
16319                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16320
16321   // At this point, each half should contain all its inputs, and we can then
16322   // just shuffle them into their final position.
16323   assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
16324          "Failed to lift all the high half inputs to the low mask!");
16325   assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
16326          "Failed to lift all the low half inputs to the high mask!");
16327
16328   // Do a half shuffle for the low mask.
16329   if (!isNoopShuffleMask(LoMask))
16330     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16331                     getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
16332
16333   // Do a half shuffle with the high mask after shifting its values down.
16334   for (int &M : HiMask)
16335     if (M >= 0)
16336       M -= 4;
16337   if (!isNoopShuffleMask(HiMask))
16338     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16339                     getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
16340
16341   return V;
16342 }
16343
16344 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
16345 /// blend if only one input is used.
16346 static SDValue lowerShuffleAsBlendOfPSHUFBs(
16347     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16348     const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
16349   assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
16350          "Lane crossing shuffle masks not supported");
16351
16352   int NumBytes = VT.getSizeInBits() / 8;
16353   int Size = Mask.size();
16354   int Scale = NumBytes / Size;
16355
16356   SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16357   SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16358   V1InUse = false;
16359   V2InUse = false;
16360
16361   for (int i = 0; i < NumBytes; ++i) {
16362     int M = Mask[i / Scale];
16363     if (M < 0)
16364       continue;
16365
16366     const int ZeroMask = 0x80;
16367     int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
16368     int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
16369     if (Zeroable[i / Scale])
16370       V1Idx = V2Idx = ZeroMask;
16371
16372     V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
16373     V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
16374     V1InUse |= (ZeroMask != V1Idx);
16375     V2InUse |= (ZeroMask != V2Idx);
16376   }
16377
16378   MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
16379   if (V1InUse)
16380     V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
16381                      DAG.getBuildVector(ShufVT, DL, V1Mask));
16382   if (V2InUse)
16383     V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
16384                      DAG.getBuildVector(ShufVT, DL, V2Mask));
16385
16386   // If we need shuffled inputs from both, blend the two.
16387   SDValue V;
16388   if (V1InUse && V2InUse)
16389     V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
16390   else
16391     V = V1InUse ? V1 : V2;
16392
16393   // Cast the result back to the correct type.
16394   return DAG.getBitcast(VT, V);
16395 }
16396
16397 /// Generic lowering of 8-lane i16 shuffles.
16398 ///
16399 /// This handles both single-input shuffles and combined shuffle/blends with
16400 /// two inputs. The single input shuffles are immediately delegated to
16401 /// a dedicated lowering routine.
16402 ///
16403 /// The blends are lowered in one of three fundamental ways. If there are few
16404 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
16405 /// of the input is significantly cheaper when lowered as an interleaving of
16406 /// the two inputs, try to interleave them. Otherwise, blend the low and high
16407 /// halves of the inputs separately (making them have relatively few inputs)
16408 /// and then concatenate them.
      ///
      /// NOTE(review): the body below tries more strategies than the three
      /// described above; treat the code as the authoritative list. Strategies are
      /// attempted in the order written and the first one that yields a value is
      /// returned, so the ordering is significant.
      ///
      /// \param DL        Location passed to every node and helper used here.
      /// \param Mask      8-entry shuffle mask (asserted). Entries >= 8 are counted
      ///                  as V2 inputs; negative entries are presumably undef lanes
      ///                  (confirm against the shuffle mask convention).
      /// \param Zeroable  Forwarded unchanged to the helpers that accept it.
      /// \param V1        First operand; must be v8i16 (asserted).
      /// \param V2        Second operand; must be v8i16 (asserted).
      /// \param Subtarget Feature queries used to gate individual strategies.
      /// \param DAG       DAG in which the replacement nodes are built.
      /// \returns The value produced by the first successful strategy, or whatever
      ///          the final decomposed-merge fallback returns.
16409 static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16410                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16411                                  const X86Subtarget &Subtarget,
16412                                  SelectionDAG &DAG) {
16413   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16414   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16415   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16416
16417   // Whenever we can lower this as a zext, that instruction is strictly faster
16418   // than any alternative.
16419   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
16420                                                    Zeroable, Subtarget, DAG))
16421     return ZExt;
16422
16423   // Try to lower using a truncation.
16424   if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16425                                         Subtarget, DAG))
16426     return V;
16427
        // Number of mask entries that reference the second operand.
16428   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
16429
        // No mask entry references V2: handle this as a single-input shuffle of V1.
        // This branch always returns.
16430   if (NumV2Inputs == 0) {
16431     // Try to use shift instructions.
          // Note that V1 is passed for both operands here.
16432     if (SDValue Shift =
16433             lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
16434                                 Subtarget, DAG, /*BitwiseOnly*/ false))
16435       return Shift;
16436
16437     // Check for being able to broadcast a single element.
16438     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
16439                                                     Mask, Subtarget, DAG))
16440       return Broadcast;
16441
16442     // Try to use bit rotation instructions.
16443     if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
16444                                                  Subtarget, DAG))
16445       return Rotate;
16446
16447     // Use dedicated unpack instructions for masks that match their pattern.
16448     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16449       return V;
16450
16451     // Use dedicated pack instructions for masks that match their pattern.
16452     if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16453                                          Subtarget))
16454       return V;
16455
16456     // Try to use byte rotation instructions.
          // As with the shift above, V1 is passed for both operands.
16457     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
16458                                                   Subtarget, DAG))
16459       return Rotate;
16460
16461     // Make a copy of the mask so it can be modified.
16462     SmallVector<int, 8> MutableMask(Mask);
16463     return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
16464                                                Subtarget, DAG);
16465   }
16466
        // From here on at least one entry references V2; the assert checks that at
        // least one entry also references V1.
16467   assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
16468          "All single-input shuffles should be canonicalized to be V1-input "
16469          "shuffles.");
16470
16471   // Try to use shift instructions.
16472   if (SDValue Shift =
16473           lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
16474                               DAG, /*BitwiseOnly*/ false))
16475     return Shift;
16476
16477   // See if we can use SSE4A Extraction / Insertion.
16478   if (Subtarget.hasSSE4A())
16479     if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
16480                                           Zeroable, DAG))
16481       return V;
16482
16483   // There are special ways we can lower some single-element blends.
16484   if (NumV2Inputs == 1)
16485     if (SDValue V = lowerShuffleAsElementInsertion(
16486             DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16487       return V;
16488
16489   // We have different paths for blend lowering, but they all must use the
16490   // *exact* same predicate.
        // IsBlendSupported is consulted again near the end of this function when
        // deciding whether to use the PSHUFB-based blend.
16491   bool IsBlendSupported = Subtarget.hasSSE41();
16492   if (IsBlendSupported)
16493     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
16494                                             Zeroable, Subtarget, DAG))
16495       return Blend;
16496
        // Try to lower as a bit-mask operation.
16497   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
16498                                              Zeroable, Subtarget, DAG))
16499     return Masked;
16500
16501   // Use dedicated unpack instructions for masks that match their pattern.
16502   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16503     return V;
16504
16505   // Use dedicated pack instructions for masks that match their pattern.
16506   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16507                                        Subtarget))
16508     return V;
16509
16510   // Try to lower using a truncation.
16511   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16512                                        Subtarget, DAG))
16513     return V;
16514
16515   // Try to use byte rotation instructions.
16516   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
16517                                                 Subtarget, DAG))
16518     return Rotate;
16519
        // Try to lower as a bit blend of the two inputs.
16520   if (SDValue BitBlend =
16521           lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
16522     return BitBlend;
16523
16524   // Try to use byte shift instructions to mask.
16525   if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
16526                                               Zeroable, Subtarget, DAG))
16527     return V;
16528
16529   // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
        // A single drop is always considered; two drops only with SSE4.1. The whole
        // block is skipped when the subtarget has VLX.
16530   int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
16531   if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
16532       !Subtarget.hasVLX()) {
16533     // Check if this is part of a 256-bit vector truncation.
          // PackOpc stays 0 when none of the three cases below applies (SSSE3
          // without SSE4.1); in that case we fall through to the later strategies.
16534     unsigned PackOpc = 0;
16535     if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
16536         peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16537         peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
            // Concatenate both inputs, blend with zero at 256 bits, then split the
            // result back into two 128-bit halves viewed as v8i32.
            // NOTE(review): 0xEE (0b11101110) presumably keeps only i16 lanes 0 and
            // 4 of each 8-lane group and zeroes the rest - confirm against the
            // BLENDI immediate semantics.
16538       SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
16539       V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
16540                          getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
16541                          DAG.getTargetConstant(0xEE, DL, MVT::i8));
16542       V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
16543       V1 = extract128BitVector(V1V2, 0, DAG, DL);
16544       V2 = extract128BitVector(V1V2, 4, DAG, DL);
16545       PackOpc = X86ISD::PACKUS;
16546     } else if (Subtarget.hasSSE41()) {
            // Build a v4i32 AND mask. Dwords at a stride of 1 << (NumEvenDrops - 1)
            // keep their low 16 bits (0xFFFF); all other dwords are cleared. With
            // one drop that is every dword; with two drops only dwords 0 and 2.
16547       SmallVector<SDValue, 4> DWordClearOps(4,
16548                                             DAG.getConstant(0, DL, MVT::i32));
16549       for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
16550         DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
16551       SDValue DWordClearMask =
16552           DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
16553       V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
16554                        DWordClearMask);
16555       V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
16556                        DWordClearMask);
16557       PackOpc = X86ISD::PACKUS;
16558     } else if (!Subtarget.hasSSSE3()) {
            // Without SSE4.1 (and without SSSE3): shift each i32 left by 16 and then
            // arithmetic-shift it right by 16 before packing with PACKSS.
            // NOTE(review): presumably this sign-extends the low i16 so PACKSS's
            // signed saturation leaves it unchanged - confirm.
16559       SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
16560       V1 = DAG.getBitcast(MVT::v4i32, V1);
16561       V2 = DAG.getBitcast(MVT::v4i32, V2);
16562       V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
16563       V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
16564       V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
16565       V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
16566       PackOpc = X86ISD::PACKSS;
16567     }
16568     if (PackOpc) {
16569       // Now pack things back together.
            // A second drop level packs the first result with itself once more.
16570       SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
16571       if (NumEvenDrops == 2) {
16572         Result = DAG.getBitcast(MVT::v4i32, Result);
16573         Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
16574       }
16575       return Result;
16576     }
16577   }
16578
16579   // When compacting odd (upper) elements, use PACKSS pre-SSE41.
        // Each i32 is shifted right by 16 first: VSRLI + PACKUS with SSE4.1,
        // VSRAI + PACKSS without it.
16580   int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
16581   if (NumOddDrops == 1) {
16582     bool HasSSE41 = Subtarget.hasSSE41();
16583     V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16584                      DAG.getBitcast(MVT::v4i32, V1),
16585                      DAG.getTargetConstant(16, DL, MVT::i8));
16586     V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16587                      DAG.getBitcast(MVT::v4i32, V2),
16588                      DAG.getTargetConstant(16, DL, MVT::i8));
16589     return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
16590                        MVT::v8i16, V1, V2);
16591   }
16592
16593   // Try to lower by permuting the inputs into an unpack instruction.
16594   if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
16595                                                       Mask, Subtarget, DAG))
16596     return Unpack;
16597
16598   // If we can't directly blend but can use PSHUFB, that will be better as it
16599   // can both shuffle and set up the inefficient blend.
16600   if (!IsBlendSupported && Subtarget.hasSSSE3()) {
          // The in-use flags are passed because the helper takes them; their values
          // are not consulted here.
16601     bool V1InUse, V2InUse;
16602     return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
16603                                         Zeroable, DAG, V1InUse, V2InUse);
16604   }
16605
16606   // We can always bit-blend if we have to so the fallback strategy is to
16607   // decompose into single-input permutes and blends/unpacks.
16608   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
16609                                               Mask, Subtarget, DAG);
16610 }
16611
16612 /// Lower 8-lane 16-bit floating point shuffles.
16613 static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16614                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16615                                  const X86Subtarget &Subtarget,
16616                                  SelectionDAG &DAG) {
16617   assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16618   assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16619   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16620   int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16621
16622   if (Subtarget.hasFP16()) {
16623     if (NumV2Elements == 0) {
16624       // Check for being able to broadcast a single element.
16625       if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
16626                                                       Mask, Subtarget, DAG))
16627         return Broadcast;
16628     }
16629     if (NumV2Elements == 1 && Mask[0] >= 8)
16630       if (SDValue V = lowerShuffleAsElementInsertion(
16631               DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16632         return V;
16633   }
16634
16635   V1 = DAG.getBitcast(MVT::v8i16, V1);
16636   V2 = DAG.getBitcast(MVT::v8i16, V2);
16637   return DAG.getBitcast(MVT::v8f16,
16638                         DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
16639 }
16640
16641 // Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
16642 // sub-512-bit shuffles are padded to 512-bits for the shuffle and then
16643 // the active subvector is extracted.
16644 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
16645                                      ArrayRef<int> Mask, SDValue V1, SDValue V2,
16646                                      const X86Subtarget &Subtarget,
16647                                      SelectionDAG &DAG) {
16648   MVT MaskVT = VT.changeTypeToInteger();
16649   SDValue MaskNode;
16650   MVT ShuffleVT = VT;
16651   if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
16652     V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
16653     V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
16654     ShuffleVT = V1.getSimpleValueType();
16655
16656     // Adjust mask to correct indices for the second input.
16657     int NumElts = VT.getVectorNumElements();
16658     unsigned Scale = 512 / VT.getSizeInBits();
16659     SmallVector<int, 32> AdjustedMask(Mask);
16660     for (int &M : AdjustedMask)
16661       if (NumElts <= M)
16662         M += (Scale - 1) * NumElts;
16663     MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
16664     MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
16665   } else {
16666     MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
16667   }
16668
16669   SDValue Result;
16670   if (V2.isUndef())
16671     Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
16672   else
16673     Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
16674
16675   if (VT != ShuffleVT)
16676     Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
16677
16678   return Result;
16679 }
16680
16681 /// Generic lowering of v16i8 shuffles.
16682 ///
16683 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
16684 /// detect any complexity reducing interleaving. If that doesn't help, it uses
16685 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
16686 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
16687 /// back together.
      ///
      /// Strategies are attempted in the order written below and the first one that
      /// yields a value is returned, so the ordering is significant.
      ///
      /// \param DL        Location passed to every node and helper used here.
      /// \param Mask      16-entry shuffle mask (asserted). Entries >= 16 are counted
      ///                  as V2 elements; negative entries are presumably undef
      ///                  lanes (confirm against the shuffle mask convention).
      /// \param Zeroable  Forwarded unchanged to the helpers that accept it.
      /// \param V1        First operand; must be v16i8 (asserted).
      /// \param V2        Second operand; must be v16i8 (asserted).
      /// \param Subtarget Feature queries used to gate individual strategies.
      /// \param DAG       DAG in which the replacement nodes are built.
      /// \returns The value produced by the first successful strategy, or the
      ///          result of the final fallback.
16688 static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16689                                  const APInt &Zeroable, SDValue V1, SDValue V2,
16690                                  const X86Subtarget &Subtarget,
16691                                  SelectionDAG &DAG) {
16692   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16693   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16694   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16695
16696   // Try to use shift instructions.
16697   if (SDValue Shift =
16698           lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
16699                               DAG, /*BitwiseOnly*/ false))
16700     return Shift;
16701
16702   // Try to use byte rotation instructions.
16703   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
16704                                                 Subtarget, DAG))
16705     return Rotate;
16706
16707   // Use dedicated pack instructions for masks that match their pattern.
16708   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
16709                                        Subtarget))
16710     return V;
16711
16712   // Try to use a zext lowering.
16713   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
16714                                                    Zeroable, Subtarget, DAG))
16715     return ZExt;
16716
16717   // Try to lower using a truncation.
16718   if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16719                                         Subtarget, DAG))
16720     return V;
16721
        // Try the VTRUNC-based lowering helper as well.
16722   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16723                                        Subtarget, DAG))
16724     return V;
16725
16726   // See if we can use SSE4A Extraction / Insertion.
16727   if (Subtarget.hasSSE4A())
16728     if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
16729                                           Zeroable, DAG))
16730       return V;
16731
        // Number of mask entries that reference the second operand.
16732   int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16733
16734   // For single-input shuffles, there are some nicer lowering tricks we can use.
16735   if (NumV2Elements == 0) {
16736     // Check for being able to broadcast a single element.
16737     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
16738                                                     Mask, Subtarget, DAG))
16739       return Broadcast;
16740
16741     // Try to use bit rotation instructions.
16742     if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
16743                                                  Subtarget, DAG))
16744       return Rotate;
16745
          // Use dedicated unpack instructions for masks that match their pattern.
16746     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16747       return V;
16748
16749     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
16750     // Notably, this handles splat and partial-splat shuffles more efficiently.
16751     // However, it only makes sense if the pre-duplication shuffle simplifies
16752     // things significantly. Currently, this means we need to be able to
16753     // express the pre-duplication shuffle as an i16 shuffle.
16754     //
16755     // FIXME: We should check for other patterns which can be widened into an
16756     // i16 shuffle as well.
          //
          // canWidenViaDuplication returns true when, for every adjacent byte pair
          // (2k, 2k+1), the two mask entries are equal whenever both are
          // non-negative.
16757     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
16758       for (int i = 0; i < 16; i += 2)
16759         if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
16760           return false;
16761
16762       return true;
16763     };
          // Attempts the duplication-based widening and returns an empty SDValue if
          // it does not apply. V1 is captured by reference and reassigned inside;
          // both bail-outs happen before the first reassignment.
16764     auto tryToWidenViaDuplication = [&]() -> SDValue {
16765       if (!canWidenViaDuplication(Mask))
16766         return SDValue();
            // Collect the sorted, de-duplicated byte indices used from the low
            // (0-7) and high (8 and up) halves of the input.
16767       SmallVector<int, 4> LoInputs;
16768       copy_if(Mask, std::back_inserter(LoInputs),
16769               [](int M) { return M >= 0 && M < 8; });
16770       array_pod_sort(LoInputs.begin(), LoInputs.end());
16771       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
16772                      LoInputs.end());
16773       SmallVector<int, 4> HiInputs;
16774       copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
16775       array_pod_sort(HiInputs.begin(), HiInputs.end());
16776       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
16777                      HiInputs.end());
16778
            // Target the half that already holds at least as many distinct inputs;
            // its inputs stay in place and the other half's inputs are moved in.
16779       bool TargetLo = LoInputs.size() >= HiInputs.size();
16780       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
16781       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
16782
            // PreDupI16Shuffle is the i16 shuffle applied before the unpack;
            // LaneMap records the byte position each used input byte ends up at.
16783       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
16784       SmallDenseMap<int, int, 8> LaneMap;
16785       for (int I : InPlaceInputs) {
16786         PreDupI16Shuffle[I/2] = I/2;
16787         LaneMap[I] = I;
16788       }
            // [j, je) is the range of i16 slots of the targeted half.
16789       int j = TargetLo ? 0 : 4, je = j + 4;
16790       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
16791         // Check if j is already a shuffle of this input. This happens when
16792         // there are two adjacent bytes after we move the low one.
16793         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
16794           // If we haven't yet mapped the input, search for a slot into which
16795           // we can map it.
16796           while (j < je && PreDupI16Shuffle[j] >= 0)
16797             ++j;
16798
16799           if (j == je)
16800             // The inputs don't fit in one half via a simple i16 shuffle; bail.
16801             return SDValue();
16802
16803           // Map this input with the i16 shuffle.
16804           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
16805         }
16806
16807         // Update the lane map based on the mapping we ended up with.
16808         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
16809       }
16810       V1 = DAG.getBitcast(
16811           MVT::v16i8,
16812           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16813                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
16814
16815       // Unpack the bytes to form the i16s that will be shuffled into place.
            // An operand of the unpack is left undef when no even (resp. odd) mask
            // position is used.
16816       bool EvenInUse = false, OddInUse = false;
16817       for (int i = 0; i < 16; i += 2) {
16818         EvenInUse |= (Mask[i + 0] >= 0);
16819         OddInUse |= (Mask[i + 1] >= 0);
16820         if (EvenInUse && OddInUse)
16821           break;
16822       }
16823       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
16824                        MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
16825                        OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
16826
            // Build the final i16 shuffle: each output i16 lane takes the mapped
            // position of whichever of its two bytes is defined, rebased by 8 when
            // the high half was targeted.
16827       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
16828       for (int i = 0; i < 16; ++i)
16829         if (Mask[i] >= 0) {
16830           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
16831           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
16832           if (PostDupI16Shuffle[i / 2] < 0)
16833             PostDupI16Shuffle[i / 2] = MappedMask;
16834           else
16835             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
16836                    "Conflicting entries in the original shuffle!");
16837         }
16838       return DAG.getBitcast(
16839           MVT::v16i8,
16840           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16841                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
16842     };
16843     if (SDValue V = tryToWidenViaDuplication())
16844       return V;
16845   }
16846
        // Try to lower as a bit-mask operation.
16847   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
16848                                              Zeroable, Subtarget, DAG))
16849     return Masked;
16850
16851   // Use dedicated unpack instructions for masks that match their pattern.
16852   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16853     return V;
16854
16855   // Try to use byte shift instructions to mask.
16856   if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
16857                                               Zeroable, Subtarget, DAG))
16858     return V;
16859
16860   // Check for compaction patterns.
        // NumEvenDrops is computed here because the PSHUFB gate below depends on
        // it; it is used again by the PACKUS compaction path further down.
16861   bool IsSingleInput = V2.isUndef();
16862   int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
16863
16864   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
16865   // with PSHUFB. It is important to do this before we attempt to generate any
16866   // blends but after all of the single-input lowerings. If the single input
16867   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
16868   // want to preserve that and we can DAG combine any longer sequences into
16869   // a PSHUFB in the end. But once we start blending from multiple inputs,
16870   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
16871   // and there are *very* few patterns that would actually be faster than the
16872   // PSHUFB approach because of its ability to zero lanes.
16873   //
16874   // If the mask is a binary compaction, we can more efficiently perform this
16875   // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
16876   //
16877   // FIXME: The only exceptions to the above are blends which are exact
16878   // interleavings with direct instructions supporting them. We currently don't
16879   // handle those well here.
16880   if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
16881     bool V1InUse = false;
16882     bool V2InUse = false;
16883
          // The PSHUFB-based result is built up front; the in-use flags passed to
          // the helper decide whether the two-input alternatives below are tried
          // before returning it.
16884     SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
16885         DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
16886
16887     // If both V1 and V2 are in use and we can use a direct blend or an unpack,
16888     // do so. This avoids using them to handle blends-with-zero which is
16889     // important as a single pshufb is significantly faster for that.
16890     if (V1InUse && V2InUse) {
16891       if (Subtarget.hasSSE41())
16892         if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
16893                                                 Zeroable, Subtarget, DAG))
16894           return Blend;
16895
16896       // We can use an unpack to do the blending rather than an or in some
16897       // cases. Even though the or may be (very minorly) more efficient, we
16898       // prefer this lowering because there are common cases where part of
16899       // the complexity of the shuffles goes away when we do the final blend as
16900       // an unpack.
16901       // FIXME: It might be worth trying to detect if the unpack-feeding
16902       // shuffles will both be pshufb, in which case we shouldn't bother with
16903       // this.
16904       if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
16905               DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16906         return Unpack;
16907
16908       // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16909       if (Subtarget.hasVBMI())
16910         return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
16911                                      DAG);
16912
16913       // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
16914       if (Subtarget.hasXOP()) {
16915         SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
16916         return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
16917       }
16918
16919       // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16920       // PALIGNR will be cheaper than the second PSHUFB+OR.
16921       if (SDValue V = lowerShuffleAsByteRotateAndPermute(
16922               DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16923         return V;
16924     }
16925
16926     return PSHUFB;
16927   }
16928
16929   // There are special ways we can lower some single-element blends.
16930   if (NumV2Elements == 1)
16931     if (SDValue V = lowerShuffleAsElementInsertion(
16932             DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16933       return V;
16934
        // Try to lower as a bit blend of the two inputs.
16935   if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
16936     return Blend;
16937
16938   // Check whether a compaction lowering can be done. This handles shuffles
16939   // which take every Nth element for some even N. See the helper function for
16940   // details.
16941   //
16942   // We special case these as they can be particularly efficiently handled with
16943   // the PACKUSB instruction on x86 and they show up in common patterns of
16944   // rearranging bytes to truncate wide elements.
16945   if (NumEvenDrops) {
16946     // NumEvenDrops is the power of two stride of the elements. Another way of
16947     // thinking about it is that we need to drop the even elements this many
16948     // times to get the original input.
16949
16950     // First we need to zero all the dropped bytes.
          // Words at a stride of 1 << (NumEvenDrops - 1) keep their low byte (0xFF);
          // all other words are cleared entirely.
16951     assert(NumEvenDrops <= 3 &&
16952            "No support for dropping even elements more than 3 times.");
16953     SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
16954     for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
16955       WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
16956     SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
16957     V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
16958                      WordClearMask);
16959     if (!IsSingleInput)
16960       V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
16961                        WordClearMask);
16962
16963     // Now pack things back together.
          // For a single input, V1 is packed with itself. Every drop level beyond
          // the first packs the previous result with itself once more.
16964     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16965                                  IsSingleInput ? V1 : V2);
16966     for (int i = 1; i < NumEvenDrops; ++i) {
16967       Result = DAG.getBitcast(MVT::v8i16, Result);
16968       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
16969     }
16970     return Result;
16971   }
16972
        // Compaction of the odd bytes: shift every i16 lane right by 8 (VSRLI)
        // before packing, so the odd bytes are the ones that get packed.
16973   int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
16974   if (NumOddDrops == 1) {
16975     V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16976                      DAG.getBitcast(MVT::v8i16, V1),
16977                      DAG.getTargetConstant(8, DL, MVT::i8));
16978     if (!IsSingleInput)
16979       V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16980                        DAG.getBitcast(MVT::v8i16, V2),
16981                        DAG.getTargetConstant(8, DL, MVT::i8));
16982     return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16983                        IsSingleInput ? V1 : V2);
16984   }
16985
16986   // Handle multi-input cases by blending/unpacking single-input shuffles.
16987   if (NumV2Elements > 0)
16988     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
16989                                                 Subtarget, DAG);
16990
16991   // The fallback path for single-input shuffles widens this into two v8i16
16992   // vectors with unpacks, shuffles those, and then pulls them back together
16993   // with a pack.
16994   SDValue V = V1;
16995
        // LoBlendMask receives mask entries 0-7 and HiBlendMask entries 8-15;
        // negative entries stay -1.
16996   std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16997   std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16998   for (int i = 0; i < 16; ++i)
16999     if (Mask[i] >= 0)
17000       (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
17001
17002   SDValue VLoHalf, VHiHalf;
17003   // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
17004   // them out and avoid using UNPCK{L,H} to extract the elements of V as
17005   // i16s.
17006   if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
17007       none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
17008     // Use a mask to drop the high bytes.
17009     VLoHalf = DAG.getBitcast(MVT::v8i16, V);
17010     VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
17011                           DAG.getConstant(0x00FF, DL, MVT::v8i16));
17012
17013     // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
17014     VHiHalf = DAG.getUNDEF(MVT::v8i16);
17015
17016     // Squash the masks to point directly into VLoHalf.
          // Only even byte indices remain here, so halving gives the i16 lane.
17017     for (int &M : LoBlendMask)
17018       if (M >= 0)
17019         M /= 2;
17020     for (int &M : HiBlendMask)
17021       if (M >= 0)
17022         M /= 2;
17023   } else {
17024     // Otherwise just unpack the low half of V into VLoHalf and the high half into
17025     // VHiHalf so that we can blend them as i16s.
17026     SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
17027
17028     VLoHalf = DAG.getBitcast(
17029         MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
17030     VHiHalf = DAG.getBitcast(
17031         MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
17032   }
17033
        // Shuffle the two i16 halves once per output half, then pack the two
        // results back into a single v16i8.
17034   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
17035   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
17036
17037   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
17038 }
17039
17040 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
17041 ///
17042 /// This routine breaks down the specific type of 128-bit shuffle and
17043 /// dispatches to the lowering routines accordingly.
17044 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17045                                   MVT VT, SDValue V1, SDValue V2,
17046                                   const APInt &Zeroable,
17047                                   const X86Subtarget &Subtarget,
17048                                   SelectionDAG &DAG) {
17049   switch (VT.SimpleTy) {
17050   case MVT::v2i64:
17051     return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17052   case MVT::v2f64:
17053     return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17054   case MVT::v4i32:
17055     return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17056   case MVT::v4f32:
17057     return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17058   case MVT::v8i16:
17059     return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17060   case MVT::v8f16:
17061     return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17062   case MVT::v16i8:
17063     return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17064
17065   default:
17066     llvm_unreachable("Unimplemented!");
17067   }
17068 }
17069
17070 /// Generic routine to split vector shuffle into half-sized shuffles.
17071 ///
17072 /// This routine just extracts two subvectors, shuffles them independently, and
17073 /// then concatenates them back together. This should work effectively with all
17074 /// AVX vector shuffle types.
17075 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
17076                                     SDValue V2, ArrayRef<int> Mask,
17077                                     SelectionDAG &DAG, bool SimpleOnly) {
17078   assert(VT.getSizeInBits() >= 256 &&
17079          "Only for 256-bit or wider vector shuffles!");
17080   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
17081   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
17082
17083   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
17084   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
17085
17086   int NumElements = VT.getVectorNumElements();
17087   int SplitNumElements = NumElements / 2;
17088   MVT ScalarVT = VT.getVectorElementType();
17089   MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
17090
17091   // Use splitVector/extractSubVector so that split build-vectors just build two
17092   // narrower build vectors. This helps shuffling with splats and zeros.
17093   auto SplitVector = [&](SDValue V) {
17094     SDValue LoV, HiV;
17095     std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
17096     return std::make_pair(DAG.getBitcast(SplitVT, LoV),
17097                           DAG.getBitcast(SplitVT, HiV));
17098   };
17099
17100   SDValue LoV1, HiV1, LoV2, HiV2;
17101   std::tie(LoV1, HiV1) = SplitVector(V1);
17102   std::tie(LoV2, HiV2) = SplitVector(V2);
17103
17104   // Now create two 4-way blends of these half-width vectors.
17105   auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
17106                                    bool &UseHiV1, bool &UseLoV2,
17107                                    bool &UseHiV2) {
17108     UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
17109     for (int i = 0; i < SplitNumElements; ++i) {
17110       int M = HalfMask[i];
17111       if (M >= NumElements) {
17112         if (M >= NumElements + SplitNumElements)
17113           UseHiV2 = true;
17114         else
17115           UseLoV2 = true;
17116       } else if (M >= 0) {
17117         if (M >= SplitNumElements)
17118           UseHiV1 = true;
17119         else
17120           UseLoV1 = true;
17121       }
17122     }
17123   };
17124
17125   auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
17126     if (!SimpleOnly)
17127       return true;
17128
17129     bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
17130     GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
17131
17132     return !(UseHiV1 || UseHiV2);
17133   };
17134
17135   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
17136     SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
17137     SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
17138     SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
17139     for (int i = 0; i < SplitNumElements; ++i) {
17140       int M = HalfMask[i];
17141       if (M >= NumElements) {
17142         V2BlendMask[i] = M - NumElements;
17143         BlendMask[i] = SplitNumElements + i;
17144       } else if (M >= 0) {
17145         V1BlendMask[i] = M;
17146         BlendMask[i] = i;
17147       }
17148     }
17149
17150     bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
17151     GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
17152
17153     // Because the lowering happens after all combining takes place, we need to
17154     // manually combine these blend masks as much as possible so that we create
17155     // a minimal number of high-level vector shuffle nodes.
17156     assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
17157
17158     // First try just blending the halves of V1 or V2.
17159     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
17160       return DAG.getUNDEF(SplitVT);
17161     if (!UseLoV2 && !UseHiV2)
17162       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
17163     if (!UseLoV1 && !UseHiV1)
17164       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
17165
17166     SDValue V1Blend, V2Blend;
17167     if (UseLoV1 && UseHiV1) {
17168       V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
17169     } else {
17170       // We only use half of V1 so map the usage down into the final blend mask.
17171       V1Blend = UseLoV1 ? LoV1 : HiV1;
17172       for (int i = 0; i < SplitNumElements; ++i)
17173         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
17174           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
17175     }
17176     if (UseLoV2 && UseHiV2) {
17177       V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
17178     } else {
17179       // We only use half of V2 so map the usage down into the final blend mask.
17180       V2Blend = UseLoV2 ? LoV2 : HiV2;
17181       for (int i = 0; i < SplitNumElements; ++i)
17182         if (BlendMask[i] >= SplitNumElements)
17183           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
17184     }
17185     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
17186   };
17187
17188   if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
17189     return SDValue();
17190
17191   SDValue Lo = HalfBlend(LoMask);
17192   SDValue Hi = HalfBlend(HiMask);
17193   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
17194 }
17195
17196 /// Either split a vector in halves or decompose the shuffles and the
17197 /// blend/unpack.
17198 ///
17199 /// This is provided as a good fallback for many lowerings of non-single-input
17200 /// shuffles with more than one 128-bit lane. In those cases, we want to select
17201 /// between splitting the shuffle into 128-bit components and stitching those
17202 /// back together vs. extracting the single-input shuffles and blending those
17203 /// results.
17204 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
17205                                           SDValue V2, ArrayRef<int> Mask,
17206                                           const X86Subtarget &Subtarget,
17207                                           SelectionDAG &DAG) {
17208   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
17209          "shuffles as it could then recurse on itself.");
17210   int Size = Mask.size();
17211
17212   // If this can be modeled as a broadcast of two elements followed by a blend,
17213   // prefer that lowering. This is especially important because broadcasts can
17214   // often fold with memory operands.
17215   auto DoBothBroadcast = [&] {
17216     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
17217     for (int M : Mask)
17218       if (M >= Size) {
17219         if (V2BroadcastIdx < 0)
17220           V2BroadcastIdx = M - Size;
17221         else if (M - Size != V2BroadcastIdx)
17222           return false;
17223       } else if (M >= 0) {
17224         if (V1BroadcastIdx < 0)
17225           V1BroadcastIdx = M;
17226         else if (M != V1BroadcastIdx)
17227           return false;
17228       }
17229     return true;
17230   };
17231   if (DoBothBroadcast())
17232     return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17233                                                 DAG);
17234
17235   // If the inputs all stem from a single 128-bit lane of each input, then we
17236   // split them rather than blending because the split will decompose to
17237   // unusually few instructions.
17238   int LaneCount = VT.getSizeInBits() / 128;
17239   int LaneSize = Size / LaneCount;
17240   SmallBitVector LaneInputs[2];
17241   LaneInputs[0].resize(LaneCount, false);
17242   LaneInputs[1].resize(LaneCount, false);
17243   for (int i = 0; i < Size; ++i)
17244     if (Mask[i] >= 0)
17245       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
17246   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
17247     return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17248                                 /*SimpleOnly*/ false);
17249
17250   // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
17251   // requires that the decomposed single-input shuffles don't end up here.
17252   return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17253                                               DAG);
17254 }
17255
17256 // Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17257 // TODO: Extend to support v8f32 (+ 512-bit shuffles).
17258 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
17259                                                  SDValue V1, SDValue V2,
17260                                                  ArrayRef<int> Mask,
17261                                                  SelectionDAG &DAG) {
17262   assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
17263
17264   int LHSMask[4] = {-1, -1, -1, -1};
17265   int RHSMask[4] = {-1, -1, -1, -1};
17266   unsigned SHUFPMask = 0;
17267
17268   // As SHUFPD uses a single LHS/RHS element per lane, we can always
17269   // perform the shuffle once the lanes have been shuffled in place.
17270   for (int i = 0; i != 4; ++i) {
17271     int M = Mask[i];
17272     if (M < 0)
17273       continue;
17274     int LaneBase = i & ~1;
17275     auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
17276     LaneMask[LaneBase + (M & 1)] = M;
17277     SHUFPMask |= (M & 1) << i;
17278   }
17279
17280   SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
17281   SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
17282   return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
17283                      DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
17284 }
17285
17286 /// Lower a vector shuffle crossing multiple 128-bit lanes as
17287 /// a lane permutation followed by a per-lane permutation.
17288 ///
17289 /// This is mainly for cases where we can have non-repeating permutes
17290 /// in each lane.
17291 ///
17292 /// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
17293 /// we should investigate merging them.
17294 static SDValue lowerShuffleAsLanePermuteAndPermute(
17295     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17296     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17297   int NumElts = VT.getVectorNumElements();
17298   int NumLanes = VT.getSizeInBits() / 128;
17299   int NumEltsPerLane = NumElts / NumLanes;
17300   bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
17301
17302   /// Attempts to find a sublane permute with the given size
17303   /// that gets all elements into their target lanes.
17304   ///
17305   /// If successful, fills CrossLaneMask and InLaneMask and returns true.
17306   /// If unsuccessful, returns false and may overwrite InLaneMask.
17307   auto getSublanePermute = [&](int NumSublanes) -> SDValue {
17308     int NumSublanesPerLane = NumSublanes / NumLanes;
17309     int NumEltsPerSublane = NumElts / NumSublanes;
17310
17311     SmallVector<int, 16> CrossLaneMask;
17312     SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
17313     // CrossLaneMask but one entry == one sublane.
17314     SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
17315
17316     for (int i = 0; i != NumElts; ++i) {
17317       int M = Mask[i];
17318       if (M < 0)
17319         continue;
17320
17321       int SrcSublane = M / NumEltsPerSublane;
17322       int DstLane = i / NumEltsPerLane;
17323
17324       // We only need to get the elements into the right lane, not sublane.
17325       // So search all sublanes that make up the destination lane.
17326       bool Found = false;
17327       int DstSubStart = DstLane * NumSublanesPerLane;
17328       int DstSubEnd = DstSubStart + NumSublanesPerLane;
17329       for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
17330         if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
17331           continue;
17332
17333         Found = true;
17334         CrossLaneMaskLarge[DstSublane] = SrcSublane;
17335         int DstSublaneOffset = DstSublane * NumEltsPerSublane;
17336         InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
17337         break;
17338       }
17339       if (!Found)
17340         return SDValue();
17341     }
17342
17343     // Fill CrossLaneMask using CrossLaneMaskLarge.
17344     narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
17345
17346     if (!CanUseSublanes) {
17347       // If we're only shuffling a single lowest lane and the rest are identity
17348       // then don't bother.
17349       // TODO - isShuffleMaskInputInPlace could be extended to something like
17350       // this.
17351       int NumIdentityLanes = 0;
17352       bool OnlyShuffleLowestLane = true;
17353       for (int i = 0; i != NumLanes; ++i) {
17354         int LaneOffset = i * NumEltsPerLane;
17355         if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
17356                                        i * NumEltsPerLane))
17357           NumIdentityLanes++;
17358         else if (CrossLaneMask[LaneOffset] != 0)
17359           OnlyShuffleLowestLane = false;
17360       }
17361       if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
17362         return SDValue();
17363     }
17364
17365     // Avoid returning the same shuffle operation. For example,
17366     // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
17367     //                             undef:v16i16
17368     if (CrossLaneMask == Mask || InLaneMask == Mask)
17369       return SDValue();
17370
17371     SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
17372     return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
17373                                 InLaneMask);
17374   };
17375
17376   // First attempt a solution with full lanes.
17377   if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
17378     return V;
17379
17380   // The rest of the solutions use sublanes.
17381   if (!CanUseSublanes)
17382     return SDValue();
17383
17384   // Then attempt a solution with 64-bit sublanes (vpermq).
17385   if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
17386     return V;
17387
17388   // If that doesn't work and we have fast variable cross-lane shuffle,
17389   // attempt 32-bit sublanes (vpermd).
17390   if (!Subtarget.hasFastVariableCrossLaneShuffle())
17391     return SDValue();
17392
17393   return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
17394 }
17395
17396 /// Helper to get compute inlane shuffle mask for a complete shuffle mask.
17397 static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
17398                                      SmallVector<int> &InLaneMask) {
17399   int Size = Mask.size();
17400   InLaneMask.assign(Mask.begin(), Mask.end());
17401   for (int i = 0; i < Size; ++i) {
17402     int &M = InLaneMask[i];
17403     if (M < 0)
17404       continue;
17405     if (((M % Size) / LaneSize) != (i / LaneSize))
17406       M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
17407   }
17408 }
17409
17410 /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
17411 /// source with a lane permutation.
17412 ///
17413 /// This lowering strategy results in four instructions in the worst case for a
17414 /// single-input cross lane shuffle which is lower than any other fully general
17415 /// cross-lane shuffle strategy I'm aware of. Special cases for each particular
17416 /// shuffle pattern should be handled prior to trying this lowering.
17417 static SDValue lowerShuffleAsLanePermuteAndShuffle(
17418     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17419     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17420   // FIXME: This should probably be generalized for 512-bit vectors as well.
17421   assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
17422   int Size = Mask.size();
17423   int LaneSize = Size / 2;
17424
17425   // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17426   // Only do this if the elements aren't all from the lower lane,
17427   // otherwise we're (probably) better off doing a split.
17428   if (VT == MVT::v4f64 &&
17429       !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
17430     return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
17431
17432   // If there are only inputs from one 128-bit lane, splitting will in fact be
17433   // less expensive. The flags track whether the given lane contains an element
17434   // that crosses to another lane.
17435   bool AllLanes;
17436   if (!Subtarget.hasAVX2()) {
17437     bool LaneCrossing[2] = {false, false};
17438     for (int i = 0; i < Size; ++i)
17439       if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
17440         LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
17441     AllLanes = LaneCrossing[0] && LaneCrossing[1];
17442   } else {
17443     bool LaneUsed[2] = {false, false};
17444     for (int i = 0; i < Size; ++i)
17445       if (Mask[i] >= 0)
17446         LaneUsed[(Mask[i] % Size) / LaneSize] = true;
17447     AllLanes = LaneUsed[0] && LaneUsed[1];
17448   }
17449
17450   // TODO - we could support shuffling V2 in the Flipped input.
17451   assert(V2.isUndef() &&
17452          "This last part of this routine only works on single input shuffles");
17453
17454   SmallVector<int> InLaneMask;
17455   computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
17456
17457   assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
17458          "In-lane shuffle mask expected");
17459
17460   // If we're not using both lanes in each lane and the inlane mask is not
17461   // repeating, then we're better off splitting.
17462   if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
17463     return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17464                                 /*SimpleOnly*/ false);
17465
17466   // Flip the lanes, and shuffle the results which should now be in-lane.
17467   MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
17468   SDValue Flipped = DAG.getBitcast(PVT, V1);
17469   Flipped =
17470       DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
17471   Flipped = DAG.getBitcast(VT, Flipped);
17472   return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
17473 }
17474
17475 /// Handle lowering 2-lane 128-bit shuffles.
17476 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
17477                                   SDValue V2, ArrayRef<int> Mask,
17478                                   const APInt &Zeroable,
17479                                   const X86Subtarget &Subtarget,
17480                                   SelectionDAG &DAG) {
17481   if (V2.isUndef()) {
17482     // Attempt to match VBROADCAST*128 subvector broadcast load.
17483     bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
17484     bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
17485     if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
17486         X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
17487       MVT MemVT = VT.getHalfNumVectorElementsVT();
17488       unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
17489       auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
17490       if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
17491                                              VT, MemVT, Ld, Ofs, DAG))
17492         return BcstLd;
17493     }
17494
17495     // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
17496     if (Subtarget.hasAVX2())
17497       return SDValue();
17498   }
17499
17500   bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
17501
17502   SmallVector<int, 4> WidenedMask;
17503   if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
17504     return SDValue();
17505
17506   bool IsLowZero = (Zeroable & 0x3) == 0x3;
17507   bool IsHighZero = (Zeroable & 0xc) == 0xc;
17508
17509   // Try to use an insert into a zero vector.
17510   if (WidenedMask[0] == 0 && IsHighZero) {
17511     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17512     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17513                               DAG.getIntPtrConstant(0, DL));
17514     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17515                        getZeroVector(VT, Subtarget, DAG, DL), LoV,
17516                        DAG.getIntPtrConstant(0, DL));
17517   }
17518
17519   // TODO: If minimizing size and one of the inputs is a zero vector and the
17520   // the zero vector has only one use, we could use a VPERM2X128 to save the
17521   // instruction bytes needed to explicitly generate the zero vector.
17522
17523   // Blends are faster and handle all the non-lane-crossing cases.
17524   if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
17525                                           Subtarget, DAG))
17526     return Blend;
17527
17528   // If either input operand is a zero vector, use VPERM2X128 because its mask
17529   // allows us to replace the zero input with an implicit zero.
17530   if (!IsLowZero && !IsHighZero) {
17531     // Check for patterns which can be matched with a single insert of a 128-bit
17532     // subvector.
17533     bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
17534     if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
17535
17536       // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
17537       // this will likely become vinsertf128 which can't fold a 256-bit memop.
17538       if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
17539         MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17540         SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
17541                                      OnlyUsesV1 ? V1 : V2,
17542                                      DAG.getIntPtrConstant(0, DL));
17543         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17544                            DAG.getIntPtrConstant(2, DL));
17545       }
17546     }
17547
17548     // Try to use SHUF128 if possible.
17549     if (Subtarget.hasVLX()) {
17550       if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
17551         unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
17552                             ((WidenedMask[1] % 2) << 1);
17553         return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
17554                            DAG.getTargetConstant(PermMask, DL, MVT::i8));
17555       }
17556     }
17557   }
17558
17559   // Otherwise form a 128-bit permutation. After accounting for undefs,
17560   // convert the 64-bit shuffle mask selection values into 128-bit
17561   // selection bits by dividing the indexes by 2 and shifting into positions
17562   // defined by a vperm2*128 instruction's immediate control byte.
17563
17564   // The immediate permute control byte looks like this:
17565   //    [1:0] - select 128 bits from sources for low half of destination
17566   //    [2]   - ignore
17567   //    [3]   - zero low half of destination
17568   //    [5:4] - select 128 bits from sources for high half of destination
17569   //    [6]   - ignore
17570   //    [7]   - zero high half of destination
17571
17572   assert((WidenedMask[0] >= 0 || IsLowZero) &&
17573          (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
17574
17575   unsigned PermMask = 0;
17576   PermMask |= IsLowZero  ? 0x08 : (WidenedMask[0] << 0);
17577   PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
17578
17579   // Check the immediate mask and replace unused sources with undef.
17580   if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
17581     V1 = DAG.getUNDEF(VT);
17582   if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
17583     V2 = DAG.getUNDEF(VT);
17584
17585   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
17586                      DAG.getTargetConstant(PermMask, DL, MVT::i8));
17587 }
17588
17589 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
17590 /// shuffling each lane.
17591 ///
17592 /// This attempts to create a repeated lane shuffle where each lane uses one
17593 /// or two of the lanes of the inputs. The lanes of the input vectors are
17594 /// shuffled in one or two independent shuffles to get the lanes into the
17595 /// position needed by the final shuffle.
17596 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
17597     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17598     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17599   assert(!V2.isUndef() && "This is only useful with multiple inputs.");
17600
17601   if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17602     return SDValue();
17603
17604   int NumElts = Mask.size();
17605   int NumLanes = VT.getSizeInBits() / 128;
17606   int NumLaneElts = 128 / VT.getScalarSizeInBits();
17607   SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
17608   SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
17609
17610   // First pass will try to fill in the RepeatMask from lanes that need two
17611   // sources.
17612   for (int Lane = 0; Lane != NumLanes; ++Lane) {
17613     int Srcs[2] = {-1, -1};
17614     SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
17615     for (int i = 0; i != NumLaneElts; ++i) {
17616       int M = Mask[(Lane * NumLaneElts) + i];
17617       if (M < 0)
17618         continue;
17619       // Determine which of the possible input lanes (NumLanes from each source)
17620       // this element comes from. Assign that as one of the sources for this
17621       // lane. We can assign up to 2 sources for this lane. If we run out
17622       // sources we can't do anything.
17623       int LaneSrc = M / NumLaneElts;
17624       int Src;
17625       if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
17626         Src = 0;
17627       else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
17628         Src = 1;
17629       else
17630         return SDValue();
17631
17632       Srcs[Src] = LaneSrc;
17633       InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
17634     }
17635
17636     // If this lane has two sources, see if it fits with the repeat mask so far.
17637     if (Srcs[1] < 0)
17638       continue;
17639
17640     LaneSrcs[Lane][0] = Srcs[0];
17641     LaneSrcs[Lane][1] = Srcs[1];
17642
17643     auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
17644       assert(M1.size() == M2.size() && "Unexpected mask size");
17645       for (int i = 0, e = M1.size(); i != e; ++i)
17646         if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
17647           return false;
17648       return true;
17649     };
17650
17651     auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
17652       assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
17653       for (int i = 0, e = MergedMask.size(); i != e; ++i) {
17654         int M = Mask[i];
17655         if (M < 0)
17656           continue;
17657         assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
17658                "Unexpected mask element");
17659         MergedMask[i] = M;
17660       }
17661     };
17662
17663     if (MatchMasks(InLaneMask, RepeatMask)) {
17664       // Merge this lane mask into the final repeat mask.
17665       MergeMasks(InLaneMask, RepeatMask);
17666       continue;
17667     }
17668
17669     // Didn't find a match. Swap the operands and try again.
17670     std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
17671     ShuffleVectorSDNode::commuteMask(InLaneMask);
17672
17673     if (MatchMasks(InLaneMask, RepeatMask)) {
17674       // Merge this lane mask into the final repeat mask.
17675       MergeMasks(InLaneMask, RepeatMask);
17676       continue;
17677     }
17678
17679     // Couldn't find a match with the operands in either order.
17680     return SDValue();
17681   }
17682
17683   // Now handle any lanes with only one source.
17684   for (int Lane = 0; Lane != NumLanes; ++Lane) {
17685     // If this lane has already been processed, skip it.
17686     if (LaneSrcs[Lane][0] >= 0)
17687       continue;
17688
17689     for (int i = 0; i != NumLaneElts; ++i) {
17690       int M = Mask[(Lane * NumLaneElts) + i];
17691       if (M < 0)
17692         continue;
17693
17694       // If RepeatMask isn't defined yet we can define it ourself.
17695       if (RepeatMask[i] < 0)
17696         RepeatMask[i] = M % NumLaneElts;
17697
17698       if (RepeatMask[i] < NumElts) {
17699         if (RepeatMask[i] != M % NumLaneElts)
17700           return SDValue();
17701         LaneSrcs[Lane][0] = M / NumLaneElts;
17702       } else {
17703         if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
17704           return SDValue();
17705         LaneSrcs[Lane][1] = M / NumLaneElts;
17706       }
17707     }
17708
17709     if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
17710       return SDValue();
17711   }
17712
17713   SmallVector<int, 16> NewMask(NumElts, -1);
17714   for (int Lane = 0; Lane != NumLanes; ++Lane) {
17715     int Src = LaneSrcs[Lane][0];
17716     for (int i = 0; i != NumLaneElts; ++i) {
17717       int M = -1;
17718       if (Src >= 0)
17719         M = Src * NumLaneElts + i;
17720       NewMask[Lane * NumLaneElts + i] = M;
17721     }
17722   }
17723   SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17724   // Ensure we didn't get back the shuffle we started with.
17725   // FIXME: This is a hack to make up for some splat handling code in
17726   // getVectorShuffle.
17727   if (isa<ShuffleVectorSDNode>(NewV1) &&
17728       cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
17729     return SDValue();
17730
17731   for (int Lane = 0; Lane != NumLanes; ++Lane) {
17732     int Src = LaneSrcs[Lane][1];
17733     for (int i = 0; i != NumLaneElts; ++i) {
17734       int M = -1;
17735       if (Src >= 0)
17736         M = Src * NumLaneElts + i;
17737       NewMask[Lane * NumLaneElts + i] = M;
17738     }
17739   }
17740   SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17741   // Ensure we didn't get back the shuffle we started with.
17742   // FIXME: This is a hack to make up for some splat handling code in
17743   // getVectorShuffle.
17744   if (isa<ShuffleVectorSDNode>(NewV2) &&
17745       cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
17746     return SDValue();
17747
17748   for (int i = 0; i != NumElts; ++i) {
17749     if (Mask[i] < 0) {
17750       NewMask[i] = -1;
17751       continue;
17752     }
17753     NewMask[i] = RepeatMask[i % NumLaneElts];
17754     if (NewMask[i] < 0)
17755       continue;
17756
17757     NewMask[i] += (i / NumLaneElts) * NumLaneElts;
17758   }
17759   return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
17760 }
17761
17762 /// If the input shuffle mask results in a vector that is undefined in all upper
17763 /// or lower half elements and that mask accesses only 2 halves of the
17764 /// shuffle's operands, return true. A mask of half the width with mask indexes
17765 /// adjusted to access the extracted halves of the original shuffle operands is
17766 /// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
17767 /// lower half of each input operand is accessed.
17768 static bool
17769 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
17770                    int &HalfIdx1, int &HalfIdx2) {
17771   assert((Mask.size() == HalfMask.size() * 2) &&
17772          "Expected input mask to be twice as long as output");
17773
17774   // Exactly one half of the result must be undef to allow narrowing.
17775   bool UndefLower = isUndefLowerHalf(Mask);
17776   bool UndefUpper = isUndefUpperHalf(Mask);
17777   if (UndefLower == UndefUpper)
17778     return false;
17779
17780   unsigned HalfNumElts = HalfMask.size();
17781   unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
17782   HalfIdx1 = -1;
17783   HalfIdx2 = -1;
17784   for (unsigned i = 0; i != HalfNumElts; ++i) {
17785     int M = Mask[i + MaskIndexOffset];
17786     if (M < 0) {
17787       HalfMask[i] = M;
17788       continue;
17789     }
17790
17791     // Determine which of the 4 half vectors this element is from.
17792     // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
17793     int HalfIdx = M / HalfNumElts;
17794
17795     // Determine the element index into its half vector source.
17796     int HalfElt = M % HalfNumElts;
17797
17798     // We can shuffle with up to 2 half vectors, set the new 'half'
17799     // shuffle mask accordingly.
17800     if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
17801       HalfMask[i] = HalfElt;
17802       HalfIdx1 = HalfIdx;
17803       continue;
17804     }
17805     if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
17806       HalfMask[i] = HalfElt + HalfNumElts;
17807       HalfIdx2 = HalfIdx;
17808       continue;
17809     }
17810
17811     // Too many half vectors referenced.
17812     return false;
17813   }
17814
17815   return true;
17816 }
17817
17818 /// Given the output values from getHalfShuffleMask(), create a half width
17819 /// shuffle of extracted vectors followed by an insert back to full width.
17820 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
17821                                      ArrayRef<int> HalfMask, int HalfIdx1,
17822                                      int HalfIdx2, bool UndefLower,
17823                                      SelectionDAG &DAG, bool UseConcat = false) {
17824   assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
17825   assert(V1.getValueType().isSimple() && "Expecting only simple types");
17826
17827   MVT VT = V1.getSimpleValueType();
17828   MVT HalfVT = VT.getHalfNumVectorElementsVT();
17829   unsigned HalfNumElts = HalfVT.getVectorNumElements();
17830
17831   auto getHalfVector = [&](int HalfIdx) {
17832     if (HalfIdx < 0)
17833       return DAG.getUNDEF(HalfVT);
17834     SDValue V = (HalfIdx < 2 ? V1 : V2);
17835     HalfIdx = (HalfIdx % 2) * HalfNumElts;
17836     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
17837                        DAG.getIntPtrConstant(HalfIdx, DL));
17838   };
17839
17840   // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
17841   SDValue Half1 = getHalfVector(HalfIdx1);
17842   SDValue Half2 = getHalfVector(HalfIdx2);
17843   SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
17844   if (UseConcat) {
17845     SDValue Op0 = V;
17846     SDValue Op1 = DAG.getUNDEF(HalfVT);
17847     if (UndefLower)
17848       std::swap(Op0, Op1);
17849     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
17850   }
17851
17852   unsigned Offset = UndefLower ? HalfNumElts : 0;
17853   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
17854                      DAG.getIntPtrConstant(Offset, DL));
17855 }
17856
17857 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
17858 /// This allows for fast cases such as subvector extraction/insertion
17859 /// or shuffling smaller vector types which can lower more efficiently.
17860 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
17861                                          SDValue V2, ArrayRef<int> Mask,
17862                                          const X86Subtarget &Subtarget,
17863                                          SelectionDAG &DAG) {
17864   assert((VT.is256BitVector() || VT.is512BitVector()) &&
17865          "Expected 256-bit or 512-bit vector");
17866
17867   bool UndefLower = isUndefLowerHalf(Mask);
17868   if (!UndefLower && !isUndefUpperHalf(Mask))
17869     return SDValue();
17870
17871   assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
17872          "Completely undef shuffle mask should have been simplified already");
17873
17874   // Upper half is undef and lower half is whole upper subvector.
17875   // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
17876   MVT HalfVT = VT.getHalfNumVectorElementsVT();
17877   unsigned HalfNumElts = HalfVT.getVectorNumElements();
17878   if (!UndefLower &&
17879       isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
17880     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17881                              DAG.getIntPtrConstant(HalfNumElts, DL));
17882     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17883                        DAG.getIntPtrConstant(0, DL));
17884   }
17885
17886   // Lower half is undef and upper half is whole lower subvector.
17887   // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
17888   if (UndefLower &&
17889       isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
17890     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17891                              DAG.getIntPtrConstant(0, DL));
17892     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17893                        DAG.getIntPtrConstant(HalfNumElts, DL));
17894   }
17895
17896   int HalfIdx1, HalfIdx2;
17897   SmallVector<int, 8> HalfMask(HalfNumElts);
17898   if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
17899     return SDValue();
17900
17901   assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
17902
17903   // Only shuffle the halves of the inputs when useful.
17904   unsigned NumLowerHalves =
17905       (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
17906   unsigned NumUpperHalves =
17907       (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
17908   assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
17909
17910   // Determine the larger pattern of undef/halves, then decide if it's worth
17911   // splitting the shuffle based on subtarget capabilities and types.
17912   unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
17913   if (!UndefLower) {
17914     // XXXXuuuu: no insert is needed.
17915     // Always extract lowers when setting lower - these are all free subreg ops.
17916     if (NumUpperHalves == 0)
17917       return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17918                                    UndefLower, DAG);
17919
17920     if (NumUpperHalves == 1) {
17921       // AVX2 has efficient 32/64-bit element cross-lane shuffles.
17922       if (Subtarget.hasAVX2()) {
17923         // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
17924         if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
17925             !is128BitUnpackShuffleMask(HalfMask, DAG) &&
17926             (!isSingleSHUFPSMask(HalfMask) ||
17927              Subtarget.hasFastVariableCrossLaneShuffle()))
17928           return SDValue();
17929         // If this is a unary shuffle (assume that the 2nd operand is
17930         // canonicalized to undef), then we can use vpermpd. Otherwise, we
17931         // are better off extracting the upper half of 1 operand and using a
17932         // narrow shuffle.
17933         if (EltWidth == 64 && V2.isUndef())
17934           return SDValue();
17935       }
17936       // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17937       if (Subtarget.hasAVX512() && VT.is512BitVector())
17938         return SDValue();
17939       // Extract + narrow shuffle is better than the wide alternative.
17940       return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17941                                    UndefLower, DAG);
17942     }
17943
17944     // Don't extract both uppers, instead shuffle and then extract.
17945     assert(NumUpperHalves == 2 && "Half vector count went wrong");
17946     return SDValue();
17947   }
17948
17949   // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
17950   if (NumUpperHalves == 0) {
17951     // AVX2 has efficient 64-bit element cross-lane shuffles.
17952     // TODO: Refine to account for unary shuffle, splat, and other masks?
17953     if (Subtarget.hasAVX2() && EltWidth == 64)
17954       return SDValue();
17955     // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17956     if (Subtarget.hasAVX512() && VT.is512BitVector())
17957       return SDValue();
17958     // Narrow shuffle + insert is better than the wide alternative.
17959     return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17960                                  UndefLower, DAG);
17961   }
17962
17963   // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
17964   return SDValue();
17965 }
17966
17967 /// Handle case where shuffle sources are coming from the same 128-bit lane and
17968 /// every lane can be represented as the same repeating mask - allowing us to
17969 /// shuffle the sources with the repeating shuffle and then permute the result
17970 /// to the destination lanes.
17971 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
17972     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17973     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17974   int NumElts = VT.getVectorNumElements();
17975   int NumLanes = VT.getSizeInBits() / 128;
17976   int NumLaneElts = NumElts / NumLanes;
17977
17978   // On AVX2 we may be able to just shuffle the lowest elements and then
17979   // broadcast the result.
17980   if (Subtarget.hasAVX2()) {
17981     for (unsigned BroadcastSize : {16, 32, 64}) {
17982       if (BroadcastSize <= VT.getScalarSizeInBits())
17983         continue;
17984       int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
17985
17986       // Attempt to match a repeating pattern every NumBroadcastElts,
17987       // accounting for UNDEFs but only references the lowest 128-bit
17988       // lane of the inputs.
17989       auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
17990         for (int i = 0; i != NumElts; i += NumBroadcastElts)
17991           for (int j = 0; j != NumBroadcastElts; ++j) {
17992             int M = Mask[i + j];
17993             if (M < 0)
17994               continue;
17995             int &R = RepeatMask[j];
17996             if (0 != ((M % NumElts) / NumLaneElts))
17997               return false;
17998             if (0 <= R && R != M)
17999               return false;
18000             R = M;
18001           }
18002         return true;
18003       };
18004
18005       SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
18006       if (!FindRepeatingBroadcastMask(RepeatMask))
18007         continue;
18008
18009       // Shuffle the (lowest) repeated elements in place for broadcast.
18010       SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
18011
18012       // Shuffle the actual broadcast.
18013       SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
18014       for (int i = 0; i != NumElts; i += NumBroadcastElts)
18015         for (int j = 0; j != NumBroadcastElts; ++j)
18016           BroadcastMask[i + j] = j;
18017       return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
18018                                   BroadcastMask);
18019     }
18020   }
18021
18022   // Bail if the shuffle mask doesn't cross 128-bit lanes.
18023   if (!is128BitLaneCrossingShuffleMask(VT, Mask))
18024     return SDValue();
18025
18026   // Bail if we already have a repeated lane shuffle mask.
18027   if (is128BitLaneRepeatedShuffleMask(VT, Mask))
18028     return SDValue();
18029
18030   // Helper to look for repeated mask in each split sublane, and that those
18031   // sublanes can then be permuted into place.
18032   auto ShuffleSubLanes = [&](int SubLaneScale) {
18033     int NumSubLanes = NumLanes * SubLaneScale;
18034     int NumSubLaneElts = NumLaneElts / SubLaneScale;
18035
18036     // Check that all the sources are coming from the same lane and see if we
18037     // can form a repeating shuffle mask (local to each sub-lane). At the same
18038     // time, determine the source sub-lane for each destination sub-lane.
18039     int TopSrcSubLane = -1;
18040     SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
18041     SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
18042         SubLaneScale,
18043         SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
18044
18045     for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
18046       // Extract the sub-lane mask, check that it all comes from the same lane
18047       // and normalize the mask entries to come from the first lane.
18048       int SrcLane = -1;
18049       SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
18050       for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
18051         int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
18052         if (M < 0)
18053           continue;
18054         int Lane = (M % NumElts) / NumLaneElts;
18055         if ((0 <= SrcLane) && (SrcLane != Lane))
18056           return SDValue();
18057         SrcLane = Lane;
18058         int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
18059         SubLaneMask[Elt] = LocalM;
18060       }
18061
18062       // Whole sub-lane is UNDEF.
18063       if (SrcLane < 0)
18064         continue;
18065
18066       // Attempt to match against the candidate repeated sub-lane masks.
18067       for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
18068         auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
18069           for (int i = 0; i != NumSubLaneElts; ++i) {
18070             if (M1[i] < 0 || M2[i] < 0)
18071               continue;
18072             if (M1[i] != M2[i])
18073               return false;
18074           }
18075           return true;
18076         };
18077
18078         auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
18079         if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
18080           continue;
18081
18082         // Merge the sub-lane mask into the matching repeated sub-lane mask.
18083         for (int i = 0; i != NumSubLaneElts; ++i) {
18084           int M = SubLaneMask[i];
18085           if (M < 0)
18086             continue;
18087           assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
18088                  "Unexpected mask element");
18089           RepeatedSubLaneMask[i] = M;
18090         }
18091
18092         // Track the top most source sub-lane - by setting the remaining to
18093         // UNDEF we can greatly simplify shuffle matching.
18094         int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
18095         TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
18096         Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
18097         break;
18098       }
18099
18100       // Bail if we failed to find a matching repeated sub-lane mask.
18101       if (Dst2SrcSubLanes[DstSubLane] < 0)
18102         return SDValue();
18103     }
18104     assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
18105            "Unexpected source lane");
18106
18107     // Create a repeating shuffle mask for the entire vector.
18108     SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
18109     for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
18110       int Lane = SubLane / SubLaneScale;
18111       auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
18112       for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
18113         int M = RepeatedSubLaneMask[Elt];
18114         if (M < 0)
18115           continue;
18116         int Idx = (SubLane * NumSubLaneElts) + Elt;
18117         RepeatedMask[Idx] = M + (Lane * NumLaneElts);
18118       }
18119     }
18120
18121     // Shuffle each source sub-lane to its destination.
18122     SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
18123     for (int i = 0; i != NumElts; i += NumSubLaneElts) {
18124       int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
18125       if (SrcSubLane < 0)
18126         continue;
18127       for (int j = 0; j != NumSubLaneElts; ++j)
18128         SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
18129     }
18130
18131     // Avoid returning the same shuffle operation.
18132     // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
18133     if (RepeatedMask == Mask || SubLaneMask == Mask)
18134       return SDValue();
18135
18136     SDValue RepeatedShuffle =
18137         DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
18138
18139     return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
18140                                 SubLaneMask);
18141   };
18142
18143   // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
18144   // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
18145   // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
18146   // Otherwise we can only permute whole 128-bit lanes.
18147   int MinSubLaneScale = 1, MaxSubLaneScale = 1;
18148   if (Subtarget.hasAVX2() && VT.is256BitVector()) {
18149     bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
18150     MinSubLaneScale = 2;
18151     MaxSubLaneScale =
18152         (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
18153   }
18154   if (Subtarget.hasBWI() && VT == MVT::v64i8)
18155     MinSubLaneScale = MaxSubLaneScale = 4;
18156
18157   for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
18158     if (SDValue Shuffle = ShuffleSubLanes(Scale))
18159       return Shuffle;
18160
18161   return SDValue();
18162 }
18163
18164 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
18165                                    bool &ForceV1Zero, bool &ForceV2Zero,
18166                                    unsigned &ShuffleImm, ArrayRef<int> Mask,
18167                                    const APInt &Zeroable) {
18168   int NumElts = VT.getVectorNumElements();
18169   assert(VT.getScalarSizeInBits() == 64 &&
18170          (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
18171          "Unexpected data type for VSHUFPD");
18172   assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
18173          "Illegal shuffle mask");
18174
18175   bool ZeroLane[2] = { true, true };
18176   for (int i = 0; i < NumElts; ++i)
18177     ZeroLane[i & 1] &= Zeroable[i];
18178
18179   // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
18180   // Mask for V4F64; 0/1,  4/5,  2/3,  6/7..
18181   ShuffleImm = 0;
18182   bool ShufpdMask = true;
18183   bool CommutableMask = true;
18184   for (int i = 0; i < NumElts; ++i) {
18185     if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
18186       continue;
18187     if (Mask[i] < 0)
18188       return false;
18189     int Val = (i & 6) + NumElts * (i & 1);
18190     int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
18191     if (Mask[i] < Val || Mask[i] > Val + 1)
18192       ShufpdMask = false;
18193     if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
18194       CommutableMask = false;
18195     ShuffleImm |= (Mask[i] % 2) << i;
18196   }
18197
18198   if (!ShufpdMask && !CommutableMask)
18199     return false;
18200
18201   if (!ShufpdMask && CommutableMask)
18202     std::swap(V1, V2);
18203
18204   ForceV1Zero = ZeroLane[0];
18205   ForceV2Zero = ZeroLane[1];
18206   return true;
18207 }
18208
18209 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
18210                                       SDValue V2, ArrayRef<int> Mask,
18211                                       const APInt &Zeroable,
18212                                       const X86Subtarget &Subtarget,
18213                                       SelectionDAG &DAG) {
18214   assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
18215          "Unexpected data type for VSHUFPD");
18216
18217   unsigned Immediate = 0;
18218   bool ForceV1Zero = false, ForceV2Zero = false;
18219   if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
18220                               Mask, Zeroable))
18221     return SDValue();
18222
18223   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
18224   if (ForceV1Zero)
18225     V1 = getZeroVector(VT, Subtarget, DAG, DL);
18226   if (ForceV2Zero)
18227     V2 = getZeroVector(VT, Subtarget, DAG, DL);
18228
18229   return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
18230                      DAG.getTargetConstant(Immediate, DL, MVT::i8));
18231 }
18232
18233 // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
18234 // by zeroable elements in the remaining 24 elements. Turn this into two
18235 // vmovqb instructions shuffled together.
18236 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
18237                                              SDValue V1, SDValue V2,
18238                                              ArrayRef<int> Mask,
18239                                              const APInt &Zeroable,
18240                                              SelectionDAG &DAG) {
18241   assert(VT == MVT::v32i8 && "Unexpected type!");
18242
18243   // The first 8 indices should be every 8th element.
18244   if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
18245     return SDValue();
18246
18247   // Remaining elements need to be zeroable.
18248   if (Zeroable.countl_one() < (Mask.size() - 8))
18249     return SDValue();
18250
18251   V1 = DAG.getBitcast(MVT::v4i64, V1);
18252   V2 = DAG.getBitcast(MVT::v4i64, V2);
18253
18254   V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
18255   V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
18256
18257   // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
18258   // the upper bits of the result using an unpckldq.
18259   SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
18260                                         { 0, 1, 2, 3, 16, 17, 18, 19,
18261                                           4, 5, 6, 7, 20, 21, 22, 23 });
18262   // Insert the unpckldq into a zero vector to widen to v32i8.
18263   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
18264                      DAG.getConstant(0, DL, MVT::v32i8), Unpack,
18265                      DAG.getIntPtrConstant(0, DL));
18266 }
18267
18268 // a = shuffle v1, v2, mask1    ; interleaving lower lanes of v1 and v2
18269 // b = shuffle v1, v2, mask2    ; interleaving higher lanes of v1 and v2
18270 //     =>
18271 // ul = unpckl v1, v2
18272 // uh = unpckh v1, v2
18273 // a = vperm ul, uh
18274 // b = vperm ul, uh
18275 //
18276 // Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
18277 // and permute. We cannot directly match v3 because it is split into two
18278 // 256-bit vectors in earlier isel stages. Therefore, this function matches a
18279 // pair of 256-bit shuffles and makes sure the masks are consecutive.
18280 //
18281 // Once unpck and permute nodes are created, the permute corresponding to this
18282 // shuffle is returned, while the other permute replaces the other half of the
18283 // shuffle in the selection dag.
18284 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
18285                                                  SDValue V1, SDValue V2,
18286                                                  ArrayRef<int> Mask,
18287                                                  SelectionDAG &DAG) {
18288   if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
18289       VT != MVT::v32i8)
18290     return SDValue();
18291   // <B0, B1, B0+1, B1+1, ..., >
18292   auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
18293                                    unsigned Begin1) {
18294     size_t Size = Mask.size();
18295     assert(Size % 2 == 0 && "Expected even mask size");
18296     for (unsigned I = 0; I < Size; I += 2) {
18297       if (Mask[I] != (int)(Begin0 + I / 2) ||
18298           Mask[I + 1] != (int)(Begin1 + I / 2))
18299         return false;
18300     }
18301     return true;
18302   };
18303   // Check which half is this shuffle node
18304   int NumElts = VT.getVectorNumElements();
18305   size_t FirstQtr = NumElts / 2;
18306   size_t ThirdQtr = NumElts + NumElts / 2;
18307   bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
18308   bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
18309   if (!IsFirstHalf && !IsSecondHalf)
18310     return SDValue();
18311
18312   // Find the intersection between shuffle users of V1 and V2.
18313   SmallVector<SDNode *, 2> Shuffles;
18314   for (SDNode *User : V1->uses())
18315     if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
18316         User->getOperand(1) == V2)
18317       Shuffles.push_back(User);
18318   // Limit user size to two for now.
18319   if (Shuffles.size() != 2)
18320     return SDValue();
18321   // Find out which half of the 512-bit shuffles is each smaller shuffle
18322   auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
18323   auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
18324   SDNode *FirstHalf;
18325   SDNode *SecondHalf;
18326   if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
18327       IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
18328     FirstHalf = Shuffles[0];
18329     SecondHalf = Shuffles[1];
18330   } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
18331              IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
18332     FirstHalf = Shuffles[1];
18333     SecondHalf = Shuffles[0];
18334   } else {
18335     return SDValue();
18336   }
18337   // Lower into unpck and perm. Return the perm of this shuffle and replace
18338   // the other.
18339   SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
18340   SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
18341   SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18342                               DAG.getTargetConstant(0x20, DL, MVT::i8));
18343   SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18344                               DAG.getTargetConstant(0x31, DL, MVT::i8));
18345   if (IsFirstHalf) {
18346     DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
18347     return Perm1;
18348   }
18349   DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
18350   return Perm2;
18351 }
18352
18353 /// Handle lowering of 4-lane 64-bit floating point shuffles.
18354 ///
18355 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
18356 /// isn't available.
18357 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18358                                  const APInt &Zeroable, SDValue V1, SDValue V2,
18359                                  const X86Subtarget &Subtarget,
18360                                  SelectionDAG &DAG) {
18361   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18362   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18363   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18364
18365   if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
18366                                      Subtarget, DAG))
18367     return V;
18368
18369   if (V2.isUndef()) {
18370     // Check for being able to broadcast a single element.
18371     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
18372                                                     Mask, Subtarget, DAG))
18373       return Broadcast;
18374
18375     // Use low duplicate instructions for masks that match their pattern.
18376     if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
18377       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
18378
18379     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
18380       // Non-half-crossing single input shuffles can be lowered with an
18381       // interleaved permutation.
18382       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18383                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
18384       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
18385                          DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18386     }
18387
18388     // With AVX2 we have direct support for this permutation.
18389     if (Subtarget.hasAVX2())
18390       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
18391                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18392
18393     // Try to create an in-lane repeating shuffle mask and then shuffle the
18394     // results into the target lanes.
18395     if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18396             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18397       return V;
18398
18399     // Try to permute the lanes and then use a per-lane permute.
18400     if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
18401                                                         Mask, DAG, Subtarget))
18402       return V;
18403
18404     // Otherwise, fall back.
18405     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
18406                                                DAG, Subtarget);
18407   }
18408
18409   // Use dedicated unpack instructions for masks that match their pattern.
18410   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
18411     return V;
18412
18413   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
18414                                           Zeroable, Subtarget, DAG))
18415     return Blend;
18416
18417   // Check if the blend happens to exactly fit that of SHUFPD.
18418   if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
18419                                           Zeroable, Subtarget, DAG))
18420     return Op;
18421
18422   bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18423   bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18424
18425   // If we have lane crossing shuffles AND they don't all come from the lower
18426   // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
18427   // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
18428   // canonicalize to a blend of splat which isn't necessary for this combine.
18429   if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
18430       !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
18431       (V1.getOpcode() != ISD::BUILD_VECTOR) &&
18432       (V2.getOpcode() != ISD::BUILD_VECTOR))
18433     return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
18434
18435   // If we have one input in place, then we can permute the other input and
18436   // blend the result.
18437   if (V1IsInPlace || V2IsInPlace)
18438     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18439                                                 Subtarget, DAG);
18440
18441   // Try to create an in-lane repeating shuffle mask and then shuffle the
18442   // results into the target lanes.
18443   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18444           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18445     return V;
18446
18447   // Try to simplify this by merging 128-bit lanes to enable a lane-based
18448   // shuffle. However, if we have AVX2 and either inputs are already in place,
18449   // we will be able to shuffle even across lanes the other input in a single
18450   // instruction so skip this pattern.
18451   if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
18452     if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
18453             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18454       return V;
18455
18456   // If we have VLX support, we can use VEXPAND.
18457   if (Subtarget.hasVLX())
18458     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
18459                                          DAG, Subtarget))
18460       return V;
18461
18462   // If we have AVX2 then we always want to lower with a blend because at v4 we
18463   // can fully permute the elements.
18464   if (Subtarget.hasAVX2())
18465     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18466                                                 Subtarget, DAG);
18467
18468   // Otherwise fall back on generic lowering.
18469   return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
18470                                     Subtarget, DAG);
18471 }
18472
18473 /// Handle lowering of 4-lane 64-bit integer shuffles.
18474 ///
18475 /// This routine is only called when we have AVX2 and thus a reasonable
18476 /// instruction set for v4i64 shuffling..
18477 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18478                                  const APInt &Zeroable, SDValue V1, SDValue V2,
18479                                  const X86Subtarget &Subtarget,
18480                                  SelectionDAG &DAG) {
18481   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18482   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18483   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18484   assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
18485
18486   if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18487                                      Subtarget, DAG))
18488     return V;
18489
18490   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
18491                                           Zeroable, Subtarget, DAG))
18492     return Blend;
18493
18494   // Check for being able to broadcast a single element.
18495   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
18496                                                   Subtarget, DAG))
18497     return Broadcast;
18498
18499   // Try to use shift instructions if fast.
18500   if (Subtarget.preferLowerShuffleAsShift())
18501     if (SDValue Shift =
18502             lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18503                                 Subtarget, DAG, /*BitwiseOnly*/ true))
18504       return Shift;
18505
18506   if (V2.isUndef()) {
18507     // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18508     // can use lower latency instructions that will operate on both lanes.
18509     SmallVector<int, 2> RepeatedMask;
18510     if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
18511       SmallVector<int, 4> PSHUFDMask;
18512       narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
18513       return DAG.getBitcast(
18514           MVT::v4i64,
18515           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
18516                       DAG.getBitcast(MVT::v8i32, V1),
18517                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18518     }
18519
18520     // AVX2 provides a direct instruction for permuting a single input across
18521     // lanes.
18522     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
18523                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18524   }
18525
18526   // Try to use shift instructions.
18527   if (SDValue Shift =
18528           lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
18529                               DAG, /*BitwiseOnly*/ false))
18530     return Shift;
18531
18532   // If we have VLX support, we can use VALIGN or VEXPAND.
18533   if (Subtarget.hasVLX()) {
18534     if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
18535                                               Subtarget, DAG))
18536       return Rotate;
18537
18538     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
18539                                          DAG, Subtarget))
18540       return V;
18541   }
18542
18543   // Try to use PALIGNR.
18544   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
18545                                                 Subtarget, DAG))
18546     return Rotate;
18547
18548   // Use dedicated unpack instructions for masks that match their pattern.
18549   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
18550     return V;
18551
18552   bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18553   bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18554
18555   // If we have one input in place, then we can permute the other input and
18556   // blend the result.
18557   if (V1IsInPlace || V2IsInPlace)
18558     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18559                                                 Subtarget, DAG);
18560
18561   // Try to create an in-lane repeating shuffle mask and then shuffle the
18562   // results into the target lanes.
18563   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18564           DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18565     return V;
18566
18567   // Try to lower to PERMQ(BLENDD(V1,V2)).
18568   if (SDValue V =
18569           lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
18570     return V;
18571
18572   // Try to simplify this by merging 128-bit lanes to enable a lane-based
18573   // shuffle. However, if we have AVX2 and either inputs are already in place,
18574   // we will be able to shuffle even across lanes the other input in a single
18575   // instruction so skip this pattern.
18576   if (!V1IsInPlace && !V2IsInPlace)
18577     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18578             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18579       return Result;
18580
18581   // Otherwise fall back on generic blend lowering.
18582   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18583                                               Subtarget, DAG);
18584 }
18585
18586 /// Handle lowering of 8-lane 32-bit floating point shuffles.
18587 ///
18588 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
18589 /// isn't available.
18590 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18591                                  const APInt &Zeroable, SDValue V1, SDValue V2,
18592                                  const X86Subtarget &Subtarget,
18593                                  SelectionDAG &DAG) {
18594   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
18595   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
18596   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18597
18598   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
18599                                           Zeroable, Subtarget, DAG))
18600     return Blend;
18601
18602   // Check for being able to broadcast a single element.
18603   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
18604                                                   Subtarget, DAG))
18605     return Broadcast;
18606
18607   if (!Subtarget.hasAVX2()) {
18608     SmallVector<int> InLaneMask;
18609     computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
18610
18611     if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
18612       if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
18613                                            /*SimpleOnly*/ true))
18614         return R;
18615   }
18616   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18617                                                    Zeroable, Subtarget, DAG))
18618     return DAG.getBitcast(MVT::v8f32, ZExt);
18619
18620   // If the shuffle mask is repeated in each 128-bit lane, we have many more
18621   // options to efficiently lower the shuffle.
18622   SmallVector<int, 4> RepeatedMask;
18623   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
18624     assert(RepeatedMask.size() == 4 &&
18625            "Repeated masks must be half the mask width!");
18626
18627     // Use even/odd duplicate instructions for masks that match their pattern.
18628     if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18629       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
18630     if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18631       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
18632
18633     if (V2.isUndef())
18634       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
18635                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18636
18637     // Use dedicated unpack instructions for masks that match their pattern.
18638     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
18639       return V;
18640
18641     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
18642     // have already handled any direct blends.
18643     return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
18644   }
18645
18646   // Try to create an in-lane repeating shuffle mask and then shuffle the
18647   // results into the target lanes.
18648   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18649           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18650     return V;
18651
18652   // If we have a single input shuffle with different shuffle patterns in the
18653   // two 128-bit lanes use the variable mask to VPERMILPS.
18654   if (V2.isUndef()) {
18655     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
18656       SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18657       return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
18658     }
18659     if (Subtarget.hasAVX2()) {
18660       SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18661       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
18662     }
18663     // Otherwise, fall back.
18664     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
18665                                                DAG, Subtarget);
18666   }
18667
18668   // Try to simplify this by merging 128-bit lanes to enable a lane-based
18669   // shuffle.
18670   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18671           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18672     return Result;
18673
18674   // If we have VLX support, we can use VEXPAND.
18675   if (Subtarget.hasVLX())
18676     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
18677                                          DAG, Subtarget))
18678       return V;
18679
18680   // Try to match an interleave of two v8f32s and lower them as unpck and
18681   // permutes using ymms. This needs to go before we try to split the vectors.
18682   //
18683   // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
18684   // this path inadvertently.
18685   if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
18686     if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
18687                                                       Mask, DAG))
18688       return V;
18689
18690   // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
18691   // since after split we get a more efficient code using vpunpcklwd and
18692   // vpunpckhwd instrs than vblend.
18693   if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
18694     return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
18695                                       DAG);
18696
18697   // If we have AVX2 then we always want to lower with a blend because at v8 we
18698   // can fully permute the elements.
18699   if (Subtarget.hasAVX2())
18700     return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
18701                                                 Subtarget, DAG);
18702
18703   // Otherwise fall back on generic lowering.
18704   return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
18705                                     Subtarget, DAG);
18706 }
18707
18708 /// Handle lowering of 8-lane 32-bit integer shuffles.
18709 ///
18710 /// This routine is only called when we have AVX2 and thus a reasonable
18711 /// instruction set for v8i32 shuffling..
18712 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18713                                  const APInt &Zeroable, SDValue V1, SDValue V2,
18714                                  const X86Subtarget &Subtarget,
18715                                  SelectionDAG &DAG) {
18716   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18717   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18718   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18719   assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
18720
18721   int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
18722
18723   // Whenever we can lower this as a zext, that instruction is strictly faster
18724   // than any alternative. It also allows us to fold memory operands into the
18725   // shuffle in many cases.
18726   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18727                                                    Zeroable, Subtarget, DAG))
18728     return ZExt;
18729
18730   // Try to match an interleave of two v8i32s and lower them as unpck and
18731   // permutes using ymms. This needs to go before we try to split the vectors.
18732   if (!Subtarget.hasAVX512())
18733     if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
18734                                                       Mask, DAG))
18735       return V;
18736
18737   // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
18738   // since after split we get a more efficient code than vblend by using
18739   // vpunpcklwd and vpunpckhwd instrs.
18740   if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
18741       !Subtarget.hasAVX512())
18742     return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
18743                                       DAG);
18744
18745   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
18746                                           Zeroable, Subtarget, DAG))
18747     return Blend;
18748
18749   // Check for being able to broadcast a single element.
18750   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
18751                                                   Subtarget, DAG))
18752     return Broadcast;
18753
18754   // Try to use shift instructions if fast.
18755   if (Subtarget.preferLowerShuffleAsShift()) {
18756     if (SDValue Shift =
18757             lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
18758                                 Subtarget, DAG, /*BitwiseOnly*/ true))
18759       return Shift;
18760     if (NumV2Elements == 0)
18761       if (SDValue Rotate =
18762               lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18763         return Rotate;
18764   }
18765
18766   // If the shuffle mask is repeated in each 128-bit lane we can use more
18767   // efficient instructions that mirror the shuffles across the two 128-bit
18768   // lanes.
18769   SmallVector<int, 4> RepeatedMask;
18770   bool Is128BitLaneRepeatedShuffle =
18771       is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
18772   if (Is128BitLaneRepeatedShuffle) {
18773     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18774     if (V2.isUndef())
18775       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
18776                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18777
18778     // Use dedicated unpack instructions for masks that match their pattern.
18779     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
18780       return V;
18781   }
18782
18783   // Try to use shift instructions.
18784   if (SDValue Shift =
18785           lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
18786                               DAG, /*BitwiseOnly*/ false))
18787     return Shift;
18788
18789   if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
18790     if (SDValue Rotate =
18791             lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18792       return Rotate;
18793
18794   // If we have VLX support, we can use VALIGN or EXPAND.
18795   if (Subtarget.hasVLX()) {
18796     if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
18797                                               Subtarget, DAG))
18798       return Rotate;
18799
18800     if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
18801                                          DAG, Subtarget))
18802       return V;
18803   }
18804
18805   // Try to use byte rotation instructions.
18806   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
18807                                                 Subtarget, DAG))
18808     return Rotate;
18809
18810   // Try to create an in-lane repeating shuffle mask and then shuffle the
18811   // results into the target lanes.
18812   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18813           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18814     return V;
18815
18816   if (V2.isUndef()) {
18817     // Try to produce a fixed cross-128-bit lane permute followed by unpack
18818     // because that should be faster than the variable permute alternatives.
18819     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
18820       return V;
18821
18822     // If the shuffle patterns aren't repeated but it's a single input, directly
18823     // generate a cross-lane VPERMD instruction.
18824     SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18825     return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
18826   }
18827
18828   // Assume that a single SHUFPS is faster than an alternative sequence of
18829   // multiple instructions (even if the CPU has a domain penalty).
18830   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18831   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18832     SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
18833     SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
18834     SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
18835                                             CastV1, CastV2, DAG);
18836     return DAG.getBitcast(MVT::v8i32, ShufPS);
18837   }
18838
18839   // Try to simplify this by merging 128-bit lanes to enable a lane-based
18840   // shuffle.
18841   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18842           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18843     return Result;
18844
18845   // Otherwise fall back on generic blend lowering.
18846   return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
18847                                               Subtarget, DAG);
18848 }
18849
18850 /// Handle lowering of 16-lane 16-bit integer shuffles.
18851 ///
18852 /// This routine is only called when we have AVX2 and thus a reasonable
18853 /// instruction set for v16i16 shuffling..
18854 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18855                                   const APInt &Zeroable, SDValue V1, SDValue V2,
18856                                   const X86Subtarget &Subtarget,
18857                                   SelectionDAG &DAG) {
18858   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18859   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18860   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18861   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
18862
18863   // Whenever we can lower this as a zext, that instruction is strictly faster
18864   // than any alternative. It also allows us to fold memory operands into the
18865   // shuffle in many cases.
18866   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18867           DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18868     return ZExt;
18869
18870   // Check for being able to broadcast a single element.
18871   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
18872                                                   Subtarget, DAG))
18873     return Broadcast;
18874
18875   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
18876                                           Zeroable, Subtarget, DAG))
18877     return Blend;
18878
18879   // Use dedicated unpack instructions for masks that match their pattern.
18880   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
18881     return V;
18882
18883   // Use dedicated pack instructions for masks that match their pattern.
18884   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
18885                                        Subtarget))
18886     return V;
18887
18888   // Try to use lower using a truncation.
18889   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18890                                        Subtarget, DAG))
18891     return V;
18892
18893   // Try to use shift instructions.
18894   if (SDValue Shift =
18895           lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18896                               Subtarget, DAG, /*BitwiseOnly*/ false))
18897     return Shift;
18898
18899   // Try to use byte rotation instructions.
18900   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
18901                                                 Subtarget, DAG))
18902     return Rotate;
18903
18904   // Try to create an in-lane repeating shuffle mask and then shuffle the
18905   // results into the target lanes.
18906   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18907           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18908     return V;
18909
18910   if (V2.isUndef()) {
18911     // Try to use bit rotation instructions.
18912     if (SDValue Rotate =
18913             lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
18914       return Rotate;
18915
18916     // Try to produce a fixed cross-128-bit lane permute followed by unpack
18917     // because that should be faster than the variable permute alternatives.
18918     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
18919       return V;
18920
18921     // There are no generalized cross-lane shuffle operations available on i16
18922     // element types.
18923     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
18924       if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18925               DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18926         return V;
18927
18928       return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
18929                                                  DAG, Subtarget);
18930     }
18931
18932     SmallVector<int, 8> RepeatedMask;
18933     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
18934       // As this is a single-input shuffle, the repeated mask should be
18935       // a strictly valid v8i16 mask that we can pass through to the v8i16
18936       // lowering to handle even the v16 case.
18937       return lowerV8I16GeneralSingleInputShuffle(
18938           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
18939     }
18940   }
18941
18942   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
18943                                               Zeroable, Subtarget, DAG))
18944     return PSHUFB;
18945
18946   // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
18947   if (Subtarget.hasBWI())
18948     return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
18949
18950   // Try to simplify this by merging 128-bit lanes to enable a lane-based
18951   // shuffle.
18952   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18953           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18954     return Result;
18955
18956   // Try to permute the lanes and then use a per-lane permute.
18957   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18958           DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18959     return V;
18960
18961   // Try to match an interleave of two v16i16s and lower them as unpck and
18962   // permutes using ymms.
18963   if (!Subtarget.hasAVX512())
18964     if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
18965                                                       Mask, DAG))
18966       return V;
18967
18968   // Otherwise fall back on generic lowering.
18969   return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
18970                                     Subtarget, DAG);
18971 }
18972
18973 /// Handle lowering of 32-lane 8-bit integer shuffles.
18974 ///
18975 /// This routine is only called when we have AVX2 and thus a reasonable
18976 /// instruction set for v32i8 shuffling..
18977 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18978                                  const APInt &Zeroable, SDValue V1, SDValue V2,
18979                                  const X86Subtarget &Subtarget,
18980                                  SelectionDAG &DAG) {
18981   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18982   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18983   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18984   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
18985
18986   // Whenever we can lower this as a zext, that instruction is strictly faster
18987   // than any alternative. It also allows us to fold memory operands into the
18988   // shuffle in many cases.
18989   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
18990                                                    Zeroable, Subtarget, DAG))
18991     return ZExt;
18992
18993   // Check for being able to broadcast a single element.
18994   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
18995                                                   Subtarget, DAG))
18996     return Broadcast;
18997
18998   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
18999                                           Zeroable, Subtarget, DAG))
19000     return Blend;
19001
19002   // Use dedicated unpack instructions for masks that match their pattern.
19003   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
19004     return V;
19005
19006   // Use dedicated pack instructions for masks that match their pattern.
19007   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
19008                                        Subtarget))
19009     return V;
19010
19011   // Try to use lower using a truncation.
19012   if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
19013                                        Subtarget, DAG))
19014     return V;
19015
19016   // Try to use shift instructions.
19017   if (SDValue Shift =
19018           lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
19019                               DAG, /*BitwiseOnly*/ false))
19020     return Shift;
19021
19022   // Try to use byte rotation instructions.
19023   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
19024                                                 Subtarget, DAG))
19025     return Rotate;
19026
19027   // Try to use bit rotation instructions.
19028   if (V2.isUndef())
19029     if (SDValue Rotate =
19030             lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
19031       return Rotate;
19032
19033   // Try to create an in-lane repeating shuffle mask and then shuffle the
19034   // results into the target lanes.
19035   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19036           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
19037     return V;
19038
19039   // There are no generalized cross-lane shuffle operations available on i8
19040   // element types.
19041   if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
19042     // Try to produce a fixed cross-128-bit lane permute followed by unpack
19043     // because that should be faster than the variable permute alternatives.
19044     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
19045       return V;
19046
19047     if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
19048             DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
19049       return V;
19050
19051     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
19052                                                DAG, Subtarget);
19053   }
19054
19055   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
19056                                               Zeroable, Subtarget, DAG))
19057     return PSHUFB;
19058
19059   // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
19060   if (Subtarget.hasVBMI())
19061     return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
19062
19063   // Try to simplify this by merging 128-bit lanes to enable a lane-based
19064   // shuffle.
19065   if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
19066           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
19067     return Result;
19068
19069   // Try to permute the lanes and then use a per-lane permute.
19070   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
19071           DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
19072     return V;
19073
19074   // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
19075   // by zeroable elements in the remaining 24 elements. Turn this into two
19076   // vmovqb instructions shuffled together.
19077   if (Subtarget.hasVLX())
19078     if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
19079                                                   Mask, Zeroable, DAG))
19080       return V;
19081
19082   // Try to match an interleave of two v32i8s and lower them as unpck and
19083   // permutes using ymms.
19084   if (!Subtarget.hasAVX512())
19085     if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
19086                                                       Mask, DAG))
19087       return V;
19088
19089   // Otherwise fall back on generic lowering.
19090   return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
19091                                     Subtarget, DAG);
19092 }
19093
19094 /// High-level routine to lower various 256-bit x86 vector shuffles.
19095 ///
19096 /// This routine either breaks down the specific type of a 256-bit x86 vector
19097 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
19098 /// together based on the available instructions.
19099 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
19100                                   SDValue V1, SDValue V2, const APInt &Zeroable,
19101                                   const X86Subtarget &Subtarget,
19102                                   SelectionDAG &DAG) {
19103   // If we have a single input to the zero element, insert that into V1 if we
19104   // can do so cheaply.
19105   int NumElts = VT.getVectorNumElements();
19106   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
19107
19108   if (NumV2Elements == 1 && Mask[0] >= NumElts)
19109     if (SDValue Insertion = lowerShuffleAsElementInsertion(
19110             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
19111       return Insertion;
19112
19113   // Handle special cases where the lower or upper half is UNDEF.
19114   if (SDValue V =
19115           lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
19116     return V;
19117
19118   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
19119   // can check for those subtargets here and avoid much of the subtarget
19120   // querying in the per-vector-type lowering routines. With AVX1 we have
19121   // essentially *zero* ability to manipulate a 256-bit vector with integer
19122   // types. Since we'll use floating point types there eventually, just
19123   // immediately cast everything to a float and operate entirely in that domain.
19124   if (VT.isInteger() && !Subtarget.hasAVX2()) {
19125     int ElementBits = VT.getScalarSizeInBits();
19126     if (ElementBits < 32) {
19127       // No floating point type available, if we can't use the bit operations
19128       // for masking/blending then decompose into 128-bit vectors.
19129       if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
19130                                             Subtarget, DAG))
19131         return V;
19132       if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
19133         return V;
19134       return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19135     }
19136
19137     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
19138                                 VT.getVectorNumElements());
19139     V1 = DAG.getBitcast(FpVT, V1);
19140     V2 = DAG.getBitcast(FpVT, V2);
19141     return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
19142   }
19143
19144   if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
19145     V1 = DAG.getBitcast(MVT::v16i16, V1);
19146     V2 = DAG.getBitcast(MVT::v16i16, V2);
19147     return DAG.getBitcast(VT,
19148                           DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
19149   }
19150
19151   switch (VT.SimpleTy) {
19152   case MVT::v4f64:
19153     return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19154   case MVT::v4i64:
19155     return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19156   case MVT::v8f32:
19157     return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19158   case MVT::v8i32:
19159     return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19160   case MVT::v16i16:
19161     return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19162   case MVT::v32i8:
19163     return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19164
19165   default:
19166     llvm_unreachable("Not a valid 256-bit x86 vector type!");
19167   }
19168 }
19169
19170 /// Try to lower a vector shuffle as a 128-bit shuffles.
19171 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
19172                                   const APInt &Zeroable, SDValue V1, SDValue V2,
19173                                   const X86Subtarget &Subtarget,
19174                                   SelectionDAG &DAG) {
19175   assert(VT.getScalarSizeInBits() == 64 &&
19176          "Unexpected element type size for 128bit shuffle.");
19177
19178   // To handle 256 bit vector requires VLX and most probably
19179   // function lowerV2X128VectorShuffle() is better solution.
19180   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
19181
19182   // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
19183   SmallVector<int, 4> Widened128Mask;
19184   if (!canWidenShuffleElements(Mask, Widened128Mask))
19185     return SDValue();
19186   assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
19187
19188   // Try to use an insert into a zero vector.
19189   if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
19190       (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
19191     unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
19192     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
19193     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
19194                               DAG.getIntPtrConstant(0, DL));
19195     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19196                        getZeroVector(VT, Subtarget, DAG, DL), LoV,
19197                        DAG.getIntPtrConstant(0, DL));
19198   }
19199
19200   // Check for patterns which can be matched with a single insert of a 256-bit
19201   // subvector.
19202   bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
19203   if (OnlyUsesV1 ||
19204       isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
19205     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
19206     SDValue SubVec =
19207         DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
19208                     DAG.getIntPtrConstant(0, DL));
19209     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
19210                        DAG.getIntPtrConstant(4, DL));
19211   }
19212
19213   // See if this is an insertion of the lower 128-bits of V2 into V1.
19214   bool IsInsert = true;
19215   int V2Index = -1;
19216   for (int i = 0; i < 4; ++i) {
19217     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19218     if (Widened128Mask[i] < 0)
19219       continue;
19220
19221     // Make sure all V1 subvectors are in place.
19222     if (Widened128Mask[i] < 4) {
19223       if (Widened128Mask[i] != i) {
19224         IsInsert = false;
19225         break;
19226       }
19227     } else {
19228       // Make sure we only have a single V2 index and its the lowest 128-bits.
19229       if (V2Index >= 0 || Widened128Mask[i] != 4) {
19230         IsInsert = false;
19231         break;
19232       }
19233       V2Index = i;
19234     }
19235   }
19236   if (IsInsert && V2Index >= 0) {
19237     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
19238     SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
19239                                  DAG.getIntPtrConstant(0, DL));
19240     return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
19241   }
19242
19243   // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane
19244   // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
19245   // possible we at least ensure the lanes stay sequential to help later
19246   // combines.
19247   SmallVector<int, 2> Widened256Mask;
19248   if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
19249     Widened128Mask.clear();
19250     narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
19251   }
19252
19253   // Try to lower to vshuf64x2/vshuf32x4.
19254   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
19255   unsigned PermMask = 0;
19256   // Insure elements came from the same Op.
19257   for (int i = 0; i < 4; ++i) {
19258     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19259     if (Widened128Mask[i] < 0)
19260       continue;
19261
19262     SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
19263     unsigned OpIndex = i / 2;
19264     if (Ops[OpIndex].isUndef())
19265       Ops[OpIndex] = Op;
19266     else if (Ops[OpIndex] != Op)
19267       return SDValue();
19268
19269     // Convert the 128-bit shuffle mask selection values into 128-bit selection
19270     // bits defined by a vshuf64x2 instruction's immediate control byte.
19271     PermMask |= (Widened128Mask[i] % 4) << (i * 2);
19272   }
19273
19274   return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
19275                      DAG.getTargetConstant(PermMask, DL, MVT::i8));
19276 }
19277
19278 /// Handle lowering of 8-lane 64-bit floating point shuffles.
19279 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19280                                  const APInt &Zeroable, SDValue V1, SDValue V2,
19281                                  const X86Subtarget &Subtarget,
19282                                  SelectionDAG &DAG) {
19283   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19284   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19285   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19286
19287   if (V2.isUndef()) {
19288     // Use low duplicate instructions for masks that match their pattern.
19289     if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
19290       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
19291
19292     if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
19293       // Non-half-crossing single input shuffles can be lowered with an
19294       // interleaved permutation.
19295       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
19296                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
19297                               ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
19298                               ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
19299       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
19300                          DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
19301     }
19302
19303     SmallVector<int, 4> RepeatedMask;
19304     if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
19305       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
19306                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19307   }
19308
19309   if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
19310                                            V2, Subtarget, DAG))
19311     return Shuf128;
19312
19313   if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
19314     return Unpck;
19315
19316   // Check if the blend happens to exactly fit that of SHUFPD.
19317   if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
19318                                           Zeroable, Subtarget, DAG))
19319     return Op;
19320
19321   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
19322                                        DAG, Subtarget))
19323     return V;
19324
19325   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
19326                                           Zeroable, Subtarget, DAG))
19327     return Blend;
19328
19329   return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
19330 }
19331
19332 /// Handle lowering of 16-lane 32-bit floating point shuffles.
19333 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19334                                   const APInt &Zeroable, SDValue V1, SDValue V2,
19335                                   const X86Subtarget &Subtarget,
19336                                   SelectionDAG &DAG) {
19337   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19338   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19339   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19340
19341   // If the shuffle mask is repeated in each 128-bit lane, we have many more
19342   // options to efficiently lower the shuffle.
19343   SmallVector<int, 4> RepeatedMask;
19344   if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
19345     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19346
19347     // Use even/odd duplicate instructions for masks that match their pattern.
19348     if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
19349       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
19350     if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
19351       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
19352
19353     if (V2.isUndef())
19354       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
19355                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19356
19357     // Use dedicated unpack instructions for masks that match their pattern.
19358     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
19359       return V;
19360
19361     if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19362                                             Zeroable, Subtarget, DAG))
19363       return Blend;
19364
19365     // Otherwise, fall back to a SHUFPS sequence.
19366     return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
19367   }
19368
19369   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19370                                           Zeroable, Subtarget, DAG))
19371     return Blend;
19372
19373   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19374           DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19375     return DAG.getBitcast(MVT::v16f32, ZExt);
19376
19377   // Try to create an in-lane repeating shuffle mask and then shuffle the
19378   // results into the target lanes.
19379   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19380           DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
19381     return V;
19382
19383   // If we have a single input shuffle with different shuffle patterns in the
19384   // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
19385   if (V2.isUndef() &&
19386       !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
19387     SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
19388     return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
19389   }
19390
19391   // If we have AVX512F support, we can use VEXPAND.
19392   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
19393                                              V1, V2, DAG, Subtarget))
19394     return V;
19395
19396   return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
19397 }
19398
19399 /// Handle lowering of 8-lane 64-bit integer shuffles.
19400 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19401                                  const APInt &Zeroable, SDValue V1, SDValue V2,
19402                                  const X86Subtarget &Subtarget,
19403                                  SelectionDAG &DAG) {
19404   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19405   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19406   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19407
19408   // Try to use shift instructions if fast.
19409   if (Subtarget.preferLowerShuffleAsShift())
19410     if (SDValue Shift =
19411             lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
19412                                 Subtarget, DAG, /*BitwiseOnly*/ true))
19413       return Shift;
19414
19415   if (V2.isUndef()) {
19416     // When the shuffle is mirrored between the 128-bit lanes of the unit, we
19417     // can use lower latency instructions that will operate on all four
19418     // 128-bit lanes.
19419     SmallVector<int, 2> Repeated128Mask;
19420     if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
19421       SmallVector<int, 4> PSHUFDMask;
19422       narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
19423       return DAG.getBitcast(
19424           MVT::v8i64,
19425           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
19426                       DAG.getBitcast(MVT::v16i32, V1),
19427                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
19428     }
19429
19430     SmallVector<int, 4> Repeated256Mask;
19431     if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
19432       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
19433                          getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
19434   }
19435
19436   if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
19437                                            V2, Subtarget, DAG))
19438     return Shuf128;
19439
19440   // Try to use shift instructions.
19441   if (SDValue Shift =
19442           lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
19443                               DAG, /*BitwiseOnly*/ false))
19444     return Shift;
19445
19446   // Try to use VALIGN.
19447   if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
19448                                             Subtarget, DAG))
19449     return Rotate;
19450
19451   // Try to use PALIGNR.
19452   if (Subtarget.hasBWI())
19453     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
19454                                                   Subtarget, DAG))
19455       return Rotate;
19456
19457   if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
19458     return Unpck;
19459
19460   // If we have AVX512F support, we can use VEXPAND.
19461   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
19462                                        DAG, Subtarget))
19463     return V;
19464
19465   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
19466                                           Zeroable, Subtarget, DAG))
19467     return Blend;
19468
19469   return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
19470 }
19471
19472 /// Handle lowering of 16-lane 32-bit integer shuffles.
19473 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19474                                   const APInt &Zeroable, SDValue V1, SDValue V2,
19475                                   const X86Subtarget &Subtarget,
19476                                   SelectionDAG &DAG) {
19477   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19478   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19479   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19480
19481   int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
19482
19483   // Whenever we can lower this as a zext, that instruction is strictly faster
19484   // than any alternative. It also allows us to fold memory operands into the
19485   // shuffle in many cases.
19486   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19487           DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19488     return ZExt;
19489
19490   // Try to use shift instructions if fast.
19491   if (Subtarget.preferLowerShuffleAsShift()) {
19492     if (SDValue Shift =
19493             lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19494                                 Subtarget, DAG, /*BitwiseOnly*/ true))
19495       return Shift;
19496     if (NumV2Elements == 0)
19497       if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
19498                                                    Subtarget, DAG))
19499         return Rotate;
19500   }
19501
19502   // If the shuffle mask is repeated in each 128-bit lane we can use more
19503   // efficient instructions that mirror the shuffles across the four 128-bit
19504   // lanes.
19505   SmallVector<int, 4> RepeatedMask;
19506   bool Is128BitLaneRepeatedShuffle =
19507       is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
19508   if (Is128BitLaneRepeatedShuffle) {
19509     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19510     if (V2.isUndef())
19511       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
19512                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19513
19514     // Use dedicated unpack instructions for masks that match their pattern.
19515     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
19516       return V;
19517   }
19518
19519   // Try to use shift instructions.
19520   if (SDValue Shift =
19521           lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19522                               Subtarget, DAG, /*BitwiseOnly*/ false))
19523     return Shift;
19524
19525   if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
19526     if (SDValue Rotate =
19527             lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
19528       return Rotate;
19529
19530   // Try to use VALIGN.
19531   if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
19532                                             Subtarget, DAG))
19533     return Rotate;
19534
19535   // Try to use byte rotation instructions.
19536   if (Subtarget.hasBWI())
19537     if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
19538                                                   Subtarget, DAG))
19539       return Rotate;
19540
19541   // Assume that a single SHUFPS is faster than using a permv shuffle.
19542   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
19543   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
19544     SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
19545     SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
19546     SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
19547                                             CastV1, CastV2, DAG);
19548     return DAG.getBitcast(MVT::v16i32, ShufPS);
19549   }
19550
19551   // Try to create an in-lane repeating shuffle mask and then shuffle the
19552   // results into the target lanes.
19553   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19554           DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
19555     return V;
19556
19557   // If we have AVX512F support, we can use VEXPAND.
19558   if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
19559                                        DAG, Subtarget))
19560     return V;
19561
19562   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
19563                                           Zeroable, Subtarget, DAG))
19564     return Blend;
19565
19566   return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
19567 }
19568
19569 /// Handle lowering of 32-lane 16-bit integer shuffles.
19570 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19571                                   const APInt &Zeroable, SDValue V1, SDValue V2,
19572                                   const X86Subtarget &Subtarget,
19573                                   SelectionDAG &DAG) {
19574   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19575   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19576   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
19577   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
19578
19579   // Whenever we can lower this as a zext, that instruction is strictly faster
19580   // than any alternative. It also allows us to fold memory operands into the
19581   // shuffle in many cases.
19582   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19583           DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
19584     return ZExt;
19585
19586   // Use dedicated unpack instructions for masks that match their pattern.
19587   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
19588     return V;
19589
19590   // Use dedicated pack instructions for masks that match their pattern.
19591   if (SDValue V =
19592           lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
19593     return V;
19594
19595   // Try to use shift instructions.
19596   if (SDValue Shift =
19597           lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
19598                               Subtarget, DAG, /*BitwiseOnly*/ false))
19599     return Shift;
19600
19601   // Try to use byte rotation instructions.
19602   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
19603                                                 Subtarget, DAG))
19604     return Rotate;
19605
19606   if (V2.isUndef()) {
19607     // Try to use bit rotation instructions.
19608     if (SDValue Rotate =
19609             lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
19610       return Rotate;
19611
19612     SmallVector<int, 8> RepeatedMask;
19613     if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
19614       // As this is a single-input shuffle, the repeated mask should be
19615       // a strictly valid v8i16 mask that we can pass through to the v8i16
19616       // lowering to handle even the v32 case.
19617       return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
19618                                                  RepeatedMask, Subtarget, DAG);
19619     }
19620   }
19621
19622   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
19623                                           Zeroable, Subtarget, DAG))
19624     return Blend;
19625
19626   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
19627                                               Zeroable, Subtarget, DAG))
19628     return PSHUFB;
19629
19630   return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
19631 }
19632
19633 /// Handle lowering of 64-lane 8-bit integer shuffles.
19634 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19635                                  const APInt &Zeroable, SDValue V1, SDValue V2,
19636                                  const X86Subtarget &Subtarget,
19637                                  SelectionDAG &DAG) {
19638   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19639   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19640   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
19641   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
19642
19643   // Whenever we can lower this as a zext, that instruction is strictly faster
19644   // than any alternative. It also allows us to fold memory operands into the
19645   // shuffle in many cases.
19646   if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19647           DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
19648     return ZExt;
19649
19650   // Use dedicated unpack instructions for masks that match their pattern.
19651   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
19652     return V;
19653
19654   // Use dedicated pack instructions for masks that match their pattern.
19655   if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
19656                                        Subtarget))
19657     return V;
19658
19659   // Try to use shift instructions.
19660   if (SDValue Shift =
19661           lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
19662                               DAG, /*BitwiseOnly*/ false))
19663     return Shift;
19664
19665   // Try to use byte rotation instructions.
19666   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
19667                                                 Subtarget, DAG))
19668     return Rotate;
19669
19670   // Try to use bit rotation instructions.
19671   if (V2.isUndef())
19672     if (SDValue Rotate =
19673             lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
19674       return Rotate;
19675
19676   // Lower as AND if possible.
19677   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
19678                                              Zeroable, Subtarget, DAG))
19679     return Masked;
19680
19681   if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
19682                                               Zeroable, Subtarget, DAG))
19683     return PSHUFB;
19684
19685   // Try to create an in-lane repeating shuffle mask and then shuffle the
19686   // results into the target lanes.
19687   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19688           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19689     return V;
19690
19691   if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
19692           DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
19693     return Result;
19694
19695   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
19696                                           Zeroable, Subtarget, DAG))
19697     return Blend;
19698
19699   if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
19700     // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
19701     // PALIGNR will be cheaper than the second PSHUFB+OR.
19702     if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
19703                                                        Mask, Subtarget, DAG))
19704       return V;
19705
19706     // If we can't directly blend but can use PSHUFB, that will be better as it
19707     // can both shuffle and set up the inefficient blend.
19708     bool V1InUse, V2InUse;
19709     return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
19710                                         DAG, V1InUse, V2InUse);
19711   }
19712
19713   // Try to simplify this by merging 128-bit lanes to enable a lane-based
19714   // shuffle.
19715   if (!V2.isUndef())
19716     if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
19717             DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19718       return Result;
19719
19720   // VBMI can use VPERMV/VPERMV3 byte shuffles.
19721   if (Subtarget.hasVBMI())
19722     return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
19723
19724   return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19725 }
19726
19727 /// High-level routine to lower various 512-bit x86 vector shuffles.
19728 ///
19729 /// This routine either breaks down the specific type of a 512-bit x86 vector
19730 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
19731 /// together based on the available instructions.
19732 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19733                                   MVT VT, SDValue V1, SDValue V2,
19734                                   const APInt &Zeroable,
19735                                   const X86Subtarget &Subtarget,
19736                                   SelectionDAG &DAG) {
19737   assert(Subtarget.hasAVX512() &&
19738          "Cannot lower 512-bit vectors w/ basic ISA!");
19739
19740   // If we have a single input to the zero element, insert that into V1 if we
19741   // can do so cheaply.
19742   int NumElts = Mask.size();
19743   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
19744
19745   if (NumV2Elements == 1 && Mask[0] >= NumElts)
19746     if (SDValue Insertion = lowerShuffleAsElementInsertion(
19747             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
19748       return Insertion;
19749
19750   // Handle special cases where the lower or upper half is UNDEF.
19751   if (SDValue V =
19752           lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
19753     return V;
19754
19755   // Check for being able to broadcast a single element.
19756   if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
19757                                                   Subtarget, DAG))
19758     return Broadcast;
19759
19760   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
19761     // Try using bit ops for masking and blending before falling back to
19762     // splitting.
19763     if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
19764                                           Subtarget, DAG))
19765       return V;
19766     if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
19767       return V;
19768
19769     return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19770   }
19771
19772   if (VT == MVT::v32f16) {
19773     V1 = DAG.getBitcast(MVT::v32i16, V1);
19774     V2 = DAG.getBitcast(MVT::v32i16, V2);
19775     return DAG.getBitcast(MVT::v32f16,
19776                           DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
19777   }
19778
19779   // Dispatch to each element type for lowering. If we don't have support for
19780   // specific element type shuffles at 512 bits, immediately split them and
19781   // lower them. Each lowering routine of a given type is allowed to assume that
19782   // the requisite ISA extensions for that element type are available.
19783   switch (VT.SimpleTy) {
19784   case MVT::v8f64:
19785     return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19786   case MVT::v16f32:
19787     return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19788   case MVT::v8i64:
19789     return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19790   case MVT::v16i32:
19791     return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19792   case MVT::v32i16:
19793     return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19794   case MVT::v64i8:
19795     return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19796
19797   default:
19798     llvm_unreachable("Not a valid 512-bit x86 vector type!");
19799   }
19800 }
19801
19802 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
19803                                          MVT VT, SDValue V1, SDValue V2,
19804                                          const X86Subtarget &Subtarget,
19805                                          SelectionDAG &DAG) {
19806   // Shuffle should be unary.
19807   if (!V2.isUndef())
19808     return SDValue();
19809
19810   int ShiftAmt = -1;
19811   int NumElts = Mask.size();
19812   for (int i = 0; i != NumElts; ++i) {
19813     int M = Mask[i];
19814     assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
19815            "Unexpected mask index.");
19816     if (M < 0)
19817       continue;
19818
19819     // The first non-undef element determines our shift amount.
19820     if (ShiftAmt < 0) {
19821       ShiftAmt = M - i;
19822       // Need to be shifting right.
19823       if (ShiftAmt <= 0)
19824         return SDValue();
19825     }
19826     // All non-undef elements must shift by the same amount.
19827     if (ShiftAmt != M - i)
19828       return SDValue();
19829   }
19830   assert(ShiftAmt >= 0 && "All undef?");
19831
19832   // Great we found a shift right.
19833   MVT WideVT = VT;
19834   if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19835     WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19836   SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19837                             DAG.getUNDEF(WideVT), V1,
19838                             DAG.getIntPtrConstant(0, DL));
19839   Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
19840                     DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19841   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19842                      DAG.getIntPtrConstant(0, DL));
19843 }
19844
19845 // Determine if this shuffle can be implemented with a KSHIFT instruction.
19846 // Returns the shift amount if possible or -1 if not. This is a simplified
19847 // version of matchShuffleAsShift.
19848 static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
19849                                     int MaskOffset, const APInt &Zeroable) {
19850   int Size = Mask.size();
19851
19852   auto CheckZeros = [&](int Shift, bool Left) {
19853     for (int j = 0; j < Shift; ++j)
19854       if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
19855         return false;
19856
19857     return true;
19858   };
19859
19860   auto MatchShift = [&](int Shift, bool Left) {
19861     unsigned Pos = Left ? Shift : 0;
19862     unsigned Low = Left ? 0 : Shift;
19863     unsigned Len = Size - Shift;
19864     return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
19865   };
19866
19867   for (int Shift = 1; Shift != Size; ++Shift)
19868     for (bool Left : {true, false})
19869       if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
19870         Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
19871         return Shift;
19872       }
19873
19874   return -1;
19875 }
19876
19877
19878 // Lower vXi1 vector shuffles.
19879 // There is no a dedicated instruction on AVX-512 that shuffles the masks.
19880 // The only way to shuffle bits is to sign-extend the mask vector to SIMD
19881 // vector, shuffle and then truncate it back.
19882 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19883                                 MVT VT, SDValue V1, SDValue V2,
19884                                 const APInt &Zeroable,
19885                                 const X86Subtarget &Subtarget,
19886                                 SelectionDAG &DAG) {
19887   assert(Subtarget.hasAVX512() &&
19888          "Cannot lower 512-bit vectors w/o basic ISA!");
19889
19890   int NumElts = Mask.size();
19891
19892   // Try to recognize shuffles that are just padding a subvector with zeros.
19893   int SubvecElts = 0;
19894   int Src = -1;
19895   for (int i = 0; i != NumElts; ++i) {
19896     if (Mask[i] >= 0) {
19897       // Grab the source from the first valid mask. All subsequent elements need
19898       // to use this same source.
19899       if (Src < 0)
19900         Src = Mask[i] / NumElts;
19901       if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
19902         break;
19903     }
19904
19905     ++SubvecElts;
19906   }
19907   assert(SubvecElts != NumElts && "Identity shuffle?");
19908
19909   // Clip to a power 2.
19910   SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
19911
19912   // Make sure the number of zeroable bits in the top at least covers the bits
19913   // not covered by the subvector.
19914   if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
19915     assert(Src >= 0 && "Expected a source!");
19916     MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
19917     SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
19918                                   Src == 0 ? V1 : V2,
19919                                   DAG.getIntPtrConstant(0, DL));
19920     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19921                        DAG.getConstant(0, DL, VT),
19922                        Extract, DAG.getIntPtrConstant(0, DL));
19923   }
19924
19925   // Try a simple shift right with undef elements. Later we'll try with zeros.
19926   if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
19927                                                 DAG))
19928     return Shift;
19929
19930   // Try to match KSHIFTs.
19931   unsigned Offset = 0;
19932   for (SDValue V : { V1, V2 }) {
19933     unsigned Opcode;
19934     int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
19935     if (ShiftAmt >= 0) {
19936       MVT WideVT = VT;
19937       if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19938         WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19939       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19940                                 DAG.getUNDEF(WideVT), V,
19941                                 DAG.getIntPtrConstant(0, DL));
19942       // Widened right shifts need two shifts to ensure we shift in zeroes.
19943       if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
19944         int WideElts = WideVT.getVectorNumElements();
19945         // Shift left to put the original vector in the MSBs of the new size.
19946         Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
19947                           DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
19948         // Increase the shift amount to account for the left shift.
19949         ShiftAmt += WideElts - NumElts;
19950       }
19951
19952       Res = DAG.getNode(Opcode, DL, WideVT, Res,
19953                         DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19954       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19955                          DAG.getIntPtrConstant(0, DL));
19956     }
19957     Offset += NumElts; // Increment for next iteration.
19958   }
19959
19960   // If we're broadcasting a SETCC result, try to broadcast the ops instead.
19961   // TODO: What other unary shuffles would benefit from this?
19962   if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
19963       V1->hasOneUse()) {
19964     SDValue Op0 = V1.getOperand(0);
19965     SDValue Op1 = V1.getOperand(1);
19966     ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
19967     EVT OpVT = Op0.getValueType();
19968     return DAG.getSetCC(
19969         DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
19970         DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
19971   }
19972
19973   MVT ExtVT;
19974   switch (VT.SimpleTy) {
19975   default:
19976     llvm_unreachable("Expected a vector of i1 elements");
19977   case MVT::v2i1:
19978     ExtVT = MVT::v2i64;
19979     break;
19980   case MVT::v4i1:
19981     ExtVT = MVT::v4i32;
19982     break;
19983   case MVT::v8i1:
19984     // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
19985     // shuffle.
19986     ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
19987     break;
19988   case MVT::v16i1:
19989     // Take 512-bit type, unless we are avoiding 512-bit types and have the
19990     // 256-bit operation available.
19991     ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
19992     break;
19993   case MVT::v32i1:
19994     // Take 512-bit type, unless we are avoiding 512-bit types and have the
19995     // 256-bit operation available.
19996     assert(Subtarget.hasBWI() && "Expected AVX512BW support");
19997     ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
19998     break;
19999   case MVT::v64i1:
20000     // Fall back to scalarization. FIXME: We can do better if the shuffle
20001     // can be partitioned cleanly.
20002     if (!Subtarget.useBWIRegs())
20003       return SDValue();
20004     ExtVT = MVT::v64i8;
20005     break;
20006   }
20007
20008   V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
20009   V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
20010
20011   SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
20012   // i1 was sign extended we can use X86ISD::CVT2MASK.
20013   int NumElems = VT.getVectorNumElements();
20014   if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
20015       (Subtarget.hasDQI() && (NumElems < 32)))
20016     return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
20017                        Shuffle, ISD::SETGT);
20018
20019   return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
20020 }
20021
20022 /// Helper function that returns true if the shuffle mask should be
20023 /// commuted to improve canonicalization.
20024 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
20025   int NumElements = Mask.size();
20026
20027   int NumV1Elements = 0, NumV2Elements = 0;
20028   for (int M : Mask)
20029     if (M < 0)
20030       continue;
20031     else if (M < NumElements)
20032       ++NumV1Elements;
20033     else
20034       ++NumV2Elements;
20035
20036   // Commute the shuffle as needed such that more elements come from V1 than
20037   // V2. This allows us to match the shuffle pattern strictly on how many
20038   // elements come from V1 without handling the symmetric cases.
20039   if (NumV2Elements > NumV1Elements)
20040     return true;
20041
20042   assert(NumV1Elements > 0 && "No V1 indices");
20043
20044   if (NumV2Elements == 0)
20045     return false;
20046
20047   // When the number of V1 and V2 elements are the same, try to minimize the
20048   // number of uses of V2 in the low half of the vector. When that is tied,
20049   // ensure that the sum of indices for V1 is equal to or lower than the sum
20050   // indices for V2. When those are equal, try to ensure that the number of odd
20051   // indices for V1 is lower than the number of odd indices for V2.
20052   if (NumV1Elements == NumV2Elements) {
20053     int LowV1Elements = 0, LowV2Elements = 0;
20054     for (int M : Mask.slice(0, NumElements / 2))
20055       if (M >= NumElements)
20056         ++LowV2Elements;
20057       else if (M >= 0)
20058         ++LowV1Elements;
20059     if (LowV2Elements > LowV1Elements)
20060       return true;
20061     if (LowV2Elements == LowV1Elements) {
20062       int SumV1Indices = 0, SumV2Indices = 0;
20063       for (int i = 0, Size = Mask.size(); i < Size; ++i)
20064         if (Mask[i] >= NumElements)
20065           SumV2Indices += i;
20066         else if (Mask[i] >= 0)
20067           SumV1Indices += i;
20068       if (SumV2Indices < SumV1Indices)
20069         return true;
20070       if (SumV2Indices == SumV1Indices) {
20071         int NumV1OddIndices = 0, NumV2OddIndices = 0;
20072         for (int i = 0, Size = Mask.size(); i < Size; ++i)
20073           if (Mask[i] >= NumElements)
20074             NumV2OddIndices += i % 2;
20075           else if (Mask[i] >= 0)
20076             NumV1OddIndices += i % 2;
20077         if (NumV2OddIndices < NumV1OddIndices)
20078           return true;
20079       }
20080     }
20081   }
20082
20083   return false;
20084 }
20085
20086 static bool canCombineAsMaskOperation(SDValue V,
20087                                       const X86Subtarget &Subtarget) {
20088   if (!Subtarget.hasAVX512())
20089     return false;
20090
20091   if (!V.getValueType().isSimple())
20092     return false;
20093
20094   MVT VT = V.getSimpleValueType().getScalarType();
20095   if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
20096     return false;
20097
20098   // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
20099   // are preferable to blendw/blendvb/masked-mov.
20100   if ((VT == MVT::i16 || VT == MVT::i8) &&
20101       V.getSimpleValueType().getSizeInBits() < 512)
20102     return false;
20103
20104   auto HasMaskOperation = [&](SDValue V) {
20105     // TODO: Currently we only check limited opcode. We probably extend
20106     // it to all binary operation by checking TLI.isBinOp().
20107     switch (V->getOpcode()) {
20108     default:
20109       return false;
20110     case ISD::ADD:
20111     case ISD::SUB:
20112     case ISD::AND:
20113     case ISD::XOR:
20114     case ISD::OR:
20115     case ISD::SMAX:
20116     case ISD::SMIN:
20117     case ISD::UMAX:
20118     case ISD::UMIN:
20119     case ISD::ABS:
20120     case ISD::SHL:
20121     case ISD::SRL:
20122     case ISD::SRA:
20123     case ISD::MUL:
20124       break;
20125     }
20126     if (!V->hasOneUse())
20127       return false;
20128
20129     return true;
20130   };
20131
20132   if (HasMaskOperation(V))
20133     return true;
20134
20135   return false;
20136 }
20137
20138 // Forward declaration.
20139 static SDValue canonicalizeShuffleMaskWithHorizOp(
20140     MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
20141     unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
20142     const X86Subtarget &Subtarget);
20143
20144     /// Top-level lowering for x86 vector shuffles.
20145 ///
20146 /// This handles decomposition, canonicalization, and lowering of all x86
20147 /// vector shuffles. Most of the specific lowering strategies are encapsulated
20148 /// above in helper routines. The canonicalization attempts to widen shuffles
20149 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
20150 /// s.t. only one of the two inputs needs to be tested, etc.
20151 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
20152                                    SelectionDAG &DAG) {
20153   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
20154   ArrayRef<int> OrigMask = SVOp->getMask();
20155   SDValue V1 = Op.getOperand(0);
20156   SDValue V2 = Op.getOperand(1);
20157   MVT VT = Op.getSimpleValueType();
20158   int NumElements = VT.getVectorNumElements();
20159   SDLoc DL(Op);
20160   bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
20161
20162   assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
20163          "Can't lower MMX shuffles");
20164
20165   bool V1IsUndef = V1.isUndef();
20166   bool V2IsUndef = V2.isUndef();
20167   if (V1IsUndef && V2IsUndef)
20168     return DAG.getUNDEF(VT);
20169
20170   // When we create a shuffle node we put the UNDEF node to second operand,
20171   // but in some cases the first operand may be transformed to UNDEF.
20172   // In this case we should just commute the node.
20173   if (V1IsUndef)
20174     return DAG.getCommutedVectorShuffle(*SVOp);
20175
20176   // Check for non-undef masks pointing at an undef vector and make the masks
20177   // undef as well. This makes it easier to match the shuffle based solely on
20178   // the mask.
20179   if (V2IsUndef &&
20180       any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
20181     SmallVector<int, 8> NewMask(OrigMask);
20182     for (int &M : NewMask)
20183       if (M >= NumElements)
20184         M = -1;
20185     return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
20186   }
20187
20188   // Check for illegal shuffle mask element index values.
20189   int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
20190   (void)MaskUpperLimit;
20191   assert(llvm::all_of(OrigMask,
20192                       [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
20193          "Out of bounds shuffle index");
20194
20195   // We actually see shuffles that are entirely re-arrangements of a set of
20196   // zero inputs. This mostly happens while decomposing complex shuffles into
20197   // simple ones. Directly lower these as a buildvector of zeros.
20198   APInt KnownUndef, KnownZero;
20199   computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
20200
20201   APInt Zeroable = KnownUndef | KnownZero;
20202   if (Zeroable.isAllOnes())
20203     return getZeroVector(VT, Subtarget, DAG, DL);
20204
20205   bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
20206
20207   // Try to collapse shuffles into using a vector type with fewer elements but
20208   // wider element types. We cap this to not form integers or floating point
20209   // elements wider than 64 bits. It does not seem beneficial to form i128
20210   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
20211   SmallVector<int, 16> WidenedMask;
20212   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
20213       !canCombineAsMaskOperation(V1, Subtarget) &&
20214       !canCombineAsMaskOperation(V2, Subtarget) &&
20215       canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
20216     // Shuffle mask widening should not interfere with a broadcast opportunity
20217     // by obfuscating the operands with bitcasts.
20218     // TODO: Avoid lowering directly from this top-level function: make this
20219     // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
20220     if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
20221                                                     Subtarget, DAG))
20222       return Broadcast;
20223
20224     MVT NewEltVT = VT.isFloatingPoint()
20225                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
20226                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
20227     int NewNumElts = NumElements / 2;
20228     MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
20229     // Make sure that the new vector type is legal. For example, v2f64 isn't
20230     // legal on SSE1.
20231     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
20232       if (V2IsZero) {
20233         // Modify the new Mask to take all zeros from the all-zero vector.
20234         // Choose indices that are blend-friendly.
20235         bool UsedZeroVector = false;
20236         assert(is_contained(WidenedMask, SM_SentinelZero) &&
20237                "V2's non-undef elements are used?!");
20238         for (int i = 0; i != NewNumElts; ++i)
20239           if (WidenedMask[i] == SM_SentinelZero) {
20240             WidenedMask[i] = i + NewNumElts;
20241             UsedZeroVector = true;
20242           }
20243         // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
20244         // some elements to be undef.
20245         if (UsedZeroVector)
20246           V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
20247       }
20248       V1 = DAG.getBitcast(NewVT, V1);
20249       V2 = DAG.getBitcast(NewVT, V2);
20250       return DAG.getBitcast(
20251           VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
20252     }
20253   }
20254
20255   SmallVector<SDValue> Ops = {V1, V2};
20256   SmallVector<int> Mask(OrigMask);
20257
20258   // Canonicalize the shuffle with any horizontal ops inputs.
20259   // NOTE: This may update Ops and Mask.
20260   if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
20261           Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
20262     return DAG.getBitcast(VT, HOp);
20263
20264   V1 = DAG.getBitcast(VT, Ops[0]);
20265   V2 = DAG.getBitcast(VT, Ops[1]);
20266   assert(NumElements == (int)Mask.size() &&
20267          "canonicalizeShuffleMaskWithHorizOp "
20268          "shouldn't alter the shuffle mask size");
20269
20270   // Commute the shuffle if it will improve canonicalization.
20271   if (canonicalizeShuffleMaskWithCommute(Mask)) {
20272     ShuffleVectorSDNode::commuteMask(Mask);
20273     std::swap(V1, V2);
20274   }
20275
20276   // For each vector width, delegate to a specialized lowering routine.
20277   if (VT.is128BitVector())
20278     return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20279
20280   if (VT.is256BitVector())
20281     return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20282
20283   if (VT.is512BitVector())
20284     return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20285
20286   if (Is1BitVector)
20287     return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20288
20289   llvm_unreachable("Unimplemented!");
20290 }
20291
20292 /// Try to lower a VSELECT instruction to a vector shuffle.
20293 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
20294                                            const X86Subtarget &Subtarget,
20295                                            SelectionDAG &DAG) {
20296   SDValue Cond = Op.getOperand(0);
20297   SDValue LHS = Op.getOperand(1);
20298   SDValue RHS = Op.getOperand(2);
20299   MVT VT = Op.getSimpleValueType();
20300
20301   // Only non-legal VSELECTs reach this lowering, convert those into generic
20302   // shuffles and re-use the shuffle lowering path for blends.
20303   if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
20304     SmallVector<int, 32> Mask;
20305     if (createShuffleMaskFromVSELECT(Mask, Cond))
20306       return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
20307   }
20308
20309   return SDValue();
20310 }
20311
20312 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
20313   SDValue Cond = Op.getOperand(0);
20314   SDValue LHS = Op.getOperand(1);
20315   SDValue RHS = Op.getOperand(2);
20316
20317   SDLoc dl(Op);
20318   MVT VT = Op.getSimpleValueType();
20319   if (isSoftFP16(VT)) {
20320     MVT NVT = VT.changeVectorElementTypeToInteger();
20321     return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
20322                                           DAG.getBitcast(NVT, LHS),
20323                                           DAG.getBitcast(NVT, RHS)));
20324   }
20325
20326   // A vselect where all conditions and data are constants can be optimized into
20327   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
20328   if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
20329       ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
20330       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
20331     return SDValue();
20332
20333   // Try to lower this to a blend-style vector shuffle. This can handle all
20334   // constant condition cases.
20335   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
20336     return BlendOp;
20337
20338   // If this VSELECT has a vector if i1 as a mask, it will be directly matched
20339   // with patterns on the mask registers on AVX-512.
20340   MVT CondVT = Cond.getSimpleValueType();
20341   unsigned CondEltSize = Cond.getScalarValueSizeInBits();
20342   if (CondEltSize == 1)
20343     return Op;
20344
20345   // Variable blends are only legal from SSE4.1 onward.
20346   if (!Subtarget.hasSSE41())
20347     return SDValue();
20348
20349   unsigned EltSize = VT.getScalarSizeInBits();
20350   unsigned NumElts = VT.getVectorNumElements();
20351
20352   // Expand v32i16/v64i8 without BWI.
20353   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
20354     return SDValue();
20355
20356   // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
20357   // into an i1 condition so that we can use the mask-based 512-bit blend
20358   // instructions.
20359   if (VT.getSizeInBits() == 512) {
20360     // Build a mask by testing the condition against zero.
20361     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
20362     SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
20363                                 DAG.getConstant(0, dl, CondVT),
20364                                 ISD::SETNE);
20365     // Now return a new VSELECT using the mask.
20366     return DAG.getSelect(dl, VT, Mask, LHS, RHS);
20367   }
20368
20369   // SEXT/TRUNC cases where the mask doesn't match the destination size.
20370   if (CondEltSize != EltSize) {
20371     // If we don't have a sign splat, rely on the expansion.
20372     if (CondEltSize != DAG.ComputeNumSignBits(Cond))
20373       return SDValue();
20374
20375     MVT NewCondSVT = MVT::getIntegerVT(EltSize);
20376     MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
20377     Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
20378     return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
20379   }
20380
20381   // Only some types will be legal on some subtargets. If we can emit a legal
20382   // VSELECT-matching blend, return Op, and but if we need to expand, return
20383   // a null value.
20384   switch (VT.SimpleTy) {
20385   default:
20386     // Most of the vector types have blends past SSE4.1.
20387     return Op;
20388
20389   case MVT::v32i8:
20390     // The byte blends for AVX vectors were introduced only in AVX2.
20391     if (Subtarget.hasAVX2())
20392       return Op;
20393
20394     return SDValue();
20395
20396   case MVT::v8i16:
20397   case MVT::v16i16: {
20398     // Bitcast everything to the vXi8 type and use a vXi8 vselect.
20399     MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
20400     Cond = DAG.getBitcast(CastVT, Cond);
20401     LHS = DAG.getBitcast(CastVT, LHS);
20402     RHS = DAG.getBitcast(CastVT, RHS);
20403     SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
20404     return DAG.getBitcast(VT, Select);
20405   }
20406   }
20407 }
20408
20409 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
20410   MVT VT = Op.getSimpleValueType();
20411   SDValue Vec = Op.getOperand(0);
20412   SDValue Idx = Op.getOperand(1);
20413   assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
20414   SDLoc dl(Op);
20415
20416   if (!Vec.getSimpleValueType().is128BitVector())
20417     return SDValue();
20418
20419   if (VT.getSizeInBits() == 8) {
20420     // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
20421     // we're going to zero extend the register or fold the store.
20422     if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
20423         !X86::mayFoldIntoStore(Op))
20424       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
20425                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20426                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
20427
20428     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
20429     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
20430                                   DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20431     return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20432   }
20433
20434   if (VT == MVT::f32) {
20435     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
20436     // the result back to FR32 register. It's only worth matching if the
20437     // result has a single use which is a store or a bitcast to i32.  And in
20438     // the case of a store, it's not worth it if the index is a constant 0,
20439     // because a MOVSSmr can be used instead, which is smaller and faster.
20440     if (!Op.hasOneUse())
20441       return SDValue();
20442     SDNode *User = *Op.getNode()->use_begin();
20443     if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
20444         (User->getOpcode() != ISD::BITCAST ||
20445          User->getValueType(0) != MVT::i32))
20446       return SDValue();
20447     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20448                                   DAG.getBitcast(MVT::v4i32, Vec), Idx);
20449     return DAG.getBitcast(MVT::f32, Extract);
20450   }
20451
20452   if (VT == MVT::i32 || VT == MVT::i64)
20453       return Op;
20454
20455   return SDValue();
20456 }
20457
20458 /// Extract one bit from mask vector, like v16i1 or v8i1.
20459 /// AVX-512 feature.
20460 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
20461                                         const X86Subtarget &Subtarget) {
20462   SDValue Vec = Op.getOperand(0);
20463   SDLoc dl(Vec);
20464   MVT VecVT = Vec.getSimpleValueType();
20465   SDValue Idx = Op.getOperand(1);
20466   auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20467   MVT EltVT = Op.getSimpleValueType();
20468
20469   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
20470          "Unexpected vector type in ExtractBitFromMaskVector");
20471
20472   // variable index can't be handled in mask registers,
20473   // extend vector to VR512/128
20474   if (!IdxC) {
20475     unsigned NumElts = VecVT.getVectorNumElements();
20476     // Extending v8i1/v16i1 to 512-bit get better performance on KNL
20477     // than extending to 128/256bit.
20478     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20479     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20480     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
20481     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
20482     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
20483   }
20484
20485   unsigned IdxVal = IdxC->getZExtValue();
20486   if (IdxVal == 0) // the operation is legal
20487     return Op;
20488
20489   // Extend to natively supported kshift.
20490   unsigned NumElems = VecVT.getVectorNumElements();
20491   MVT WideVecVT = VecVT;
20492   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20493     WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20494     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20495                       DAG.getUNDEF(WideVecVT), Vec,
20496                       DAG.getIntPtrConstant(0, dl));
20497   }
20498
20499   // Use kshiftr instruction to move to the lower element.
20500   Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20501                     DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20502
20503   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20504                      DAG.getIntPtrConstant(0, dl));
20505 }
20506
20507 SDValue
20508 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
20509                                            SelectionDAG &DAG) const {
20510   SDLoc dl(Op);
20511   SDValue Vec = Op.getOperand(0);
20512   MVT VecVT = Vec.getSimpleValueType();
20513   SDValue Idx = Op.getOperand(1);
20514   auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20515
20516   if (VecVT.getVectorElementType() == MVT::i1)
20517     return ExtractBitFromMaskVector(Op, DAG, Subtarget);
20518
20519   if (!IdxC) {
20520     // Its more profitable to go through memory (1 cycles throughput)
20521     // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
20522     // IACA tool was used to get performance estimation
20523     // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
20524     //
20525     // example : extractelement <16 x i8> %a, i32 %i
20526     //
20527     // Block Throughput: 3.00 Cycles
20528     // Throughput Bottleneck: Port5
20529     //
20530     // | Num Of |   Ports pressure in cycles  |    |
20531     // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
20532     // ---------------------------------------------
20533     // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
20534     // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
20535     // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
20536     // Total Num Of Uops: 4
20537     //
20538     //
20539     // Block Throughput: 1.00 Cycles
20540     // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
20541     //
20542     // |    |  Ports pressure in cycles   |  |
20543     // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
20544     // ---------------------------------------------------------
20545     // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
20546     // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
20547     // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
20548     // Total Num Of Uops: 4
20549
20550     return SDValue();
20551   }
20552
20553   unsigned IdxVal = IdxC->getZExtValue();
20554
20555   // If this is a 256-bit vector result, first extract the 128-bit vector and
20556   // then extract the element from the 128-bit vector.
20557   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
20558     // Get the 128-bit vector.
20559     Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
20560     MVT EltVT = VecVT.getVectorElementType();
20561
20562     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
20563     assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
20564
20565     // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
20566     // this can be done with a mask.
20567     IdxVal &= ElemsPerChunk - 1;
20568     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20569                        DAG.getIntPtrConstant(IdxVal, dl));
20570   }
20571
20572   assert(VecVT.is128BitVector() && "Unexpected vector length");
20573
20574   MVT VT = Op.getSimpleValueType();
20575
20576   if (VT == MVT::i16) {
20577     // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
20578     // we're going to zero extend the register or fold the store (SSE41 only).
20579     if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
20580         !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
20581       if (Subtarget.hasFP16())
20582         return Op;
20583
20584       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
20585                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20586                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
20587     }
20588
20589     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
20590                                   DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20591     return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20592   }
20593
20594   if (Subtarget.hasSSE41())
20595     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
20596       return Res;
20597
20598   // TODO: We only extract a single element from v16i8, we can probably afford
20599   // to be more aggressive here before using the default approach of spilling to
20600   // stack.
20601   if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
20602     // Extract either the lowest i32 or any i16, and extract the sub-byte.
20603     int DWordIdx = IdxVal / 4;
20604     if (DWordIdx == 0) {
20605       SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20606                                 DAG.getBitcast(MVT::v4i32, Vec),
20607                                 DAG.getIntPtrConstant(DWordIdx, dl));
20608       int ShiftVal = (IdxVal % 4) * 8;
20609       if (ShiftVal != 0)
20610         Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
20611                           DAG.getConstant(ShiftVal, dl, MVT::i8));
20612       return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20613     }
20614
20615     int WordIdx = IdxVal / 2;
20616     SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
20617                               DAG.getBitcast(MVT::v8i16, Vec),
20618                               DAG.getIntPtrConstant(WordIdx, dl));
20619     int ShiftVal = (IdxVal % 2) * 8;
20620     if (ShiftVal != 0)
20621       Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
20622                         DAG.getConstant(ShiftVal, dl, MVT::i8));
20623     return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20624   }
20625
20626   if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
20627     if (IdxVal == 0)
20628       return Op;
20629
20630     // Shuffle the element to the lowest element, then movss or movsh.
20631     SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
20632     Mask[0] = static_cast<int>(IdxVal);
20633     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20634     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20635                        DAG.getIntPtrConstant(0, dl));
20636   }
20637
20638   if (VT.getSizeInBits() == 64) {
20639     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
20640     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
20641     //        to match extract_elt for f64.
20642     if (IdxVal == 0)
20643       return Op;
20644
20645     // UNPCKHPD the element to the lowest double word, then movsd.
20646     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
20647     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
20648     int Mask[2] = { 1, -1 };
20649     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20650     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20651                        DAG.getIntPtrConstant(0, dl));
20652   }
20653
20654   return SDValue();
20655 }
20656
20657 /// Insert one bit to mask vector, like v16i1 or v8i1.
20658 /// AVX-512 feature.
20659 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
20660                                      const X86Subtarget &Subtarget) {
20661   SDLoc dl(Op);
20662   SDValue Vec = Op.getOperand(0);
20663   SDValue Elt = Op.getOperand(1);
20664   SDValue Idx = Op.getOperand(2);
20665   MVT VecVT = Vec.getSimpleValueType();
20666
20667   if (!isa<ConstantSDNode>(Idx)) {
20668     // Non constant index. Extend source and destination,
20669     // insert element and then truncate the result.
20670     unsigned NumElts = VecVT.getVectorNumElements();
20671     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20672     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20673     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
20674       DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
20675       DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
20676     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
20677   }
20678
20679   // Copy into a k-register, extract to v1i1 and insert_subvector.
20680   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
20681   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
20682 }
20683
20684 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
20685                                                   SelectionDAG &DAG) const {
20686   MVT VT = Op.getSimpleValueType();
20687   MVT EltVT = VT.getVectorElementType();
20688   unsigned NumElts = VT.getVectorNumElements();
20689   unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
20690
20691   if (EltVT == MVT::i1)
20692     return InsertBitToMaskVector(Op, DAG, Subtarget);
20693
20694   SDLoc dl(Op);
20695   SDValue N0 = Op.getOperand(0);
20696   SDValue N1 = Op.getOperand(1);
20697   SDValue N2 = Op.getOperand(2);
20698   auto *N2C = dyn_cast<ConstantSDNode>(N2);
20699
20700   if (!N2C) {
20701     // Variable insertion indices, usually we're better off spilling to stack,
20702     // but AVX512 can use a variable compare+select by comparing against all
20703     // possible vector indices, and FP insertion has less gpr->simd traffic.
20704     if (!(Subtarget.hasBWI() ||
20705           (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
20706           (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
20707       return SDValue();
20708
20709     MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
20710     MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
20711     if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
20712       return SDValue();
20713
20714     SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
20715     SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
20716     SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
20717
20718     SmallVector<SDValue, 16> RawIndices;
20719     for (unsigned I = 0; I != NumElts; ++I)
20720       RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
20721     SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
20722
20723     // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
20724     return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
20725                            ISD::CondCode::SETEQ);
20726   }
20727
20728   if (N2C->getAPIntValue().uge(NumElts))
20729     return SDValue();
20730   uint64_t IdxVal = N2C->getZExtValue();
20731
20732   bool IsZeroElt = X86::isZeroNode(N1);
20733   bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
20734
20735   if (IsZeroElt || IsAllOnesElt) {
20736     // Lower insertion of v16i8/v32i8/v64i16 -1 elts as an 'OR' blend.
20737     // We don't deal with i8 0 since it appears to be handled elsewhere.
20738     if (IsAllOnesElt &&
20739         ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
20740          ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
20741       SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
20742       SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
20743       SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
20744       CstVectorElts[IdxVal] = OnesCst;
20745       SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
20746       return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
20747     }
20748     // See if we can do this more efficiently with a blend shuffle with a
20749     // rematerializable vector.
20750     if (Subtarget.hasSSE41() &&
20751         (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
20752       SmallVector<int, 8> BlendMask;
20753       for (unsigned i = 0; i != NumElts; ++i)
20754         BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20755       SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
20756                                     : getOnesVector(VT, DAG, dl);
20757       return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
20758     }
20759   }
20760
20761   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
20762   // into that, and then insert the subvector back into the result.
20763   if (VT.is256BitVector() || VT.is512BitVector()) {
20764     // With a 256-bit vector, we can insert into the zero element efficiently
20765     // using a blend if we have AVX or AVX2 and the right data type.
20766     if (VT.is256BitVector() && IdxVal == 0) {
20767       // TODO: It is worthwhile to cast integer to floating point and back
20768       // and incur a domain crossing penalty if that's what we'll end up
20769       // doing anyway after extracting to a 128-bit vector.
20770       if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
20771           (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
20772         SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20773         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
20774                            DAG.getTargetConstant(1, dl, MVT::i8));
20775       }
20776     }
20777
20778     unsigned NumEltsIn128 = 128 / EltSizeInBits;
20779     assert(isPowerOf2_32(NumEltsIn128) &&
20780            "Vectors will always have power-of-two number of elements.");
20781
20782     // If we are not inserting into the low 128-bit vector chunk,
20783     // then prefer the broadcast+blend sequence.
20784     // FIXME: relax the profitability check iff all N1 uses are insertions.
20785     if (IdxVal >= NumEltsIn128 &&
20786         ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
20787          (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
20788           X86::mayFoldLoad(N1, Subtarget)))) {
20789       SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
20790       SmallVector<int, 8> BlendMask;
20791       for (unsigned i = 0; i != NumElts; ++i)
20792         BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20793       return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
20794     }
20795
20796     // Get the desired 128-bit vector chunk.
20797     SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
20798
20799     // Insert the element into the desired chunk.
20800     // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
20801     unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
20802
20803     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
20804                     DAG.getIntPtrConstant(IdxIn128, dl));
20805
20806     // Insert the changed part back into the bigger vector
20807     return insert128BitVector(N0, V, IdxVal, DAG, dl);
20808   }
20809   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
20810
20811   // This will be just movw/movd/movq/movsh/movss/movsd.
20812   if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
20813     if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
20814         EltVT == MVT::f16 || EltVT == MVT::i64) {
20815       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20816       return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20817     }
20818
20819     // We can't directly insert an i8 or i16 into a vector, so zero extend
20820     // it to i32 first.
20821     if (EltVT == MVT::i16 || EltVT == MVT::i8) {
20822       N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
20823       MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20824       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
20825       N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20826       return DAG.getBitcast(VT, N1);
20827     }
20828   }
20829
20830   // Transform it so it match pinsr{b,w} which expects a GR32 as its second
20831   // argument. SSE41 required for pinsrb.
20832   if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
20833     unsigned Opc;
20834     if (VT == MVT::v8i16) {
20835       assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
20836       Opc = X86ISD::PINSRW;
20837     } else {
20838       assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
20839       assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
20840       Opc = X86ISD::PINSRB;
20841     }
20842
20843     assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
20844     N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
20845     N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
20846     return DAG.getNode(Opc, dl, VT, N0, N1, N2);
20847   }
20848
20849   if (Subtarget.hasSSE41()) {
20850     if (EltVT == MVT::f32) {
20851       // Bits [7:6] of the constant are the source select. This will always be
20852       //   zero here. The DAG Combiner may combine an extract_elt index into
20853       //   these bits. For example (insert (extract, 3), 2) could be matched by
20854       //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
20855       // Bits [5:4] of the constant are the destination select. This is the
20856       //   value of the incoming immediate.
20857       // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
20858       //   combine either bitwise AND or insert of float 0.0 to set these bits.
20859
20860       bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
20861       if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
20862         // If this is an insertion of 32-bits into the low 32-bits of
20863         // a vector, we prefer to generate a blend with immediate rather
20864         // than an insertps. Blends are simpler operations in hardware and so
20865         // will always have equal or better performance than insertps.
20866         // But if optimizing for size and there's a load folding opportunity,
20867         // generate insertps because blendps does not have a 32-bit memory
20868         // operand form.
20869         N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20870         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
20871                            DAG.getTargetConstant(1, dl, MVT::i8));
20872       }
20873       // Create this as a scalar to vector..
20874       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20875       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
20876                          DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
20877     }
20878
20879     // PINSR* works with constant index.
20880     if (EltVT == MVT::i32 || EltVT == MVT::i64)
20881       return Op;
20882   }
20883
20884   return SDValue();
20885 }
20886
20887 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
20888                                      SelectionDAG &DAG) {
20889   SDLoc dl(Op);
20890   MVT OpVT = Op.getSimpleValueType();
20891
20892   // It's always cheaper to replace a xor+movd with xorps and simplifies further
20893   // combines.
20894   if (X86::isZeroNode(Op.getOperand(0)))
20895     return getZeroVector(OpVT, Subtarget, DAG, dl);
20896
20897   // If this is a 256-bit vector result, first insert into a 128-bit
20898   // vector and then insert into the 256-bit vector.
20899   if (!OpVT.is128BitVector()) {
20900     // Insert into a 128-bit vector.
20901     unsigned SizeFactor = OpVT.getSizeInBits() / 128;
20902     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
20903                                  OpVT.getVectorNumElements() / SizeFactor);
20904
20905     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
20906
20907     // Insert the 128-bit vector.
20908     return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
20909   }
20910   assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
20911          "Expected an SSE type!");
20912
20913   // Pass through a v4i32 or V8i16 SCALAR_TO_VECTOR as that's what we use in
20914   // tblgen.
20915   if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
20916     return Op;
20917
20918   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
20919   return DAG.getBitcast(
20920       OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
20921 }
20922
20923 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
20924 // simple superregister reference or explicit instructions to insert
20925 // the upper bits of a vector.
20926 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20927                                      SelectionDAG &DAG) {
20928   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
20929
20930   return insert1BitVector(Op, DAG, Subtarget);
20931 }
20932
20933 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20934                                       SelectionDAG &DAG) {
20935   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
20936          "Only vXi1 extract_subvectors need custom lowering");
20937
20938   SDLoc dl(Op);
20939   SDValue Vec = Op.getOperand(0);
20940   uint64_t IdxVal = Op.getConstantOperandVal(1);
20941
20942   if (IdxVal == 0) // the operation is legal
20943     return Op;
20944
20945   MVT VecVT = Vec.getSimpleValueType();
20946   unsigned NumElems = VecVT.getVectorNumElements();
20947
20948   // Extend to natively supported kshift.
20949   MVT WideVecVT = VecVT;
20950   if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20951     WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20952     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20953                       DAG.getUNDEF(WideVecVT), Vec,
20954                       DAG.getIntPtrConstant(0, dl));
20955   }
20956
20957   // Shift to the LSB.
20958   Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20959                     DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20960
20961   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
20962                      DAG.getIntPtrConstant(0, dl));
20963 }
20964
20965 // Returns the appropriate wrapper opcode for a global reference.
20966 unsigned X86TargetLowering::getGlobalWrapperKind(
20967     const GlobalValue *GV, const unsigned char OpFlags) const {
20968   // References to absolute symbols are never PC-relative.
20969   if (GV && GV->isAbsoluteSymbolRef())
20970     return X86ISD::Wrapper;
20971
20972   CodeModel::Model M = getTargetMachine().getCodeModel();
20973   if (Subtarget.isPICStyleRIPRel() &&
20974       (M == CodeModel::Small || M == CodeModel::Kernel))
20975     return X86ISD::WrapperRIP;
20976
20977   // In the medium model, functions can always be referenced RIP-relatively,
20978   // since they must be within 2GiB. This is also possible in non-PIC mode, and
20979   // shorter than the 64-bit absolute immediate that would otherwise be emitted.
20980   if (M == CodeModel::Medium && isa_and_nonnull<Function>(GV))
20981     return X86ISD::WrapperRIP;
20982
20983   // GOTPCREL references must always use RIP.
20984   if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
20985     return X86ISD::WrapperRIP;
20986
20987   return X86ISD::Wrapper;
20988 }
20989
20990 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
20991 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
20992 // one of the above mentioned nodes. It has to be wrapped because otherwise
20993 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
20994 // be used to form addressing mode. These wrapped nodes will be selected
20995 // into MOV32ri.
20996 SDValue
20997 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
20998   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
20999
21000   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
21001   // global base reg.
21002   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
21003
21004   auto PtrVT = getPointerTy(DAG.getDataLayout());
21005   SDValue Result = DAG.getTargetConstantPool(
21006       CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
21007   SDLoc DL(CP);
21008   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
21009   // With PIC, the address is actually $g + Offset.
21010   if (OpFlag) {
21011     Result =
21012         DAG.getNode(ISD::ADD, DL, PtrVT,
21013                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
21014   }
21015
21016   return Result;
21017 }
21018
21019 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
21020   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
21021
21022   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
21023   // global base reg.
21024   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
21025
21026   auto PtrVT = getPointerTy(DAG.getDataLayout());
21027   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
21028   SDLoc DL(JT);
21029   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
21030
21031   // With PIC, the address is actually $g + Offset.
21032   if (OpFlag)
21033     Result =
21034         DAG.getNode(ISD::ADD, DL, PtrVT,
21035                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
21036
21037   return Result;
21038 }
21039
21040 SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
21041                                                SelectionDAG &DAG) const {
21042   return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
21043 }
21044
21045 SDValue
21046 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
21047   // Create the TargetBlockAddressAddress node.
21048   unsigned char OpFlags =
21049     Subtarget.classifyBlockAddressReference();
21050   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
21051   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
21052   SDLoc dl(Op);
21053   auto PtrVT = getPointerTy(DAG.getDataLayout());
21054   SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
21055   Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
21056
21057   // With PIC, the address is actually $g + Offset.
21058   if (isGlobalRelativeToPICBase(OpFlags)) {
21059     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
21060                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
21061   }
21062
21063   return Result;
21064 }
21065
21066 /// Creates target global address or external symbol nodes for calls or
21067 /// other uses.
21068 SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
21069                                                  bool ForCall) const {
21070   // Unpack the global address or external symbol.
21071   const SDLoc &dl = SDLoc(Op);
21072   const GlobalValue *GV = nullptr;
21073   int64_t Offset = 0;
21074   const char *ExternalSym = nullptr;
21075   if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
21076     GV = G->getGlobal();
21077     Offset = G->getOffset();
21078   } else {
21079     const auto *ES = cast<ExternalSymbolSDNode>(Op);
21080     ExternalSym = ES->getSymbol();
21081   }
21082
21083   // Calculate some flags for address lowering.
21084   const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
21085   unsigned char OpFlags;
21086   if (ForCall)
21087     OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
21088   else
21089     OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
21090   bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
21091   bool NeedsLoad = isGlobalStubReference(OpFlags);
21092
21093   CodeModel::Model M = DAG.getTarget().getCodeModel();
21094   auto PtrVT = getPointerTy(DAG.getDataLayout());
21095   SDValue Result;
21096
21097   if (GV) {
21098     // Create a target global address if this is a global. If possible, fold the
21099     // offset into the global address reference. Otherwise, ADD it on later.
21100     // Suppress the folding if Offset is negative: movl foo-1, %eax is not
21101     // allowed because if the address of foo is 0, the ELF R_X86_64_32
21102     // relocation will compute to a negative value, which is invalid.
21103     int64_t GlobalOffset = 0;
21104     if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
21105         X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
21106       std::swap(GlobalOffset, Offset);
21107     }
21108     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
21109   } else {
21110     // If this is not a global address, this must be an external symbol.
21111     Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
21112   }
21113
21114   // If this is a direct call, avoid the wrapper if we don't need to do any
21115   // loads or adds. This allows SDAG ISel to match direct calls.
21116   if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
21117     return Result;
21118
21119   Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
21120
21121   // With PIC, the address is actually $g + Offset.
21122   if (HasPICReg) {
21123     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
21124                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
21125   }
21126
21127   // For globals that require a load from a stub to get the address, emit the
21128   // load.
21129   if (NeedsLoad)
21130     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
21131                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
21132
21133   // If there was a non-zero offset that we didn't fold, create an explicit
21134   // addition for it.
21135   if (Offset != 0)
21136     Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
21137                          DAG.getConstant(Offset, dl, PtrVT));
21138
21139   return Result;
21140 }
21141
21142 SDValue
21143 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
21144   return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
21145 }
21146
21147 static SDValue
21148 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
21149            SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
21150            unsigned char OperandFlags, bool LocalDynamic = false) {
21151   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21152   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21153   SDLoc dl(GA);
21154   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21155                                            GA->getValueType(0),
21156                                            GA->getOffset(),
21157                                            OperandFlags);
21158
21159   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
21160                                            : X86ISD::TLSADDR;
21161
21162   if (InGlue) {
21163     SDValue Ops[] = { Chain,  TGA, *InGlue };
21164     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
21165   } else {
21166     SDValue Ops[]  = { Chain, TGA };
21167     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
21168   }
21169
21170   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
21171   MFI.setAdjustsStack(true);
21172   MFI.setHasCalls(true);
21173
21174   SDValue Glue = Chain.getValue(1);
21175   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
21176 }
21177
21178 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
21179 static SDValue
21180 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21181                                 const EVT PtrVT) {
21182   SDValue InGlue;
21183   SDLoc dl(GA);  // ? function entry point might be better
21184   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21185                                    DAG.getNode(X86ISD::GlobalBaseReg,
21186                                                SDLoc(), PtrVT), InGlue);
21187   InGlue = Chain.getValue(1);
21188
21189   return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
21190 }
21191
21192 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
21193 static SDValue
21194 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21195                                 const EVT PtrVT) {
21196   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21197                     X86::RAX, X86II::MO_TLSGD);
21198 }
21199
21200 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
21201 static SDValue
21202 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21203                                  const EVT PtrVT) {
21204   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21205                     X86::EAX, X86II::MO_TLSGD);
21206 }
21207
21208 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
21209                                            SelectionDAG &DAG, const EVT PtrVT,
21210                                            bool Is64Bit, bool Is64BitLP64) {
21211   SDLoc dl(GA);
21212
21213   // Get the start address of the TLS block for this module.
21214   X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
21215       .getInfo<X86MachineFunctionInfo>();
21216   MFI->incNumLocalDynamicTLSAccesses();
21217
21218   SDValue Base;
21219   if (Is64Bit) {
21220     unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
21221     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
21222                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
21223   } else {
21224     SDValue InGlue;
21225     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21226         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
21227     InGlue = Chain.getValue(1);
21228     Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
21229                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
21230   }
21231
21232   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
21233   // of Base.
21234
21235   // Build x@dtpoff.
21236   unsigned char OperandFlags = X86II::MO_DTPOFF;
21237   unsigned WrapperKind = X86ISD::Wrapper;
21238   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21239                                            GA->getValueType(0),
21240                                            GA->getOffset(), OperandFlags);
21241   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21242
21243   // Add x@dtpoff with the base.
21244   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
21245 }
21246
21247 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
21248 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21249                                    const EVT PtrVT, TLSModel::Model model,
21250                                    bool is64Bit, bool isPIC) {
21251   SDLoc dl(GA);
21252
21253   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
21254   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
21255                                                          is64Bit ? 257 : 256));
21256
21257   SDValue ThreadPointer =
21258       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
21259                   MachinePointerInfo(Ptr));
21260
21261   unsigned char OperandFlags = 0;
21262   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
21263   // initialexec.
21264   unsigned WrapperKind = X86ISD::Wrapper;
21265   if (model == TLSModel::LocalExec) {
21266     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
21267   } else if (model == TLSModel::InitialExec) {
21268     if (is64Bit) {
21269       OperandFlags = X86II::MO_GOTTPOFF;
21270       WrapperKind = X86ISD::WrapperRIP;
21271     } else {
21272       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
21273     }
21274   } else {
21275     llvm_unreachable("Unexpected model");
21276   }
21277
21278   // emit "addl x@ntpoff,%eax" (local exec)
21279   // or "addl x@indntpoff,%eax" (initial exec)
21280   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
21281   SDValue TGA =
21282       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
21283                                  GA->getOffset(), OperandFlags);
21284   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21285
21286   if (model == TLSModel::InitialExec) {
21287     if (isPIC && !is64Bit) {
21288       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
21289                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21290                            Offset);
21291     }
21292
21293     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
21294                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
21295   }
21296
21297   // The address of the thread local variable is the add of the thread
21298   // pointer with the offset of the variable.
21299   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
21300 }
21301
21302 SDValue
21303 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
21304
21305   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
21306
21307   if (DAG.getTarget().useEmulatedTLS())
21308     return LowerToTLSEmulatedModel(GA, DAG);
21309
21310   const GlobalValue *GV = GA->getGlobal();
21311   auto PtrVT = getPointerTy(DAG.getDataLayout());
21312   bool PositionIndependent = isPositionIndependent();
21313
21314   if (Subtarget.isTargetELF()) {
21315     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
21316     switch (model) {
21317       case TLSModel::GeneralDynamic:
21318         if (Subtarget.is64Bit()) {
21319           if (Subtarget.isTarget64BitLP64())
21320             return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
21321           return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
21322         }
21323         return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
21324       case TLSModel::LocalDynamic:
21325         return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
21326                                            Subtarget.isTarget64BitLP64());
21327       case TLSModel::InitialExec:
21328       case TLSModel::LocalExec:
21329         return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
21330                                    PositionIndependent);
21331     }
21332     llvm_unreachable("Unknown TLS model.");
21333   }
21334
21335   if (Subtarget.isTargetDarwin()) {
21336     // Darwin only has one model of TLS.  Lower to that.
21337     unsigned char OpFlag = 0;
21338     unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
21339                            X86ISD::WrapperRIP : X86ISD::Wrapper;
21340
21341     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
21342     // global base reg.
21343     bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
21344     if (PIC32)
21345       OpFlag = X86II::MO_TLVP_PIC_BASE;
21346     else
21347       OpFlag = X86II::MO_TLVP;
21348     SDLoc DL(Op);
21349     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
21350                                                 GA->getValueType(0),
21351                                                 GA->getOffset(), OpFlag);
21352     SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
21353
21354     // With PIC32, the address is actually $g + Offset.
21355     if (PIC32)
21356       Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
21357                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21358                            Offset);
21359
21360     // Lowering the machine isd will make sure everything is in the right
21361     // location.
21362     SDValue Chain = DAG.getEntryNode();
21363     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21364     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
21365     SDValue Args[] = { Chain, Offset };
21366     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
21367     Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
21368
21369     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
21370     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21371     MFI.setAdjustsStack(true);
21372
21373     // And our return value (tls address) is in the standard call return value
21374     // location.
21375     unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
21376     return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
21377   }
21378
21379   if (Subtarget.isOSWindows()) {
21380     // Just use the implicit TLS architecture
21381     // Need to generate something similar to:
21382     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
21383     //                                  ; from TEB
21384     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
21385     //   mov     rcx, qword [rdx+rcx*8]
21386     //   mov     eax, .tls$:tlsvar
21387     //   [rax+rcx] contains the address
21388     // Windows 64bit: gs:0x58
21389     // Windows 32bit: fs:__tls_array
21390
21391     SDLoc dl(GA);
21392     SDValue Chain = DAG.getEntryNode();
21393
21394     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
21395     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
21396     // use its literal value of 0x2C.
21397     Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
21398                                         ? Type::getInt8PtrTy(*DAG.getContext(),
21399                                                              256)
21400                                         : Type::getInt32PtrTy(*DAG.getContext(),
21401                                                               257));
21402
21403     SDValue TlsArray = Subtarget.is64Bit()
21404                            ? DAG.getIntPtrConstant(0x58, dl)
21405                            : (Subtarget.isTargetWindowsGNU()
21406                                   ? DAG.getIntPtrConstant(0x2C, dl)
21407                                   : DAG.getExternalSymbol("_tls_array", PtrVT));
21408
21409     SDValue ThreadPointer =
21410         DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
21411
21412     SDValue res;
21413     if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
21414       res = ThreadPointer;
21415     } else {
21416       // Load the _tls_index variable
21417       SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
21418       if (Subtarget.is64Bit())
21419         IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
21420                              MachinePointerInfo(), MVT::i32);
21421       else
21422         IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
21423
21424       const DataLayout &DL = DAG.getDataLayout();
21425       SDValue Scale =
21426           DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
21427       IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
21428
21429       res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
21430     }
21431
21432     res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
21433
21434     // Get the offset of start of .tls section
21435     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21436                                              GA->getValueType(0),
21437                                              GA->getOffset(), X86II::MO_SECREL);
21438     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
21439
21440     // The address of the thread local variable is the add of the thread
21441     // pointer with the offset of the variable.
21442     return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
21443   }
21444
21445   llvm_unreachable("TLS not implemented for this target.");
21446 }
21447
21448 /// Lower SRA_PARTS and friends, which return two i32 values
21449 /// and take a 2 x i32 value to shift plus a shift amount.
21450 /// TODO: Can this be moved to general expansion code?
21451 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
21452   SDValue Lo, Hi;
21453   DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
21454   return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
21455 }
21456
21457 // Try to use a packed vector operation to handle i64 on 32-bit targets when
21458 // AVX512DQ is enabled.
21459 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
21460                                         const X86Subtarget &Subtarget) {
21461   assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21462           Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21463           Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21464           Op.getOpcode() == ISD::UINT_TO_FP) &&
21465          "Unexpected opcode!");
21466   bool IsStrict = Op->isStrictFPOpcode();
21467   unsigned OpNo = IsStrict ? 1 : 0;
21468   SDValue Src = Op.getOperand(OpNo);
21469   MVT SrcVT = Src.getSimpleValueType();
21470   MVT VT = Op.getSimpleValueType();
21471
21472    if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
21473        (VT != MVT::f32 && VT != MVT::f64))
21474     return SDValue();
21475
21476   // Pack the i64 into a vector, do the operation and extract.
21477
21478   // Using 256-bit to ensure result is 128-bits for f32 case.
21479   unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
21480   MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
21481   MVT VecVT = MVT::getVectorVT(VT, NumElts);
21482
21483   SDLoc dl(Op);
21484   SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
21485   if (IsStrict) {
21486     SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
21487                                  {Op.getOperand(0), InVec});
21488     SDValue Chain = CvtVec.getValue(1);
21489     SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21490                                 DAG.getIntPtrConstant(0, dl));
21491     return DAG.getMergeValues({Value, Chain}, dl);
21492   }
21493
21494   SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
21495
21496   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21497                      DAG.getIntPtrConstant(0, dl));
21498 }
21499
21500 // Try to use a packed vector operation to handle i64 on 32-bit targets.
21501 static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
21502                                  const X86Subtarget &Subtarget) {
21503   assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21504           Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21505           Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21506           Op.getOpcode() == ISD::UINT_TO_FP) &&
21507          "Unexpected opcode!");
21508   bool IsStrict = Op->isStrictFPOpcode();
21509   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21510   MVT SrcVT = Src.getSimpleValueType();
21511   MVT VT = Op.getSimpleValueType();
21512
21513   if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
21514     return SDValue();
21515
21516   // Pack the i64 into a vector, do the operation and extract.
21517
21518   assert(Subtarget.hasFP16() && "Expected FP16");
21519
21520   SDLoc dl(Op);
21521   SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
21522   if (IsStrict) {
21523     SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
21524                                  {Op.getOperand(0), InVec});
21525     SDValue Chain = CvtVec.getValue(1);
21526     SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21527                                 DAG.getIntPtrConstant(0, dl));
21528     return DAG.getMergeValues({Value, Chain}, dl);
21529   }
21530
21531   SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
21532
21533   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21534                      DAG.getIntPtrConstant(0, dl));
21535 }
21536
21537 static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
21538                           const X86Subtarget &Subtarget) {
21539   switch (Opcode) {
21540     case ISD::SINT_TO_FP:
21541       // TODO: Handle wider types with AVX/AVX512.
21542       if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
21543         return false;
21544       // CVTDQ2PS or (V)CVTDQ2PD
21545       return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
21546
21547     case ISD::UINT_TO_FP:
21548       // TODO: Handle wider types and i64 elements.
21549       if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
21550         return false;
21551       // VCVTUDQ2PS or VCVTUDQ2PD
21552       return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
21553
21554     default:
21555       return false;
21556   }
21557 }
21558
21559 /// Given a scalar cast operation that is extracted from a vector, try to
21560 /// vectorize the cast op followed by extraction. This will avoid an expensive
21561 /// round-trip between XMM and GPR.
21562 static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
21563                                       const X86Subtarget &Subtarget) {
21564   // TODO: This could be enhanced to handle smaller integer types by peeking
21565   // through an extend.
21566   SDValue Extract = Cast.getOperand(0);
21567   MVT DestVT = Cast.getSimpleValueType();
21568   if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21569       !isa<ConstantSDNode>(Extract.getOperand(1)))
21570     return SDValue();
21571
21572   // See if we have a 128-bit vector cast op for this type of cast.
21573   SDValue VecOp = Extract.getOperand(0);
21574   MVT FromVT = VecOp.getSimpleValueType();
21575   unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
21576   MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
21577   MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
21578   if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
21579     return SDValue();
21580
21581   // If we are extracting from a non-zero element, first shuffle the source
21582   // vector to allow extracting from element zero.
21583   SDLoc DL(Cast);
21584   if (!isNullConstant(Extract.getOperand(1))) {
21585     SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
21586     Mask[0] = Extract.getConstantOperandVal(1);
21587     VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
21588   }
21589   // If the source vector is wider than 128-bits, extract the low part. Do not
21590   // create an unnecessarily wide vector cast op.
21591   if (FromVT != Vec128VT)
21592     VecOp = extract128BitVector(VecOp, 0, DAG, DL);
21593
21594   // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
21595   // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
21596   SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
21597   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
21598                      DAG.getIntPtrConstant(0, DL));
21599 }
21600
21601 /// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
21602 /// try to vectorize the cast ops. This will avoid an expensive round-trip
21603 /// between XMM and GPR.
21604 static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
21605                                 const X86Subtarget &Subtarget) {
21606   // TODO: Allow FP_TO_UINT.
21607   SDValue CastToInt = CastToFP.getOperand(0);
21608   MVT VT = CastToFP.getSimpleValueType();
21609   if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
21610     return SDValue();
21611
21612   MVT IntVT = CastToInt.getSimpleValueType();
21613   SDValue X = CastToInt.getOperand(0);
21614   MVT SrcVT = X.getSimpleValueType();
21615   if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
21616     return SDValue();
21617
21618   // See if we have 128-bit vector cast instructions for this type of cast.
21619   // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
21620   if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
21621       IntVT != MVT::i32)
21622     return SDValue();
21623
21624   unsigned SrcSize = SrcVT.getSizeInBits();
21625   unsigned IntSize = IntVT.getSizeInBits();
21626   unsigned VTSize = VT.getSizeInBits();
21627   MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
21628   MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
21629   MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
21630
21631   // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
21632   unsigned ToIntOpcode =
21633       SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
21634   unsigned ToFPOpcode =
21635       IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
21636
21637   // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
21638   //
21639   // We are not defining the high elements (for example, zero them) because
21640   // that could nullify any performance advantage that we hoped to gain from
21641   // this vector op hack. We do not expect any adverse effects (like denorm
21642   // penalties) with cast ops.
21643   SDLoc DL(CastToFP);
21644   SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
21645   SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
21646   SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
21647   SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
21648   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
21649 }
21650
21651 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
21652                                     const X86Subtarget &Subtarget) {
21653   SDLoc DL(Op);
21654   bool IsStrict = Op->isStrictFPOpcode();
21655   MVT VT = Op->getSimpleValueType(0);
21656   SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
21657
21658   if (Subtarget.hasDQI()) {
21659     assert(!Subtarget.hasVLX() && "Unexpected features");
21660
21661     assert((Src.getSimpleValueType() == MVT::v2i64 ||
21662             Src.getSimpleValueType() == MVT::v4i64) &&
21663            "Unsupported custom type");
21664
21665     // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
21666     assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
21667            "Unexpected VT!");
21668     MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21669
21670     // Need to concat with zero vector for strict fp to avoid spurious
21671     // exceptions.
21672     SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
21673                            : DAG.getUNDEF(MVT::v8i64);
21674     Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
21675                       DAG.getIntPtrConstant(0, DL));
21676     SDValue Res, Chain;
21677     if (IsStrict) {
21678       Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
21679                         {Op->getOperand(0), Src});
21680       Chain = Res.getValue(1);
21681     } else {
21682       Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
21683     }
21684
21685     Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21686                       DAG.getIntPtrConstant(0, DL));
21687
21688     if (IsStrict)
21689       return DAG.getMergeValues({Res, Chain}, DL);
21690     return Res;
21691   }
21692
21693   bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
21694                   Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
21695   if (VT != MVT::v4f32 || IsSigned)
21696     return SDValue();
21697
21698   SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
21699   SDValue One  = DAG.getConstant(1, DL, MVT::v4i64);
21700   SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
21701                              DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
21702                              DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
21703   SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
21704   SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
21705   SmallVector<SDValue, 4> SignCvts(4);
21706   SmallVector<SDValue, 4> Chains(4);
21707   for (int i = 0; i != 4; ++i) {
21708     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
21709                               DAG.getIntPtrConstant(i, DL));
21710     if (IsStrict) {
21711       SignCvts[i] =
21712           DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
21713                       {Op.getOperand(0), Elt});
21714       Chains[i] = SignCvts[i].getValue(1);
21715     } else {
21716       SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
21717     }
21718   }
21719   SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
21720
21721   SDValue Slow, Chain;
21722   if (IsStrict) {
21723     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
21724     Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
21725                        {Chain, SignCvt, SignCvt});
21726     Chain = Slow.getValue(1);
21727   } else {
21728     Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
21729   }
21730
21731   IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
21732   SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
21733
21734   if (IsStrict)
21735     return DAG.getMergeValues({Cvt, Chain}, DL);
21736
21737   return Cvt;
21738 }
21739
21740 static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
21741   bool IsStrict = Op->isStrictFPOpcode();
21742   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21743   SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21744   MVT VT = Op.getSimpleValueType();
21745   MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21746   SDLoc dl(Op);
21747
21748   SDValue Rnd = DAG.getIntPtrConstant(0, dl);
21749   if (IsStrict)
21750     return DAG.getNode(
21751         ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
21752         {Chain,
21753          DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
21754          Rnd});
21755   return DAG.getNode(ISD::FP_ROUND, dl, VT,
21756                      DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
21757 }
21758
21759 static bool isLegalConversion(MVT VT, bool IsSigned,
21760                               const X86Subtarget &Subtarget) {
21761   if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
21762     return true;
21763   if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
21764     return true;
21765   if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
21766     return true;
21767   if (Subtarget.useAVX512Regs()) {
21768     if (VT == MVT::v16i32)
21769       return true;
21770     if (VT == MVT::v8i64 && Subtarget.hasDQI())
21771       return true;
21772   }
21773   if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
21774       (VT == MVT::v2i64 || VT == MVT::v4i64))
21775     return true;
21776   return false;
21777 }
21778
21779 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
21780                                            SelectionDAG &DAG) const {
21781   bool IsStrict = Op->isStrictFPOpcode();
21782   unsigned OpNo = IsStrict ? 1 : 0;
21783   SDValue Src = Op.getOperand(OpNo);
21784   SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21785   MVT SrcVT = Src.getSimpleValueType();
21786   MVT VT = Op.getSimpleValueType();
21787   SDLoc dl(Op);
21788
21789   if (isSoftFP16(VT))
21790     return promoteXINT_TO_FP(Op, DAG);
21791   else if (isLegalConversion(SrcVT, true, Subtarget))
21792     return Op;
21793
21794   if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21795     return LowerWin64_INT128_TO_FP(Op, DAG);
21796
21797   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21798     return Extract;
21799
21800   if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
21801     return R;
21802
21803   if (SrcVT.isVector()) {
21804     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
21805       // Note: Since v2f64 is a legal type. We don't need to zero extend the
21806       // source for strict FP.
21807       if (IsStrict)
21808         return DAG.getNode(
21809             X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
21810             {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21811                                 DAG.getUNDEF(SrcVT))});
21812       return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
21813                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21814                                      DAG.getUNDEF(SrcVT)));
21815     }
21816     if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
21817       return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21818
21819     return SDValue();
21820   }
21821
21822   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
21823          "Unknown SINT_TO_FP to lower!");
21824
21825   bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
21826
21827   // These are really Legal; return the operand so the caller accepts it as
21828   // Legal.
21829   if (SrcVT == MVT::i32 && UseSSEReg)
21830     return Op;
21831   if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
21832     return Op;
21833
21834   if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21835     return V;
21836   if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21837     return V;
21838
21839   // SSE doesn't have an i16 conversion so we need to promote.
21840   if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
21841     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
21842     if (IsStrict)
21843       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
21844                          {Chain, Ext});
21845
21846     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
21847   }
21848
21849   if (VT == MVT::f128 || !Subtarget.hasX87())
21850     return SDValue();
21851
21852   SDValue ValueToStore = Src;
21853   if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
21854     // Bitcasting to f64 here allows us to do a single 64-bit store from
21855     // an SSE register, avoiding the store forwarding penalty that would come
21856     // with two 32-bit stores.
21857     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21858
21859   unsigned Size = SrcVT.getStoreSize();
21860   Align Alignment(Size);
21861   MachineFunction &MF = DAG.getMachineFunction();
21862   auto PtrVT = getPointerTy(MF.getDataLayout());
21863   int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
21864   MachinePointerInfo MPI =
21865       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21866   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21867   Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
21868   std::pair<SDValue, SDValue> Tmp =
21869       BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
21870
21871   if (IsStrict)
21872     return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21873
21874   return Tmp.first;
21875 }
21876
21877 std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
21878     EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
21879     MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
21880   // Build the FILD
21881   SDVTList Tys;
21882   bool useSSE = isScalarFPTypeInSSEReg(DstVT);
21883   if (useSSE)
21884     Tys = DAG.getVTList(MVT::f80, MVT::Other);
21885   else
21886     Tys = DAG.getVTList(DstVT, MVT::Other);
21887
21888   SDValue FILDOps[] = {Chain, Pointer};
21889   SDValue Result =
21890       DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
21891                               Alignment, MachineMemOperand::MOLoad);
21892   Chain = Result.getValue(1);
21893
21894   if (useSSE) {
21895     MachineFunction &MF = DAG.getMachineFunction();
21896     unsigned SSFISize = DstVT.getStoreSize();
21897     int SSFI =
21898         MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
21899     auto PtrVT = getPointerTy(MF.getDataLayout());
21900     SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21901     Tys = DAG.getVTList(MVT::Other);
21902     SDValue FSTOps[] = {Chain, Result, StackSlot};
21903     MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
21904         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
21905         MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
21906
21907     Chain =
21908         DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
21909     Result = DAG.getLoad(
21910         DstVT, DL, Chain, StackSlot,
21911         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
21912     Chain = Result.getValue(1);
21913   }
21914
21915   return { Result, Chain };
21916 }
21917
21918 /// Horizontal vector math instructions may be slower than normal math with
21919 /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
21920 /// implementation, and likely shuffle complexity of the alternate sequence.
21921 static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
21922                                   const X86Subtarget &Subtarget) {
21923   bool IsOptimizingSize = DAG.shouldOptForSize();
21924   bool HasFastHOps = Subtarget.hasFastHorizontalOps();
21925   return !IsSingleSource || IsOptimizingSize || HasFastHOps;
21926 }
21927
21928 /// 64-bit unsigned integer to double expansion.
21929 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
21930                                    const X86Subtarget &Subtarget) {
21931   // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
21932   // when converting 0 when rounding toward negative infinity. Caller will
21933   // fall back to Expand for when i64 or is legal or use FILD in 32-bit mode.
21934   assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
21935   // This algorithm is not obvious. Here it is what we're trying to output:
21936   /*
21937      movq       %rax,  %xmm0
21938      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
21939      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
21940      #ifdef __SSE3__
21941        haddpd   %xmm0, %xmm0
21942      #else
21943        pshufd   $0x4e, %xmm0, %xmm1
21944        addpd    %xmm1, %xmm0
21945      #endif
21946   */
21947
21948   SDLoc dl(Op);
21949   LLVMContext *Context = DAG.getContext();
21950
21951   // Build some magic constants.
21952   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
21953   Constant *C0 = ConstantDataVector::get(*Context, CV0);
21954   auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21955   SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
21956
21957   SmallVector<Constant*,2> CV1;
21958   CV1.push_back(
21959     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21960                                       APInt(64, 0x4330000000000000ULL))));
21961   CV1.push_back(
21962     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21963                                       APInt(64, 0x4530000000000000ULL))));
21964   Constant *C1 = ConstantVector::get(CV1);
21965   SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
21966
21967   // Load the 64-bit value into an XMM register.
21968   SDValue XR1 =
21969       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
21970   SDValue CLod0 = DAG.getLoad(
21971       MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
21972       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21973   SDValue Unpck1 =
21974       getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
21975
21976   SDValue CLod1 = DAG.getLoad(
21977       MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
21978       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21979   SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
21980   // TODO: Are there any fast-math-flags to propagate here?
21981   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
21982   SDValue Result;
21983
21984   if (Subtarget.hasSSE3() &&
21985       shouldUseHorizontalOp(true, DAG, Subtarget)) {
21986     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
21987   } else {
21988     SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
21989     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
21990   }
21991   Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
21992                        DAG.getIntPtrConstant(0, dl));
21993   return Result;
21994 }
21995
21996 /// 32-bit unsigned integer to float expansion.
21997 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
21998                                    const X86Subtarget &Subtarget) {
21999   unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
22000   SDLoc dl(Op);
22001   // FP constant to bias correct the final result.
22002   SDValue Bias = DAG.getConstantFP(
22003       llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
22004
22005   // Load the 32-bit value into an XMM register.
22006   SDValue Load =
22007       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
22008
22009   // Zero out the upper parts of the register.
22010   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
22011
22012   // Or the load with the bias.
22013   SDValue Or = DAG.getNode(
22014       ISD::OR, dl, MVT::v2i64,
22015       DAG.getBitcast(MVT::v2i64, Load),
22016       DAG.getBitcast(MVT::v2i64,
22017                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
22018   Or =
22019       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
22020                   DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
22021
22022   if (Op.getNode()->isStrictFPOpcode()) {
22023     // Subtract the bias.
22024     // TODO: Are there any fast-math-flags to propagate here?
22025     SDValue Chain = Op.getOperand(0);
22026     SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
22027                               {Chain, Or, Bias});
22028
22029     if (Op.getValueType() == Sub.getValueType())
22030       return Sub;
22031
22032     // Handle final rounding.
22033     std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
22034         Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
22035
22036     return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
22037   }
22038
22039   // Subtract the bias.
22040   // TODO: Are there any fast-math-flags to propagate here?
22041   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
22042
22043   // Handle final rounding.
22044   return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
22045 }
22046
22047 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
22048                                      const X86Subtarget &Subtarget,
22049                                      const SDLoc &DL) {
22050   if (Op.getSimpleValueType() != MVT::v2f64)
22051     return SDValue();
22052
22053   bool IsStrict = Op->isStrictFPOpcode();
22054
22055   SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
22056   assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
22057
22058   if (Subtarget.hasAVX512()) {
22059     if (!Subtarget.hasVLX()) {
22060       // Let generic type legalization widen this.
22061       if (!IsStrict)
22062         return SDValue();
22063       // Otherwise pad the integer input with 0s and widen the operation.
22064       N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
22065                        DAG.getConstant(0, DL, MVT::v2i32));
22066       SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
22067                                 {Op.getOperand(0), N0});
22068       SDValue Chain = Res.getValue(1);
22069       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
22070                         DAG.getIntPtrConstant(0, DL));
22071       return DAG.getMergeValues({Res, Chain}, DL);
22072     }
22073
22074     // Legalize to v4i32 type.
22075     N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
22076                      DAG.getUNDEF(MVT::v2i32));
22077     if (IsStrict)
22078       return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
22079                          {Op.getOperand(0), N0});
22080     return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
22081   }
22082
22083   // Zero extend to 2i64, OR with the floating point representation of 2^52.
22084   // This gives us the floating point equivalent of 2^52 + the i32 integer
22085   // since double has 52-bits of mantissa. Then subtract 2^52 in floating
22086   // point leaving just our i32 integers in double format.
22087   SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
22088   SDValue VBias = DAG.getConstantFP(
22089       llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
22090   SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
22091                            DAG.getBitcast(MVT::v2i64, VBias));
22092   Or = DAG.getBitcast(MVT::v2f64, Or);
22093
22094   if (IsStrict)
22095     return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
22096                        {Op.getOperand(0), Or, VBias});
22097   return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
22098 }
22099
22100 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
22101                                      const X86Subtarget &Subtarget) {
22102   SDLoc DL(Op);
22103   bool IsStrict = Op->isStrictFPOpcode();
22104   SDValue V = Op->getOperand(IsStrict ? 1 : 0);
22105   MVT VecIntVT = V.getSimpleValueType();
22106   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
22107          "Unsupported custom type");
22108
22109   if (Subtarget.hasAVX512()) {
22110     // With AVX512, but not VLX we need to widen to get a 512-bit result type.
22111     assert(!Subtarget.hasVLX() && "Unexpected features");
22112     MVT VT = Op->getSimpleValueType(0);
22113
22114     // v8i32->v8f64 is legal with AVX512 so just return it.
22115     if (VT == MVT::v8f64)
22116       return Op;
22117
22118     assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
22119            "Unexpected VT!");
22120     MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
22121     MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
22122     // Need to concat with zero vector for strict fp to avoid spurious
22123     // exceptions.
22124     SDValue Tmp =
22125         IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
22126     V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
22127                     DAG.getIntPtrConstant(0, DL));
22128     SDValue Res, Chain;
22129     if (IsStrict) {
22130       Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
22131                         {Op->getOperand(0), V});
22132       Chain = Res.getValue(1);
22133     } else {
22134       Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
22135     }
22136
22137     Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
22138                       DAG.getIntPtrConstant(0, DL));
22139
22140     if (IsStrict)
22141       return DAG.getMergeValues({Res, Chain}, DL);
22142     return Res;
22143   }
22144
22145   if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
22146       Op->getSimpleValueType(0) == MVT::v4f64) {
22147     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
22148     Constant *Bias = ConstantFP::get(
22149         *DAG.getContext(),
22150         APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
22151     auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
22152     SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
22153     SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
22154     SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
22155     SDValue VBias = DAG.getMemIntrinsicNode(
22156         X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
22157         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
22158         MachineMemOperand::MOLoad);
22159
22160     SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
22161                              DAG.getBitcast(MVT::v4i64, VBias));
22162     Or = DAG.getBitcast(MVT::v4f64, Or);
22163
22164     if (IsStrict)
22165       return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
22166                          {Op.getOperand(0), Or, VBias});
22167     return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
22168   }
22169
22170   // The algorithm is the following:
22171   // #ifdef __SSE4_1__
22172   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
22173   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
22174   //                                 (uint4) 0x53000000, 0xaa);
22175   // #else
22176   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
22177   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
22178   // #endif
22179   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22180   //     return (float4) lo + fhi;
22181
22182   bool Is128 = VecIntVT == MVT::v4i32;
22183   MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
22184   // If we convert to something else than the supported type, e.g., to v4f64,
22185   // abort early.
22186   if (VecFloatVT != Op->getSimpleValueType(0))
22187     return SDValue();
22188
22189   // In the #idef/#else code, we have in common:
22190   // - The vector of constants:
22191   // -- 0x4b000000
22192   // -- 0x53000000
22193   // - A shift:
22194   // -- v >> 16
22195
22196   // Create the splat vector for 0x4b000000.
22197   SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
22198   // Create the splat vector for 0x53000000.
22199   SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
22200
22201   // Create the right shift.
22202   SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
22203   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
22204
22205   SDValue Low, High;
22206   if (Subtarget.hasSSE41()) {
22207     MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
22208     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
22209     SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
22210     SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
22211     // Low will be bitcasted right away, so do not bother bitcasting back to its
22212     // original type.
22213     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
22214                       VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22215     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
22216     //                                 (uint4) 0x53000000, 0xaa);
22217     SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
22218     SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
22219     // High will be bitcasted right away, so do not bother bitcasting back to
22220     // its original type.
22221     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
22222                        VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22223   } else {
22224     SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
22225     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
22226     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
22227     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
22228
22229     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
22230     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
22231   }
22232
22233   // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
22234   SDValue VecCstFSub = DAG.getConstantFP(
22235       APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
22236
22237   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22238   // NOTE: By using fsub of a positive constant instead of fadd of a negative
22239   // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
22240   // enabled. See PR24512.
22241   SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
22242   // TODO: Are there any fast-math-flags to propagate here?
22243   //     (float4) lo;
22244   SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
22245   //     return (float4) lo + fhi;
22246   if (IsStrict) {
22247     SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
22248                                 {Op.getOperand(0), HighBitcast, VecCstFSub});
22249     return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
22250                        {FHigh.getValue(1), LowBitcast, FHigh});
22251   }
22252
22253   SDValue FHigh =
22254       DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
22255   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
22256 }
22257
22258 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
22259                                    const X86Subtarget &Subtarget) {
22260   unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
22261   SDValue N0 = Op.getOperand(OpNo);
22262   MVT SrcVT = N0.getSimpleValueType();
22263   SDLoc dl(Op);
22264
22265   switch (SrcVT.SimpleTy) {
22266   default:
22267     llvm_unreachable("Custom UINT_TO_FP is not supported!");
22268   case MVT::v2i32:
22269     return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
22270   case MVT::v4i32:
22271   case MVT::v8i32:
22272     return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
22273   case MVT::v2i64:
22274   case MVT::v4i64:
22275     return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
22276   }
22277 }
22278
22279 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
22280                                            SelectionDAG &DAG) const {
22281   bool IsStrict = Op->isStrictFPOpcode();
22282   unsigned OpNo = IsStrict ? 1 : 0;
22283   SDValue Src = Op.getOperand(OpNo);
22284   SDLoc dl(Op);
22285   auto PtrVT = getPointerTy(DAG.getDataLayout());
22286   MVT SrcVT = Src.getSimpleValueType();
22287   MVT DstVT = Op->getSimpleValueType(0);
22288   SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22289
22290   // Bail out when we don't have native conversion instructions.
22291   if (DstVT == MVT::f128)
22292     return SDValue();
22293
22294   if (isSoftFP16(DstVT))
22295     return promoteXINT_TO_FP(Op, DAG);
22296   else if (isLegalConversion(SrcVT, false, Subtarget))
22297     return Op;
22298
22299   if (DstVT.isVector())
22300     return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
22301
22302   if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
22303     return LowerWin64_INT128_TO_FP(Op, DAG);
22304
22305   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
22306     return Extract;
22307
22308   if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
22309       (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
22310     // Conversions from unsigned i32 to f32/f64 are legal,
22311     // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
22312     return Op;
22313   }
22314
22315   // Promote i32 to i64 and use a signed conversion on 64-bit targets.
22316   if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
22317     Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
22318     if (IsStrict)
22319       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
22320                          {Chain, Src});
22321     return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
22322   }
22323
22324   if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
22325     return V;
22326   if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
22327     return V;
22328
22329   // The transform for i64->f64 isn't correct for 0 when rounding to negative
22330   // infinity. It produces -0.0, so disable under strictfp.
22331   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
22332       !IsStrict)
22333     return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
22334   // The transform for i32->f64/f32 isn't correct for 0 when rounding to
22335   // negative infinity. So disable under strictfp. Using FILD instead.
22336   if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
22337       !IsStrict)
22338     return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
22339   if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
22340       (DstVT == MVT::f32 || DstVT == MVT::f64))
22341     return SDValue();
22342
22343   // Make a 64-bit buffer, and use it to build an FILD.
22344   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
22345   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
22346   Align SlotAlign(8);
22347   MachinePointerInfo MPI =
22348     MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
22349   if (SrcVT == MVT::i32) {
22350     SDValue OffsetSlot =
22351         DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
22352     SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
22353     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
22354                                   OffsetSlot, MPI.getWithOffset(4), SlotAlign);
22355     std::pair<SDValue, SDValue> Tmp =
22356         BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
22357     if (IsStrict)
22358       return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
22359
22360     return Tmp.first;
22361   }
22362
22363   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
22364   SDValue ValueToStore = Src;
22365   if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
22366     // Bitcasting to f64 here allows us to do a single 64-bit store from
22367     // an SSE register, avoiding the store forwarding penalty that would come
22368     // with two 32-bit stores.
22369     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
22370   }
22371   SDValue Store =
22372       DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
22373   // For i64 source, we need to add the appropriate power of 2 if the input
22374   // was negative. We must be careful to do the computation in x87 extended
22375   // precision, not in SSE.
22376   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22377   SDValue Ops[] = { Store, StackSlot };
22378   SDValue Fild =
22379       DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
22380                               SlotAlign, MachineMemOperand::MOLoad);
22381   Chain = Fild.getValue(1);
22382
22383
22384   // Check whether the sign bit is set.
22385   SDValue SignSet = DAG.getSetCC(
22386       dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
22387       Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
22388
22389   // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
22390   APInt FF(64, 0x5F80000000000000ULL);
22391   SDValue FudgePtr = DAG.getConstantPool(
22392       ConstantInt::get(*DAG.getContext(), FF), PtrVT);
22393   Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
22394
22395   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
22396   SDValue Zero = DAG.getIntPtrConstant(0, dl);
22397   SDValue Four = DAG.getIntPtrConstant(4, dl);
22398   SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
22399   FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
22400
22401   // Load the value out, extending it from f32 to f80.
22402   SDValue Fudge = DAG.getExtLoad(
22403       ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
22404       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
22405       CPAlignment);
22406   Chain = Fudge.getValue(1);
22407   // Extend everything to 80 bits to force it to be done on x87.
22408   // TODO: Are there any fast-math-flags to propagate here?
22409   if (IsStrict) {
22410     unsigned Opc = ISD::STRICT_FADD;
22411     // Windows needs the precision control changed to 80bits around this add.
22412     if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22413       Opc = X86ISD::STRICT_FP80_ADD;
22414
22415     SDValue Add =
22416         DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
22417     // STRICT_FP_ROUND can't handle equal types.
22418     if (DstVT == MVT::f80)
22419       return Add;
22420     return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
22421                        {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
22422   }
22423   unsigned Opc = ISD::FADD;
22424   // Windows needs the precision control changed to 80bits around this add.
22425   if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22426     Opc = X86ISD::FP80_ADD;
22427
22428   SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
22429   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
22430                      DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22431 }
22432
22433 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
22434 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
22435 // just return an SDValue().
22436 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
22437 // to i16, i32 or i64, and we lower it to a legal sequence and return the
22438 // result.
22439 SDValue
22440 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
22441                                    bool IsSigned, SDValue &Chain) const {
22442   bool IsStrict = Op->isStrictFPOpcode();
22443   SDLoc DL(Op);
22444
22445   EVT DstTy = Op.getValueType();
22446   SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
22447   EVT TheVT = Value.getValueType();
22448   auto PtrVT = getPointerTy(DAG.getDataLayout());
22449
22450   if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
22451     // f16 must be promoted before using the lowering in this routine.
22452     // fp128 does not use this lowering.
22453     return SDValue();
22454   }
22455
22456   // If using FIST to compute an unsigned i64, we'll need some fixup
22457   // to handle values above the maximum signed i64.  A FIST is always
22458   // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
22459   bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
22460
22461   // FIXME: This does not generate an invalid exception if the input does not
22462   // fit in i32. PR44019
22463   if (!IsSigned && DstTy != MVT::i64) {
22464     // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
22465     // The low 32 bits of the fist result will have the correct uint32 result.
22466     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
22467     DstTy = MVT::i64;
22468   }
22469
22470   assert(DstTy.getSimpleVT() <= MVT::i64 &&
22471          DstTy.getSimpleVT() >= MVT::i16 &&
22472          "Unknown FP_TO_INT to lower!");
22473
22474   // We lower FP->int64 into FISTP64 followed by a load from a temporary
22475   // stack slot.
22476   MachineFunction &MF = DAG.getMachineFunction();
22477   unsigned MemSize = DstTy.getStoreSize();
22478   int SSFI =
22479       MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
22480   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
22481
22482   Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22483
22484   SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
22485
22486   if (UnsignedFixup) {
22487     //
22488     // Conversion to unsigned i64 is implemented with a select,
22489     // depending on whether the source value fits in the range
22490     // of a signed i64.  Let Thresh be the FP equivalent of
22491     // 0x8000000000000000ULL.
22492     //
22493     //  Adjust = (Value >= Thresh) ? 0x80000000 : 0;
22494     //  FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
22495     //  FistSrc = (Value - FltOfs);
22496     //  Fist-to-mem64 FistSrc
22497     //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
22498     //  to XOR'ing the high 32 bits with Adjust.
22499     //
22500     // Being a power of 2, Thresh is exactly representable in all FP formats.
22501     // For X87 we'd like to use the smallest FP type for this constant, but
22502     // for DAG type consistency we have to match the FP operand type.
22503
22504     APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
22505     LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
22506     bool LosesInfo = false;
22507     if (TheVT == MVT::f64)
22508       // The rounding mode is irrelevant as the conversion should be exact.
22509       Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
22510                               &LosesInfo);
22511     else if (TheVT == MVT::f80)
22512       Status = Thresh.convert(APFloat::x87DoubleExtended(),
22513                               APFloat::rmNearestTiesToEven, &LosesInfo);
22514
22515     assert(Status == APFloat::opOK && !LosesInfo &&
22516            "FP conversion should have been exact");
22517
22518     SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
22519
22520     EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
22521                                    *DAG.getContext(), TheVT);
22522     SDValue Cmp;
22523     if (IsStrict) {
22524       Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
22525                          /*IsSignaling*/ true);
22526       Chain = Cmp.getValue(1);
22527     } else {
22528       Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
22529     }
22530
22531     // Our preferred lowering of
22532     //
22533     // (Value >= Thresh) ? 0x8000000000000000ULL : 0
22534     //
22535     // is
22536     //
22537     // (Value >= Thresh) << 63
22538     //
22539     // but since we can get here after LegalOperations, DAGCombine might do the
22540     // wrong thing if we create a select. So, directly create the preferred
22541     // version.
22542     SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
22543     SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
22544     Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
22545
22546     SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
22547                                    DAG.getConstantFP(0.0, DL, TheVT));
22548
22549     if (IsStrict) {
22550       Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
22551                           { Chain, Value, FltOfs });
22552       Chain = Value.getValue(1);
22553     } else
22554       Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
22555   }
22556
22557   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
22558
22559   // FIXME This causes a redundant load/store if the SSE-class value is already
22560   // in memory, such as if it is on the callstack.
22561   if (isScalarFPTypeInSSEReg(TheVT)) {
22562     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
22563     Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
22564     SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22565     SDValue Ops[] = { Chain, StackSlot };
22566
22567     unsigned FLDSize = TheVT.getStoreSize();
22568     assert(FLDSize <= MemSize && "Stack slot not big enough");
22569     MachineMemOperand *MMO = MF.getMachineMemOperand(
22570         MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
22571     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
22572     Chain = Value.getValue(1);
22573   }
22574
22575   // Build the FP_TO_INT*_IN_MEM
22576   MachineMemOperand *MMO = MF.getMachineMemOperand(
22577       MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
22578   SDValue Ops[] = { Chain, Value, StackSlot };
22579   SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
22580                                          DAG.getVTList(MVT::Other),
22581                                          Ops, DstTy, MMO);
22582
22583   SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
22584   Chain = Res.getValue(1);
22585
22586   // If we need an unsigned fixup, XOR the result with adjust.
22587   if (UnsignedFixup)
22588     Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
22589
22590   return Res;
22591 }
22592
22593 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
22594                               const X86Subtarget &Subtarget) {
22595   MVT VT = Op.getSimpleValueType();
22596   SDValue In = Op.getOperand(0);
22597   MVT InVT = In.getSimpleValueType();
22598   SDLoc dl(Op);
22599   unsigned Opc = Op.getOpcode();
22600
22601   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
22602   assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
22603          "Unexpected extension opcode");
22604   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22605          "Expected same number of elements");
22606   assert((VT.getVectorElementType() == MVT::i16 ||
22607           VT.getVectorElementType() == MVT::i32 ||
22608           VT.getVectorElementType() == MVT::i64) &&
22609          "Unexpected element type");
22610   assert((InVT.getVectorElementType() == MVT::i8 ||
22611           InVT.getVectorElementType() == MVT::i16 ||
22612           InVT.getVectorElementType() == MVT::i32) &&
22613          "Unexpected element type");
22614
22615   unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
22616
22617   if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
22618     assert(InVT == MVT::v32i8 && "Unexpected VT!");
22619     return splitVectorIntUnary(Op, DAG);
22620   }
22621
22622   if (Subtarget.hasInt256())
22623     return Op;
22624
22625   // Optimize vectors in AVX mode:
22626   //
22627   //   v8i16 -> v8i32
22628   //   Use vpmovzwd for 4 lower elements  v8i16 -> v4i32.
22629   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
22630   //   Concat upper and lower parts.
22631   //
22632   //   v4i32 -> v4i64
22633   //   Use vpmovzdq for 4 lower elements  v4i32 -> v2i64.
22634   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
22635   //   Concat upper and lower parts.
22636   //
22637   MVT HalfVT = VT.getHalfNumVectorElementsVT();
22638   SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
22639
22640   // Short-circuit if we can determine that each 128-bit half is the same value.
22641   // Otherwise, this is difficult to match and optimize.
22642   if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
22643     if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
22644       return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
22645
22646   SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
22647   SDValue Undef = DAG.getUNDEF(InVT);
22648   bool NeedZero = Opc == ISD::ZERO_EXTEND;
22649   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
22650   OpHi = DAG.getBitcast(HalfVT, OpHi);
22651
22652   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
22653 }
22654
22655 // Helper to split and extend a v16i1 mask to v16i8 or v16i16.
22656 static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
22657                                    const SDLoc &dl, SelectionDAG &DAG) {
22658   assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
22659   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22660                            DAG.getIntPtrConstant(0, dl));
22661   SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22662                            DAG.getIntPtrConstant(8, dl));
22663   Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
22664   Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
22665   SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
22666   return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22667 }
22668
22669 static  SDValue LowerZERO_EXTEND_Mask(SDValue Op,
22670                                       const X86Subtarget &Subtarget,
22671                                       SelectionDAG &DAG) {
22672   MVT VT = Op->getSimpleValueType(0);
22673   SDValue In = Op->getOperand(0);
22674   MVT InVT = In.getSimpleValueType();
22675   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
22676   SDLoc DL(Op);
22677   unsigned NumElts = VT.getVectorNumElements();
22678
22679   // For all vectors, but vXi8 we can just emit a sign_extend and a shift. This
22680   // avoids a constant pool load.
22681   if (VT.getVectorElementType() != MVT::i8) {
22682     SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
22683     return DAG.getNode(ISD::SRL, DL, VT, Extend,
22684                        DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
22685   }
22686
22687   // Extend VT if BWI is not supported.
22688   MVT ExtVT = VT;
22689   if (!Subtarget.hasBWI()) {
22690     // If v16i32 is to be avoided, we'll need to split and concatenate.
22691     if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
22692       return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
22693
22694     ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
22695   }
22696
22697   // Widen to 512-bits if VLX is not supported.
22698   MVT WideVT = ExtVT;
22699   if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
22700     NumElts *= 512 / ExtVT.getSizeInBits();
22701     InVT = MVT::getVectorVT(MVT::i1, NumElts);
22702     In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
22703                      In, DAG.getIntPtrConstant(0, DL));
22704     WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
22705                               NumElts);
22706   }
22707
22708   SDValue One = DAG.getConstant(1, DL, WideVT);
22709   SDValue Zero = DAG.getConstant(0, DL, WideVT);
22710
22711   SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
22712
22713   // Truncate if we had to extend above.
22714   if (VT != ExtVT) {
22715     WideVT = MVT::getVectorVT(MVT::i8, NumElts);
22716     SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
22717   }
22718
22719   // Extract back to 128/256-bit if we widened.
22720   if (WideVT != VT)
22721     SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
22722                               DAG.getIntPtrConstant(0, DL));
22723
22724   return SelectedVal;
22725 }
22726
22727 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
22728                                 SelectionDAG &DAG) {
22729   SDValue In = Op.getOperand(0);
22730   MVT SVT = In.getSimpleValueType();
22731
22732   if (SVT.getVectorElementType() == MVT::i1)
22733     return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
22734
22735   assert(Subtarget.hasAVX() && "Expected AVX support");
22736   return LowerAVXExtend(Op, DAG, Subtarget);
22737 }
22738
22739 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
22740 /// It makes use of the fact that vectors with enough leading sign/zero bits
22741 /// prevent the PACKSS/PACKUS from saturating the results.
22742 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
22743 /// within each 128-bit lane.
22744 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
22745                                       const SDLoc &DL, SelectionDAG &DAG,
22746                                       const X86Subtarget &Subtarget) {
22747   assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
22748          "Unexpected PACK opcode");
22749   assert(DstVT.isVector() && "VT not a vector?");
22750
22751   // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
22752   if (!Subtarget.hasSSE2())
22753     return SDValue();
22754
22755   EVT SrcVT = In.getValueType();
22756
22757   // No truncation required, we might get here due to recursive calls.
22758   if (SrcVT == DstVT)
22759     return In;
22760
22761   // We only support vector truncation to 64bits or greater from a
22762   // 128bits or greater source.
22763   unsigned DstSizeInBits = DstVT.getSizeInBits();
22764   unsigned SrcSizeInBits = SrcVT.getSizeInBits();
22765   if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
22766     return SDValue();
22767
22768   unsigned NumElems = SrcVT.getVectorNumElements();
22769   if (!isPowerOf2_32(NumElems))
22770     return SDValue();
22771
22772   LLVMContext &Ctx = *DAG.getContext();
22773   assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
22774   assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
22775
22776   EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
22777
22778   // Pack to the largest type possible:
22779   // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
22780   EVT InVT = MVT::i16, OutVT = MVT::i8;
22781   if (SrcVT.getScalarSizeInBits() > 16 &&
22782       (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
22783     InVT = MVT::i32;
22784     OutVT = MVT::i16;
22785   }
22786
22787   // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
22788   if (SrcVT.is128BitVector()) {
22789     InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
22790     OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
22791     In = DAG.getBitcast(InVT, In);
22792     SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
22793     Res = extractSubVector(Res, 0, DAG, DL, 64);
22794     return DAG.getBitcast(DstVT, Res);
22795   }
22796
22797   // Split lower/upper subvectors.
22798   SDValue Lo, Hi;
22799   std::tie(Lo, Hi) = splitVector(In, DAG, DL);
22800
22801   unsigned SubSizeInBits = SrcSizeInBits / 2;
22802   InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
22803   OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
22804
22805   // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
22806   if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
22807     Lo = DAG.getBitcast(InVT, Lo);
22808     Hi = DAG.getBitcast(InVT, Hi);
22809     SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22810     return DAG.getBitcast(DstVT, Res);
22811   }
22812
22813   // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
22814   // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
22815   if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
22816     Lo = DAG.getBitcast(InVT, Lo);
22817     Hi = DAG.getBitcast(InVT, Hi);
22818     SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22819
22820     // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
22821     // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
22822     // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
22823     SmallVector<int, 64> Mask;
22824     int Scale = 64 / OutVT.getScalarSizeInBits();
22825     narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
22826     Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
22827
22828     if (DstVT.is256BitVector())
22829       return DAG.getBitcast(DstVT, Res);
22830
22831     // If 512bit -> 128bit truncate another stage.
22832     EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22833     Res = DAG.getBitcast(PackedVT, Res);
22834     return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22835   }
22836
22837   // Recursively pack lower/upper subvectors, concat result and pack again.
22838   assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
22839   EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
22840   Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
22841   Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
22842
22843   PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22844   SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
22845   return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22846 }
22847
22848 /// Truncate using ISD::AND mask and X86ISD::PACKUS.
22849 /// e.g. trunc <8 x i32> X to <8 x i16> -->
22850 /// MaskX = X & 0xffff (clear high bits to prevent saturation)
22851 /// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
22852 static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
22853                                         const X86Subtarget &Subtarget,
22854                                         SelectionDAG &DAG) {
22855   EVT SrcVT = In.getValueType();
22856   APInt Mask = APInt::getLowBitsSet(SrcVT.getScalarSizeInBits(),
22857                                     DstVT.getScalarSizeInBits());
22858   In = DAG.getNode(ISD::AND, DL, SrcVT, In, DAG.getConstant(Mask, DL, SrcVT));
22859   return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
22860 }
22861
22862 /// Truncate using inreg sign extension and X86ISD::PACKSS.
22863 static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
22864                                         const X86Subtarget &Subtarget,
22865                                         SelectionDAG &DAG) {
22866   EVT SrcVT = In.getValueType();
22867   In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
22868                    DAG.getValueType(DstVT));
22869   return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
22870 }
22871
22872 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
22873                                   const X86Subtarget &Subtarget) {
22874
22875   SDLoc DL(Op);
22876   MVT VT = Op.getSimpleValueType();
22877   SDValue In = Op.getOperand(0);
22878   MVT InVT = In.getSimpleValueType();
22879
22880   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
22881
22882   // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
22883   unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
22884   if (InVT.getScalarSizeInBits() <= 16) {
22885     if (Subtarget.hasBWI()) {
22886       // legal, will go to VPMOVB2M, VPMOVW2M
22887       if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22888         // We need to shift to get the lsb into sign position.
22889         // Shift packed bytes not supported natively, bitcast to word
22890         MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
22891         In = DAG.getNode(ISD::SHL, DL, ExtVT,
22892                          DAG.getBitcast(ExtVT, In),
22893                          DAG.getConstant(ShiftInx, DL, ExtVT));
22894         In = DAG.getBitcast(InVT, In);
22895       }
22896       return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
22897                           In, ISD::SETGT);
22898     }
22899     // Use TESTD/Q, extended vector to packed dword/qword.
22900     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
22901            "Unexpected vector type.");
22902     unsigned NumElts = InVT.getVectorNumElements();
22903     assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
22904     // We need to change to a wider element type that we have support for.
22905     // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
22906     // For 16 element vectors we extend to v16i32 unless we are explicitly
22907     // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
22908     // we need to split into two 8 element vectors which we can extend to v8i32,
22909     // truncate and concat the results. There's an additional complication if
22910     // the original type is v16i8. In that case we can't split the v16i8
22911     // directly, so we need to shuffle high elements to low and use
22912     // sign_extend_vector_inreg.
22913     if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
22914       SDValue Lo, Hi;
22915       if (InVT == MVT::v16i8) {
22916         Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
22917         Hi = DAG.getVectorShuffle(
22918             InVT, DL, In, In,
22919             {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
22920         Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
22921       } else {
22922         assert(InVT == MVT::v16i16 && "Unexpected VT!");
22923         Lo = extract128BitVector(In, 0, DAG, DL);
22924         Hi = extract128BitVector(In, 8, DAG, DL);
22925       }
22926       // We're split now, just emit two truncates and a concat. The two
22927       // truncates will trigger legalization to come back to this function.
22928       Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
22929       Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
22930       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22931     }
22932     // We either have 8 elements or we're allowed to use 512-bit vectors.
22933     // If we have VLX, we want to use the narrowest vector that can get the
22934     // job done so we use vXi32.
22935     MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
22936     MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
22937     In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
22938     InVT = ExtVT;
22939     ShiftInx = InVT.getScalarSizeInBits() - 1;
22940   }
22941
22942   if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22943     // We need to shift to get the lsb into sign position.
22944     In = DAG.getNode(ISD::SHL, DL, InVT, In,
22945                      DAG.getConstant(ShiftInx, DL, InVT));
22946   }
22947   // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
22948   if (Subtarget.hasDQI())
22949     return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
22950   return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
22951 }
22952
22953 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
22954   SDLoc DL(Op);
22955   MVT VT = Op.getSimpleValueType();
22956   SDValue In = Op.getOperand(0);
22957   MVT InVT = In.getSimpleValueType();
22958   unsigned InNumEltBits = InVT.getScalarSizeInBits();
22959
22960   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22961          "Invalid TRUNCATE operation");
22962
22963   // If we're called by the type legalizer, handle a few cases.
22964   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22965   if (!TLI.isTypeLegal(InVT)) {
22966     if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
22967         VT.is128BitVector()) {
22968       assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
22969              "Unexpected subtarget!");
22970       // The default behavior is to truncate one step, concatenate, and then
22971       // truncate the remainder. We'd rather produce two 64-bit results and
22972       // concatenate those.
22973       SDValue Lo, Hi;
22974       std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
22975
22976       EVT LoVT, HiVT;
22977       std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22978
22979       Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
22980       Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
22981       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22982     }
22983
22984     // Otherwise let default legalization handle it.
22985     return SDValue();
22986   }
22987
22988   if (VT.getVectorElementType() == MVT::i1)
22989     return LowerTruncateVecI1(Op, DAG, Subtarget);
22990
22991   unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
22992   unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
22993
22994   // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
22995   // concat from subvectors to use VPTRUNC etc.
22996   if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG)) {
22997     // Truncate with PACKUS if we are truncating a vector with leading zero
22998     // bits that extend all the way to the packed/truncated value. Pre-SSE41
22999     // we can only use PACKUSWB.
23000     KnownBits Known = DAG.computeKnownBits(In);
23001     if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
23002       if (SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG,
23003                                              Subtarget))
23004         return V;
23005
23006     // Truncate with PACKSS if we are truncating a vector with sign-bits
23007     // that extend all the way to the packed/truncated value.
23008     if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
23009       if (SDValue V = truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG,
23010                                              Subtarget))
23011         return V;
23012   }
23013
23014   // vpmovqb/w/d, vpmovdb/w, vpmovwb
23015   if (Subtarget.hasAVX512()) {
23016     if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
23017       assert(VT == MVT::v32i8 && "Unexpected VT!");
23018       return splitVectorIntUnary(Op, DAG);
23019     }
23020
23021     // word to byte only under BWI. Otherwise we have to promoted to v16i32
23022     // and then truncate that. But we should only do that if we haven't been
23023     // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
23024     // handled by isel patterns.
23025     if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
23026         Subtarget.canExtendTo512DQ())
23027       return Op;
23028   }
23029
23030   // Handle truncation of V256 to V128 using shuffles.
23031   assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
23032
23033   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
23034     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
23035     if (Subtarget.hasInt256()) {
23036       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
23037       In = DAG.getBitcast(MVT::v8i32, In);
23038       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
23039       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
23040                          DAG.getIntPtrConstant(0, DL));
23041     }
23042
23043     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
23044                                DAG.getIntPtrConstant(0, DL));
23045     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
23046                                DAG.getIntPtrConstant(2, DL));
23047     static const int ShufMask[] = {0, 2, 4, 6};
23048     return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
23049                                 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
23050   }
23051
23052   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
23053     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
23054     if (Subtarget.hasInt256()) {
23055       // The PSHUFB mask:
23056       static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
23057                                       -1, -1, -1, -1, -1, -1, -1, -1,
23058                                       16, 17, 20, 21, 24, 25, 28, 29,
23059                                       -1, -1, -1, -1, -1, -1, -1, -1 };
23060       In = DAG.getBitcast(MVT::v32i8, In);
23061       In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
23062       In = DAG.getBitcast(MVT::v4i64, In);
23063
23064       static const int ShufMask2[] = {0, 2, -1, -1};
23065       In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
23066       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
23067                        DAG.getIntPtrConstant(0, DL));
23068       return DAG.getBitcast(MVT::v8i16, In);
23069     }
23070
23071     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
23072                                DAG.getIntPtrConstant(0, DL));
23073     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
23074                                DAG.getIntPtrConstant(4, DL));
23075
23076     // The PSHUFB mask:
23077     static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1};
23078
23079     OpLo = DAG.getBitcast(MVT::v8i16, OpLo);
23080     OpHi = DAG.getBitcast(MVT::v8i16, OpHi);
23081
23082     OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1);
23083     OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1);
23084
23085     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
23086     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
23087
23088     // The MOVLHPS Mask:
23089     static const int ShufMask2[] = {0, 1, 4, 5};
23090     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
23091     return DAG.getBitcast(MVT::v8i16, res);
23092   }
23093
23094   if (VT == MVT::v16i8 && InVT == MVT::v16i16)
23095     return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
23096
23097   llvm_unreachable("All 256->128 cases should have been handled above!");
23098 }
23099
23100 // We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
23101 // behaves on out of range inputs to generate optimized conversions.
23102 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
23103                                     SelectionDAG &DAG,
23104                                     const X86Subtarget &Subtarget) {
23105   MVT SrcVT = Src.getSimpleValueType();
23106   unsigned DstBits = VT.getScalarSizeInBits();
23107   assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
23108
23109   // Calculate the converted result for values in the range 0 to
23110   // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
23111   SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
23112   SDValue Big =
23113       DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
23114                   DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
23115                               DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
23116
23117   // The "CVTTP2SI" instruction conveniently sets the sign bit if
23118   // and only if the value was out of range. So we can use that
23119   // as our indicator that we rather use "Big" instead of "Small".
23120   //
23121   // Use "Small" if "IsOverflown" has all bits cleared
23122   // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
23123
23124   // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
23125   // use the slightly slower blendv select instead.
23126   if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
23127     SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
23128     return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
23129   }
23130
23131   SDValue IsOverflown =
23132       DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
23133                   DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
23134   return DAG.getNode(ISD::OR, dl, VT, Small,
23135                      DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
23136 }
23137
23138 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
        // Custom lowering for ISD::FP_TO_SINT / ISD::FP_TO_UINT and their STRICT_
        // counterparts.
        //
        // The returned value is one of:
        //  * Op itself, when the node is kept as is;
        //  * an empty SDValue, when no replacement is built here (the comments
        //    below call this "default expansion");
        //  * a replacement value. For strict nodes the replacement also carries
        //    the output chain, either through getMergeValues({Res, Chain}) or
        //    because the returned node is itself a strict node with an
        //    MVT::Other result.
23139   bool IsStrict = Op->isStrictFPOpcode();
23140   bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
23141                   Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
23142   MVT VT = Op->getSimpleValueType(0);
        // Strict nodes take the chain as operand 0, so the FP source is operand 1.
23143   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23144   SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
23145   MVT SrcVT = Src.getSimpleValueType();
23146   SDLoc dl(Op);
23147
23148   SDValue Res;
23149   if (isSoftFP16(SrcVT)) {
          // Soft f16 source: extend the source to f32 (NVT is the result type with
          // its element type replaced by f32) and redo the same conversion opcode
          // on the extended value.
23150     MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
23151     if (IsStrict)
23152       return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
23153                          {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
23154                                              {NVT, MVT::Other}, {Chain, Src})});
23155     return DAG.getNode(Op.getOpcode(), dl, VT,
23156                        DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
23157   } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
          // Legal source type and legal conversion: keep the node.
23158     return Op;
23159   }
23160
23161   if (VT.isVector()) {
          // v2f64 -> v2i1: convert to i32 elements, truncate those to i1 and keep
          // the low two lanes.
23162     if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
23163       MVT ResVT = MVT::v4i32;
23164       MVT TruncVT = MVT::v4i1;
23165       unsigned Opc;
23166       if (IsStrict)
23167         Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
23168       else
23169         Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
23170
23171       if (!IsSigned && !Subtarget.hasVLX()) {
23172         assert(Subtarget.useAVX512Regs() && "Unexpected features!");
23173         // Widen to 512-bits.
23174         ResVT = MVT::v8i32;
23175         TruncVT = MVT::v8i1;
              // The widened conversion re-uses the node's original opcode instead
              // of the X86ISD opcode selected above.
23176         Opc = Op.getOpcode();
23177         // Need to concat with zero vector for strict fp to avoid spurious
23178         // exceptions.
23179         // TODO: Should we just do this for non-strict as well?
23180         SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
23181                                : DAG.getUNDEF(MVT::v8f64);
23182         Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
23183                           DAG.getIntPtrConstant(0, dl));
23184       }
23185       if (IsStrict) {
23186         Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
23187         Chain = Res.getValue(1);
23188       } else {
23189         Res = DAG.getNode(Opc, dl, ResVT, Src);
23190       }
23191
23192       Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
23193       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
23194                         DAG.getIntPtrConstant(0, dl));
23195       if (IsStrict)
23196         return DAG.getMergeValues({Res, Chain}, dl);
23197       return Res;
23198     }
23199
          // Vector sources with f16 elements on subtargets reporting hasFP16().
23200     if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
            // These i16 result types are returned unchanged.
23201       if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
23202         return Op;
23203
            // i64 results keep VT. Narrower elements are produced in a 128-bit
            // vector: v4i32 for i32 elements, v8i16 for everything else.
23204       MVT ResVT = VT;
23205       MVT EleVT = VT.getVectorElementType();
23206       if (EleVT != MVT::i64)
23207         ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
23208
            // Pad the source up to v8f16. Strict nodes pad with zeros rather than
            // undef. A v2f16 source needs four SrcVT-sized pieces; any other
            // source gets two (presumably only v4f16 reaches this point; confirm).
23209       if (SrcVT != MVT::v8f16) {
23210         SDValue Tmp =
23211             IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
23212         SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
23213         Ops[0] = Src;
23214         Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
23215       }
23216
23217       if (IsStrict) {
23218         Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
23219                                    : X86ISD::STRICT_CVTTP2UI,
23220                           dl, {ResVT, MVT::Other}, {Chain, Src});
23221         Chain = Res.getValue(1);
23222       } else {
23223         Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
23224                           ResVT, Src);
23225       }
23226
23227       // TODO: Need to add exception check code for strict FP.
            // Elements narrower than 16 bits were converted as v8i16 above;
            // truncate them to eight elements of the requested type.
23228       if (EleVT.getSizeInBits() < 16) {
23229         ResVT = MVT::getVectorVT(EleVT, 8);
23230         Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
23231       }
23232
            // Drop the extra lanes if the conversion was done in a wider type.
23233       if (ResVT != VT)
23234         Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23235                           DAG.getIntPtrConstant(0, dl));
23236
23237       if (IsStrict)
23238         return DAG.getMergeValues({Res, Chain}, dl);
23239       return Res;
23240     }
23241
23242     // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
23243     if (VT.getVectorElementType() == MVT::i16) {
23244       assert((SrcVT.getVectorElementType() == MVT::f32 ||
23245               SrcVT.getVectorElementType() == MVT::f64) &&
23246              "Expected f32/f64 vector!");
            // Convert to i32 elements (same element count), then truncate to i16.
23247       MVT NVT = VT.changeVectorElementType(MVT::i32);
23248       if (IsStrict) {
23249         Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
23250                                    : ISD::STRICT_FP_TO_UINT,
23251                           dl, {NVT, MVT::Other}, {Chain, Src});
23252         Chain = Res.getValue(1);
23253       } else {
23254         Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
23255                           NVT, Src);
23256       }
23257
23258       // TODO: Need to add exception check code for strict FP.
23259       Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23260
23261       if (IsStrict)
23262         return DAG.getMergeValues({Res, Chain}, dl);
23263       return Res;
23264     }
23265
23266     // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
23267     if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
23268       assert(!IsSigned && "Expected unsigned conversion!");
23269       assert(Subtarget.useAVX512Regs() && "Requires avx512f");
23270       return Op;
23271     }
23272
23273     // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
23274     if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
23275         (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
23276         Subtarget.useAVX512Regs()) {
23277       assert(!IsSigned && "Expected unsigned conversion!");
23278       assert(!Subtarget.hasVLX() && "Unexpected features!");
23279       MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
23280       MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
23281       // Need to concat with zero vector for strict fp to avoid spurious
23282       // exceptions.
23283       // TODO: Should we just do this for non-strict as well?
23284       SDValue Tmp =
23285           IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23286       Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23287                         DAG.getIntPtrConstant(0, dl));
23288
23289       if (IsStrict) {
23290         Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
23291                           {Chain, Src});
23292         Chain = Res.getValue(1);
23293       } else {
23294         Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
23295       }
23296
            // Only the low lanes correspond to the original source.
23297       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23298                         DAG.getIntPtrConstant(0, dl));
23299
23300       if (IsStrict)
23301         return DAG.getMergeValues({Res, Chain}, dl);
23302       return Res;
23303     }
23304
23305     // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
23306     if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
23307         (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
23308         Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
23309       assert(!Subtarget.hasVLX() && "Unexpected features!");
            // The widened source always has eight elements so that the result is
            // v8i64.
23310       MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
23311       // Need to concat with zero vector for strict fp to avoid spurious
23312       // exceptions.
23313       // TODO: Should we just do this for non-strict as well?
23314       SDValue Tmp =
23315           IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23316       Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23317                         DAG.getIntPtrConstant(0, dl));
23318
23319       if (IsStrict) {
23320         Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23321                           {Chain, Src});
23322         Chain = Res.getValue(1);
23323       } else {
23324         Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
23325       }
23326
23327       Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23328                         DAG.getIntPtrConstant(0, dl));
23329
23330       if (IsStrict)
23331         return DAG.getMergeValues({Res, Chain}, dl);
23332       return Res;
23333     }
23334
23335     if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
23336       if (!Subtarget.hasVLX()) {
23337         // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
23338         // legalizer and then widened again by vector op legalization.
23339         if (!IsStrict)
23340           return SDValue();
23341
              // Strict nodes are widened here in one step, padding the source with
              // zeros up to v8f32.
23342         SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
23343         SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
23344                                   {Src, Zero, Zero, Zero});
23345         Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23346                           {Chain, Tmp});
              // NOTE(review): this declaration shadows the function-level Chain; it
              // is only used by the return just below.
23347         SDValue Chain = Tmp.getValue(1);
23348         Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
23349                           DAG.getIntPtrConstant(0, dl));
23350         return DAG.getMergeValues({Tmp, Chain}, dl);
23351       }
23352
            // With VLX: widen the source to v4f32 (upper half undef) and emit the
            // X86ISD conversion node directly with the v2i64 result type.
23353       assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
23354       SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23355                                 DAG.getUNDEF(MVT::v2f32));
23356       if (IsStrict) {
23357         unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
23358                                 : X86ISD::STRICT_CVTTP2UI;
23359         return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
23360       }
23361       unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
23362       return DAG.getNode(Opc, dl, VT, Tmp);
23363     }
23364
23365     // Generate optimized instructions for pre AVX512 unsigned conversions from
23366     // vXf32 to vXi32.
23367     if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
23368         (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
23369         (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
23370       assert(!IsSigned && "Expected unsigned conversion!");
23371       return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
23372     }
23373
          // No custom lowering for any other vector combination.
23374     return SDValue();
23375   }
23376
        // Everything below handles scalar results only.
23377   assert(!VT.isVector());
23378
23379   bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
23380
23381   if (!IsSigned && UseSSEReg) {
23382     // Conversions from f32/f64 with AVX512 should be legal.
23383     if (Subtarget.hasAVX512())
23384       return Op;
23385
23386     // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
23387     // behaves on out of range inputs to generate optimized conversions.
23388     if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
23389                       (VT == MVT::i64 && Subtarget.is64Bit()))) {
23390       unsigned DstBits = VT.getScalarSizeInBits();
            // FloatOffset is the value with only the top bit of VT set
            // (2^(DstBits-1)), converted to SrcVT.
23391       APInt UIntLimit = APInt::getSignMask(DstBits);
23392       SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
23393                                         DAG.getConstant(UIntLimit, dl, VT));
            // 128-bit vector type with SrcVT elements, used as the operand type of
            // the CVTTS2SI nodes below.
23394       MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
23395
23396       // Calculate the converted result for values in the range:
23397       // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
23398       // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
23399       SDValue Small =
23400           DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
23401                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
23402       SDValue Big = DAG.getNode(
23403           X86ISD::CVTTS2SI, dl, VT,
23404           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
23405                       DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
23406
23407       // The "CVTTS2SI" instruction conveniently sets the sign bit if
23408       // and only if the value was out of range. So we can use that
23409       // as our indicator that we rather use "Big" instead of "Small".
23410       //
23411       // Use "Small" if "IsOverflown" has all bits cleared
23412       // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
            // The arithmetic shift by DstBits-1 replicates Small's sign bit into
            // every bit of IsOverflown.
23413       SDValue IsOverflown = DAG.getNode(
23414           ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
23415       return DAG.getNode(ISD::OR, dl, VT, Small,
23416                          DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
23417     }
23418
23419     // Use default expansion for i64.
23420     if (VT == MVT::i64)
23421       return SDValue();
23422
23423     assert(VT == MVT::i32 && "Unexpected VT!");
23424
23425     // Promote i32 to i64 and use a signed operation on 64-bit targets.
23426     // FIXME: This does not generate an invalid exception if the input does not
23427     // fit in i32. PR44019
23428     if (Subtarget.is64Bit()) {
23429       if (IsStrict) {
23430         Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
23431                           {Chain, Src});
23432         Chain = Res.getValue(1);
23433       } else
23434         Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
23435
23436       Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23437       if (IsStrict)
23438         return DAG.getMergeValues({Res, Chain}, dl);
23439       return Res;
23440     }
23441
23442     // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
23443     // use fisttp which will be handled later.
23444     if (!Subtarget.hasSSE3())
23445       return SDValue();
23446   }
23447
23448   // Promote i16 to i32 if we can use a SSE operation or the type is f128.
23449   // FIXME: This does not generate an invalid exception if the input does not
23450   // fit in i16. PR44019
23451   if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
23452     assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
23453     if (IsStrict) {
23454       Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
23455                         {Chain, Src});
23456       Chain = Res.getValue(1);
23457     } else
23458       Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
23459
23460     Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23461     if (IsStrict)
23462       return DAG.getMergeValues({Res, Chain}, dl);
23463     return Res;
23464   }
23465
23466   // If this is a FP_TO_SINT using SSEReg we're done.
23467   if (UseSSEReg && IsSigned)
23468     return Op;
23469
23470   // fp128 needs to use a libcall.
23471   if (SrcVT == MVT::f128) {
23472     RTLIB::Libcall LC;
23473     if (IsSigned)
23474       LC = RTLIB::getFPTOSINT(SrcVT, VT);
23475     else
23476       LC = RTLIB::getFPTOUINT(SrcVT, VT);
23477
          // Tmp.first is the call's result value; Tmp.second is returned as the
          // output chain for strict nodes.
23478     MakeLibCallOptions CallOptions;
23479     std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
23480                                                   SDLoc(Op), Chain);
23481
23482     if (IsStrict)
23483       return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
23484
23485     return Tmp.first;
23486   }
23487
23488   // Fall back to X87.
        // Chain is handed to the helper and then used as the output chain for
        // strict nodes; presumably the helper updates it by reference (confirm
        // against FP_TO_INTHelper's signature).
23489   if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
23490     if (IsStrict)
23491       return DAG.getMergeValues({V, Chain}, dl);
23492     return V;
23493   }
23494
23495   llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
23496 }
23497
23498 SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
23499                                              SelectionDAG &DAG) const {
23500   SDValue Src = Op.getOperand(0);
23501   MVT SrcVT = Src.getSimpleValueType();
23502
23503   if (SrcVT == MVT::f16)
23504     return SDValue();
23505
23506   // If the source is in an SSE register, the node is Legal.
23507   if (isScalarFPTypeInSSEReg(SrcVT))
23508     return Op;
23509
23510   return LRINT_LLRINTHelper(Op.getNode(), DAG);
23511 }
23512
23513 SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
23514                                               SelectionDAG &DAG) const {
23515   EVT DstVT = N->getValueType(0);
23516   SDValue Src = N->getOperand(0);
23517   EVT SrcVT = Src.getValueType();
23518
23519   if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
23520     // f16 must be promoted before using the lowering in this routine.
23521     // fp128 does not use this lowering.
23522     return SDValue();
23523   }
23524
23525   SDLoc DL(N);
23526   SDValue Chain = DAG.getEntryNode();
23527
23528   bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
23529
23530   // If we're converting from SSE, the stack slot needs to hold both types.
23531   // Otherwise it only needs to hold the DstVT.
23532   EVT OtherVT = UseSSE ? SrcVT : DstVT;
23533   SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
23534   int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
23535   MachinePointerInfo MPI =
23536       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
23537
23538   if (UseSSE) {
23539     assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
23540     Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
23541     SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
23542     SDValue Ops[] = { Chain, StackPtr };
23543
23544     Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
23545                                   /*Align*/ std::nullopt,
23546                                   MachineMemOperand::MOLoad);
23547     Chain = Src.getValue(1);
23548   }
23549
23550   SDValue StoreOps[] = { Chain, Src, StackPtr };
23551   Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
23552                                   StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
23553                                   MachineMemOperand::MOStore);
23554
23555   return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
23556 }
23557
23558 SDValue
23559 X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
23560   // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
23561   // but making use of X86 specifics to produce better instruction sequences.
23562   SDNode *Node = Op.getNode();
23563   bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
23564   unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
23565   SDLoc dl(SDValue(Node, 0));
23566   SDValue Src = Node->getOperand(0);
23567
23568   // There are three types involved here: SrcVT is the source floating point
23569   // type, DstVT is the type of the result, and TmpVT is the result of the
23570   // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
23571   // DstVT).
23572   EVT SrcVT = Src.getValueType();
23573   EVT DstVT = Node->getValueType(0);
23574   EVT TmpVT = DstVT;
23575
23576   // This code is only for floats and doubles. Fall back to generic code for
23577   // anything else.
23578   if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
23579     return SDValue();
23580
23581   EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
23582   unsigned SatWidth = SatVT.getScalarSizeInBits();
23583   unsigned DstWidth = DstVT.getScalarSizeInBits();
23584   unsigned TmpWidth = TmpVT.getScalarSizeInBits();
23585   assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
23586          "Expected saturation width smaller than result width");
23587
23588   // Promote result of FP_TO_*INT to at least 32 bits.
23589   if (TmpWidth < 32) {
23590     TmpVT = MVT::i32;
23591     TmpWidth = 32;
23592   }
23593
23594   // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
23595   // us to use a native signed conversion instead.
23596   if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
23597     TmpVT = MVT::i64;
23598     TmpWidth = 64;
23599   }
23600
23601   // If the saturation width is smaller than the size of the temporary result,
23602   // we can always use signed conversion, which is native.
23603   if (SatWidth < TmpWidth)
23604     FpToIntOpcode = ISD::FP_TO_SINT;
23605
23606   // Determine minimum and maximum integer values and their corresponding
23607   // floating-point values.
23608   APInt MinInt, MaxInt;
23609   if (IsSigned) {
23610     MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
23611     MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
23612   } else {
23613     MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
23614     MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
23615   }
23616
23617   APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23618   APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23619
23620   APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
23621     MinInt, IsSigned, APFloat::rmTowardZero);
23622   APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
23623     MaxInt, IsSigned, APFloat::rmTowardZero);
23624   bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
23625                           && !(MaxStatus & APFloat::opStatus::opInexact);
23626
23627   SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
23628   SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
23629
23630   // If the integer bounds are exactly representable as floats, emit a
23631   // min+max+fptoi sequence. Otherwise use comparisons and selects.
23632   if (AreExactFloatBounds) {
23633     if (DstVT != TmpVT) {
23634       // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
23635       SDValue MinClamped = DAG.getNode(
23636         X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
23637       // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
23638       SDValue BothClamped = DAG.getNode(
23639         X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
23640       // Convert clamped value to integer.
23641       SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
23642
23643       // NaN will become INDVAL, with the top bit set and the rest zero.
23644       // Truncation will discard the top bit, resulting in zero.
23645       return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23646     }
23647
23648     // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
23649     SDValue MinClamped = DAG.getNode(
23650       X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
23651     // Clamp by MaxFloat from above. NaN cannot occur.
23652     SDValue BothClamped = DAG.getNode(
23653       X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
23654     // Convert clamped value to integer.
23655     SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
23656
23657     if (!IsSigned) {
23658       // In the unsigned case we're done, because we mapped NaN to MinFloat,
23659       // which is zero.
23660       return FpToInt;
23661     }
23662
23663     // Otherwise, select zero if Src is NaN.
23664     SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23665     return DAG.getSelectCC(
23666       dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
23667   }
23668
23669   SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
23670   SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
23671
23672   // Result of direct conversion, which may be selected away.
23673   SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
23674
23675   if (DstVT != TmpVT) {
23676     // NaN will become INDVAL, with the top bit set and the rest zero.
23677     // Truncation will discard the top bit, resulting in zero.
23678     FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23679   }
23680
23681   SDValue Select = FpToInt;
23682   // For signed conversions where we saturate to the same size as the
23683   // result type of the fptoi instructions, INDVAL coincides with integer
23684   // minimum, so we don't need to explicitly check it.
23685   if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
23686     // If Src ULT MinFloat, select MinInt. In particular, this also selects
23687     // MinInt if Src is NaN.
23688     Select = DAG.getSelectCC(
23689       dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
23690   }
23691
23692   // If Src OGT MaxFloat, select MaxInt.
23693   Select = DAG.getSelectCC(
23694     dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
23695
23696   // In the unsigned case we are done, because we mapped NaN to MinInt, which
23697   // is already zero. The promoted case was already handled above.
23698   if (!IsSigned || DstVT != TmpVT) {
23699     return Select;
23700   }
23701
23702   // Otherwise, select 0 if Src is NaN.
23703   SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23704   return DAG.getSelectCC(
23705     dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
23706 }
23707
23708 SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
23709   bool IsStrict = Op->isStrictFPOpcode();
23710
23711   SDLoc DL(Op);
23712   MVT VT = Op.getSimpleValueType();
23713   SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23714   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23715   MVT SVT = In.getSimpleValueType();
23716
23717   // Let f16->f80 get lowered to a libcall, except for darwin, where we should
23718   // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
23719   if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
23720                           !Subtarget.getTargetTriple().isOSDarwin()))
23721     return SDValue();
23722
23723   if (SVT == MVT::f16) {
23724     if (Subtarget.hasFP16())
23725       return Op;
23726
23727     if (VT != MVT::f32) {
23728       if (IsStrict)
23729         return DAG.getNode(
23730             ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
23731             {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
23732                                 {MVT::f32, MVT::Other}, {Chain, In})});
23733
23734       return DAG.getNode(ISD::FP_EXTEND, DL, VT,
23735                          DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
23736     }
23737
23738     if (!Subtarget.hasF16C()) {
23739       if (!Subtarget.getTargetTriple().isOSDarwin())
23740         return SDValue();
23741
23742       assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
23743
23744       // Need a libcall, but ABI for f16 is soft-float on MacOS.
23745       TargetLowering::CallLoweringInfo CLI(DAG);
23746       Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23747
23748       In = DAG.getBitcast(MVT::i16, In);
23749       TargetLowering::ArgListTy Args;
23750       TargetLowering::ArgListEntry Entry;
23751       Entry.Node = In;
23752       Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
23753       Entry.IsSExt = false;
23754       Entry.IsZExt = true;
23755       Args.push_back(Entry);
23756
23757       SDValue Callee = DAG.getExternalSymbol(
23758           getLibcallName(RTLIB::FPEXT_F16_F32),
23759           getPointerTy(DAG.getDataLayout()));
23760       CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23761           CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
23762           std::move(Args));
23763
23764       SDValue Res;
23765       std::tie(Res,Chain) = LowerCallTo(CLI);
23766       if (IsStrict)
23767         Res = DAG.getMergeValues({Res, Chain}, DL);
23768
23769       return Res;
23770     }
23771
23772     In = DAG.getBitcast(MVT::i16, In);
23773     In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
23774                      getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
23775                      DAG.getIntPtrConstant(0, DL));
23776     SDValue Res;
23777     if (IsStrict) {
23778       Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
23779                         {Chain, In});
23780       Chain = Res.getValue(1);
23781     } else {
23782       Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
23783                         DAG.getTargetConstant(4, DL, MVT::i32));
23784     }
23785     Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
23786                       DAG.getIntPtrConstant(0, DL));
23787     if (IsStrict)
23788       return DAG.getMergeValues({Res, Chain}, DL);
23789     return Res;
23790   }
23791
23792   if (!SVT.isVector())
23793     return Op;
23794
23795   if (SVT.getVectorElementType() == MVT::f16) {
23796     assert(Subtarget.hasF16C() && "Unexpected features!");
23797     if (SVT == MVT::v2f16)
23798       In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
23799                        DAG.getUNDEF(MVT::v2f16));
23800     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
23801                               DAG.getUNDEF(MVT::v4f16));
23802     if (IsStrict)
23803       return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23804                          {Op->getOperand(0), Res});
23805     return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23806   } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
23807     return Op;
23808   }
23809
23810   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
23811
23812   SDValue Res =
23813       DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
23814   if (IsStrict)
23815     return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23816                        {Op->getOperand(0), Res});
23817   return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23818 }
23819
23820 SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
23821   bool IsStrict = Op->isStrictFPOpcode();
23822
23823   SDLoc DL(Op);
23824   SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23825   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23826   MVT VT = Op.getSimpleValueType();
23827   MVT SVT = In.getSimpleValueType();
23828
23829   if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
23830     return SDValue();
23831
23832   if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
23833       !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
23834     if (!Subtarget.getTargetTriple().isOSDarwin())
23835       return SDValue();
23836
23837     // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
23838     TargetLowering::CallLoweringInfo CLI(DAG);
23839     Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23840
23841     TargetLowering::ArgListTy Args;
23842     TargetLowering::ArgListEntry Entry;
23843     Entry.Node = In;
23844     Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
23845     Entry.IsSExt = false;
23846     Entry.IsZExt = true;
23847     Args.push_back(Entry);
23848
23849     SDValue Callee = DAG.getExternalSymbol(
23850         getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
23851                                        : RTLIB::FPROUND_F32_F16),
23852         getPointerTy(DAG.getDataLayout()));
23853     CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23854         CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
23855         std::move(Args));
23856
23857     SDValue Res;
23858     std::tie(Res, Chain) = LowerCallTo(CLI);
23859
23860     Res = DAG.getBitcast(MVT::f16, Res);
23861
23862     if (IsStrict)
23863       Res = DAG.getMergeValues({Res, Chain}, DL);
23864
23865     return Res;
23866   }
23867
23868   if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
23869     if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
23870       return SDValue();
23871
23872     if (VT.isVector())
23873       return Op;
23874
23875     SDValue Res;
23876     SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
23877                                         MVT::i32);
23878     if (IsStrict) {
23879       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
23880                         DAG.getConstantFP(0, DL, MVT::v4f32), In,
23881                         DAG.getIntPtrConstant(0, DL));
23882       Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
23883                         {Chain, Res, Rnd});
23884       Chain = Res.getValue(1);
23885     } else {
23886       // FIXME: Should we use zeros for upper elements for non-strict?
23887       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
23888       Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
23889     }
23890
23891     Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
23892                       DAG.getIntPtrConstant(0, DL));
23893     Res = DAG.getBitcast(MVT::f16, Res);
23894
23895     if (IsStrict)
23896       return DAG.getMergeValues({Res, Chain}, DL);
23897
23898     return Res;
23899   }
23900
23901   return Op;
23902 }
23903
23904 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
23905   bool IsStrict = Op->isStrictFPOpcode();
23906   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23907   assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
23908          "Unexpected VT!");
23909
23910   SDLoc dl(Op);
23911   SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
23912                             DAG.getConstant(0, dl, MVT::v8i16), Src,
23913                             DAG.getIntPtrConstant(0, dl));
23914
23915   SDValue Chain;
23916   if (IsStrict) {
23917     Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
23918                       {Op.getOperand(0), Res});
23919     Chain = Res.getValue(1);
23920   } else {
23921     Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
23922   }
23923
23924   Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
23925                     DAG.getIntPtrConstant(0, dl));
23926
23927   if (IsStrict)
23928     return DAG.getMergeValues({Res, Chain}, dl);
23929
23930   return Res;
23931 }
23932
23933 static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
23934   bool IsStrict = Op->isStrictFPOpcode();
23935   SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23936   assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
23937          "Unexpected VT!");
23938
23939   SDLoc dl(Op);
23940   SDValue Res, Chain;
23941   if (IsStrict) {
23942     Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
23943                       DAG.getConstantFP(0, dl, MVT::v4f32), Src,
23944                       DAG.getIntPtrConstant(0, dl));
23945     Res = DAG.getNode(
23946         X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
23947         {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
23948     Chain = Res.getValue(1);
23949   } else {
23950     // FIXME: Should we use zeros for upper elements for non-strict?
23951     Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
23952     Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
23953                       DAG.getTargetConstant(4, dl, MVT::i32));
23954   }
23955
23956   Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
23957                     DAG.getIntPtrConstant(0, dl));
23958
23959   if (IsStrict)
23960     return DAG.getMergeValues({Res, Chain}, dl);
23961
23962   return Res;
23963 }
23964
23965 SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
23966                                            SelectionDAG &DAG) const {
23967   SDLoc DL(Op);
23968   MakeLibCallOptions CallOptions;
23969   RTLIB::Libcall LC =
23970       RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
23971   SDValue Res =
23972       makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
23973   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
23974                      DAG.getBitcast(MVT::i32, Res));
23975 }
23976
23977 /// Depending on uarch and/or optimizing for size, we might prefer to use a
23978 /// vector operation in place of the typical scalar operation.
23979 static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
23980                                          const X86Subtarget &Subtarget) {
23981   // If both operands have other uses, this is probably not profitable.
23982   SDValue LHS = Op.getOperand(0);
23983   SDValue RHS = Op.getOperand(1);
23984   if (!LHS.hasOneUse() && !RHS.hasOneUse())
23985     return Op;
23986
23987   // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
23988   bool IsFP = Op.getSimpleValueType().isFloatingPoint();
23989   if (IsFP && !Subtarget.hasSSE3())
23990     return Op;
23991   if (!IsFP && !Subtarget.hasSSSE3())
23992     return Op;
23993
23994   // Extract from a common vector.
23995   if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23996       RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23997       LHS.getOperand(0) != RHS.getOperand(0) ||
23998       !isa<ConstantSDNode>(LHS.getOperand(1)) ||
23999       !isa<ConstantSDNode>(RHS.getOperand(1)) ||
24000       !shouldUseHorizontalOp(true, DAG, Subtarget))
24001     return Op;
24002
24003   // Allow commuted 'hadd' ops.
24004   // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
24005   unsigned HOpcode;
24006   switch (Op.getOpcode()) {
24007     case ISD::ADD: HOpcode = X86ISD::HADD; break;
24008     case ISD::SUB: HOpcode = X86ISD::HSUB; break;
24009     case ISD::FADD: HOpcode = X86ISD::FHADD; break;
24010     case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
24011     default:
24012       llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
24013   }
24014   unsigned LExtIndex = LHS.getConstantOperandVal(1);
24015   unsigned RExtIndex = RHS.getConstantOperandVal(1);
24016   if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
24017       (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
24018     std::swap(LExtIndex, RExtIndex);
24019
24020   if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
24021     return Op;
24022
24023   SDValue X = LHS.getOperand(0);
24024   EVT VecVT = X.getValueType();
24025   unsigned BitWidth = VecVT.getSizeInBits();
24026   unsigned NumLanes = BitWidth / 128;
24027   unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
24028   assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
24029          "Not expecting illegal vector widths here");
24030
24031   // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
24032   // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
24033   SDLoc DL(Op);
24034   if (BitWidth == 256 || BitWidth == 512) {
24035     unsigned LaneIdx = LExtIndex / NumEltsPerLane;
24036     X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
24037     LExtIndex %= NumEltsPerLane;
24038   }
24039
24040   // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
24041   // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
24042   // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
24043   // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
24044   SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
24045   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
24046                      DAG.getIntPtrConstant(LExtIndex / 2, DL));
24047 }
24048
24049 /// Depending on uarch and/or optimizing for size, we might prefer to use a
24050 /// vector operation in place of the typical scalar operation.
24051 SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
24052   assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
24053          "Only expecting float/double");
24054   return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
24055 }
24056
24057 /// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
24058 /// This mode isn't supported in hardware on X86. But as long as we aren't
24059 /// compiling with trapping math, we can emulate this with
24060 /// trunc(X + copysign(nextafter(0.5, 0.0), X)).
24061 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
24062   SDValue N0 = Op.getOperand(0);
24063   SDLoc dl(Op);
24064   MVT VT = Op.getSimpleValueType();
24065
24066   // N0 += copysign(nextafter(0.5, 0.0), N0)
24067   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
24068   bool Ignored;
24069   APFloat Point5Pred = APFloat(0.5f);
24070   Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
24071   Point5Pred.next(/*nextDown*/true);
24072
24073   SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
24074                               DAG.getConstantFP(Point5Pred, dl, VT), N0);
24075   N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
24076
24077   // Truncate the result to remove fraction.
24078   return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
24079 }
24080
24081 /// The only differences between FABS and FNEG are the mask and the logic op.
24082 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
24083 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
24084   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
24085          "Wrong opcode for lowering FABS or FNEG.");
24086
24087   bool IsFABS = (Op.getOpcode() == ISD::FABS);
24088
24089   // If this is a FABS and it has an FNEG user, bail out to fold the combination
24090   // into an FNABS. We'll lower the FABS after that if it is still in use.
24091   if (IsFABS)
24092     for (SDNode *User : Op->uses())
24093       if (User->getOpcode() == ISD::FNEG)
24094         return Op;
24095
24096   SDLoc dl(Op);
24097   MVT VT = Op.getSimpleValueType();
24098
24099   bool IsF128 = (VT == MVT::f128);
24100   assert(VT.isFloatingPoint() && VT != MVT::f80 &&
24101          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
24102          "Unexpected type in LowerFABSorFNEG");
24103
24104   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
24105   // decide if we should generate a 16-byte constant mask when we only need 4 or
24106   // 8 bytes for the scalar case.
24107
24108   // There are no scalar bitwise logical SSE/AVX instructions, so we
24109   // generate a 16-byte vector constant and logic op even for the scalar case.
24110   // Using a 16-byte mask allows folding the load of the mask with
24111   // the logic op, so it can save (~4 bytes) on code size.
24112   bool IsFakeVector = !VT.isVector() && !IsF128;
24113   MVT LogicVT = VT;
24114   if (IsFakeVector)
24115     LogicVT = (VT == MVT::f64)   ? MVT::v2f64
24116               : (VT == MVT::f32) ? MVT::v4f32
24117                                  : MVT::v8f16;
24118
24119   unsigned EltBits = VT.getScalarSizeInBits();
24120   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
24121   APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
24122                            APInt::getSignMask(EltBits);
24123   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
24124   SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
24125
24126   SDValue Op0 = Op.getOperand(0);
24127   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
24128   unsigned LogicOp = IsFABS  ? X86ISD::FAND :
24129                      IsFNABS ? X86ISD::FOR  :
24130                                X86ISD::FXOR;
24131   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
24132
24133   if (VT.isVector() || IsF128)
24134     return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
24135
24136   // For the scalar case extend to a 128-bit vector, perform the logic op,
24137   // and extract the scalar result back out.
24138   Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
24139   SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
24140   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
24141                      DAG.getIntPtrConstant(0, dl));
24142 }
24143
24144 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
24145   SDValue Mag = Op.getOperand(0);
24146   SDValue Sign = Op.getOperand(1);
24147   SDLoc dl(Op);
24148
24149   // If the sign operand is smaller, extend it first.
24150   MVT VT = Op.getSimpleValueType();
24151   if (Sign.getSimpleValueType().bitsLT(VT))
24152     Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
24153
24154   // And if it is bigger, shrink it first.
24155   if (Sign.getSimpleValueType().bitsGT(VT))
24156     Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
24157                        DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
24158
24159   // At this point the operands and the result should have the same
24160   // type, and that won't be f80 since that is not custom lowered.
24161   bool IsF128 = (VT == MVT::f128);
24162   assert(VT.isFloatingPoint() && VT != MVT::f80 &&
24163          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
24164          "Unexpected type in LowerFCOPYSIGN");
24165
24166   const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
24167
24168   // Perform all scalar logic operations as 16-byte vectors because there are no
24169   // scalar FP logic instructions in SSE.
24170   // TODO: This isn't necessary. If we used scalar types, we might avoid some
24171   // unnecessary splats, but we might miss load folding opportunities. Should
24172   // this decision be based on OptimizeForSize?
24173   bool IsFakeVector = !VT.isVector() && !IsF128;
24174   MVT LogicVT = VT;
24175   if (IsFakeVector)
24176     LogicVT = (VT == MVT::f64)   ? MVT::v2f64
24177               : (VT == MVT::f32) ? MVT::v4f32
24178                                  : MVT::v8f16;
24179
24180   // The mask constants are automatically splatted for vector types.
24181   unsigned EltSizeInBits = VT.getScalarSizeInBits();
24182   SDValue SignMask = DAG.getConstantFP(
24183       APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
24184   SDValue MagMask = DAG.getConstantFP(
24185       APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
24186
24187   // First, clear all bits but the sign bit from the second operand (sign).
24188   if (IsFakeVector)
24189     Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
24190   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
24191
24192   // Next, clear the sign bit from the first operand (magnitude).
24193   // TODO: If we had general constant folding for FP logic ops, this check
24194   // wouldn't be necessary.
24195   SDValue MagBits;
24196   if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
24197     APFloat APF = Op0CN->getValueAPF();
24198     APF.clearSign();
24199     MagBits = DAG.getConstantFP(APF, dl, LogicVT);
24200   } else {
24201     // If the magnitude operand wasn't a constant, we need to AND out the sign.
24202     if (IsFakeVector)
24203       Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
24204     MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
24205   }
24206
24207   // OR the magnitude value with the sign bit.
24208   SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
24209   return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
24210                                           DAG.getIntPtrConstant(0, dl));
24211 }
24212
24213 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
24214   SDValue N0 = Op.getOperand(0);
24215   SDLoc dl(Op);
24216   MVT VT = Op.getSimpleValueType();
24217
24218   MVT OpVT = N0.getSimpleValueType();
24219   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
24220          "Unexpected type for FGETSIGN");
24221
24222   // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
24223   MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
24224   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
24225   Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
24226   Res = DAG.getZExtOrTrunc(Res, dl, VT);
24227   Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
24228   return Res;
24229 }
24230
24231 /// Helper for attempting to create a X86ISD::BT node.
24232 static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
24233   // If Src is i8, promote it to i32 with any_extend.  There is no i8 BT
24234   // instruction.  Since the shift amount is in-range-or-undefined, we know
24235   // that doing a bittest on the i32 value is ok.  We extend to i32 because
24236   // the encoding for the i16 version is larger than the i32 version.
24237   // Also promote i16 to i32 for performance / code size reason.
24238   if (Src.getValueType().getScalarSizeInBits() < 32)
24239     Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
24240
24241   // No legal type found, give up.
24242   if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
24243     return SDValue();
24244
24245   // See if we can use the 32-bit instruction instead of the 64-bit one for a
24246   // shorter encoding. Since the former takes the modulo 32 of BitNo and the
24247   // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
24248   // known to be zero.
24249   if (Src.getValueType() == MVT::i64 &&
24250       DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
24251     Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
24252
24253   // If the operand types disagree, extend the shift amount to match.  Since
24254   // BT ignores high bits (like shifts) we can use anyextend.
24255   if (Src.getValueType() != BitNo.getValueType()) {
24256     // Peek through a mask/modulo operation.
24257     // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
24258     // we probably need a better IsDesirableToPromoteOp to handle this as well.
24259     if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
24260       BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
24261                           DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24262                                       BitNo.getOperand(0)),
24263                           DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24264                                       BitNo.getOperand(1)));
24265     else
24266       BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
24267   }
24268
24269   return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
24270 }
24271
24272 /// Helper for creating a X86ISD::SETCC node.
24273 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
24274                         SelectionDAG &DAG) {
24275   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
24276                      DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
24277 }
24278
24279 /// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
24280 /// recognizable memcmp expansion.
24281 static bool isOrXorXorTree(SDValue X, bool Root = true) {
24282   if (X.getOpcode() == ISD::OR)
24283     return isOrXorXorTree(X.getOperand(0), false) &&
24284            isOrXorXorTree(X.getOperand(1), false);
24285   if (Root)
24286     return false;
24287   return X.getOpcode() == ISD::XOR;
24288 }
24289
24290 /// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
24291 /// expansion.
24292 template <typename F>
24293 static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
24294                                 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
24295   SDValue Op0 = X.getOperand(0);
24296   SDValue Op1 = X.getOperand(1);
24297   if (X.getOpcode() == ISD::OR) {
24298     SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24299     SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24300     if (VecVT != CmpVT)
24301       return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
24302     if (HasPT)
24303       return DAG.getNode(ISD::OR, DL, VecVT, A, B);
24304     return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
24305   }
24306   if (X.getOpcode() == ISD::XOR) {
24307     SDValue A = SToV(Op0);
24308     SDValue B = SToV(Op1);
24309     if (VecVT != CmpVT)
24310       return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
24311     if (HasPT)
24312       return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
24313     return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
24314   }
24315   llvm_unreachable("Impossible");
24316 }
24317
/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
///
/// \param VT        Result type of the comparison being replaced.
/// \param X, Y      The two scalar integer operands being compared.
/// \param CC        Must be ISD::SETEQ or ISD::SETNE.
/// \param Subtarget Queried for the available vector features.
/// \returns The replacement value of type \p VT, or an empty SDValue when the
///          operands, function attributes, or subtarget do not qualify.
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
                                               ISD::CondCode CC,
                                               const SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

  // We're looking for an oversized integer equality comparison.
  EVT OpVT = X.getValueType();
  unsigned OpSize = OpVT.getSizeInBits();
  if (!OpVT.isScalarInteger() || OpSize < 128)
    return SDValue();

  // Ignore a comparison with zero because that gets special treatment in
  // EmitTest(). But make an exception for the special case of a pair of
  // logically-combined vector-sized operands compared to zero. This pattern may
  // be generated by the memcmp expansion pass with oversized integer compares
  // (see PR33325).
  bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
  if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
    return SDValue();

  // Don't perform this combine if constructing the vector will be expensive.
  // "Cheap" here means: a constant, something that is already a vector behind
  // bitcasts, or a load.
  auto IsVectorBitCastCheap = [](SDValue X) {
    X = peekThroughBitcasts(X);
    return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
           X.getOpcode() == ISD::LOAD;
  };
  if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
      !IsOrXorXorTreeCCZero)
    return SDValue();

  // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
  // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
  // Otherwise use PCMPEQ (plus AND) and mask testing.
  bool NoImplicitFloatOps =
      DAG.getMachineFunction().getFunction().hasFnAttribute(
          Attribute::NoImplicitFloat);
  if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
      ((OpSize == 128 && Subtarget.hasSSE2()) ||
       (OpSize == 256 && Subtarget.hasAVX()) ||
       (OpSize == 512 && Subtarget.useAVX512Regs()))) {
    bool HasPT = Subtarget.hasSSE41();

    // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
    // vector registers are essentially free. (Technically, widening registers
    // prevents load folding, but the tradeoff is worth it.)
    bool PreferKOT = Subtarget.preferMaskRegisters();
    bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;

    // VecVT:  type the compare/logic operands are given in.
    // CmpVT:  type of the compare result; differs from VecVT exactly when a
    //         mask-register (vXi1) compare is used.
    // CastVT: type the scalar operand is bitcast to before any widening.
    EVT VecVT = MVT::v16i8;
    EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
    if (OpSize == 256) {
      VecVT = MVT::v32i8;
      CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
    }
    EVT CastVT = VecVT;
    bool NeedsAVX512FCast = false;
    if (OpSize == 512 || NeedZExt) {
      if (Subtarget.hasBWI()) {
        // With BWI, compare bytes in a 512-bit vector.
        VecVT = MVT::v64i8;
        CmpVT = MVT::v64i1;
        if (OpSize == 512)
          CastVT = VecVT;
      } else {
        // Without BWI, compare 32-bit elements instead, so the operand has to
        // be cast to a vXi32 type of its own width.
        VecVT = MVT::v16i32;
        CmpVT = MVT::v16i1;
        CastVT = OpSize == 512   ? VecVT
                 : OpSize == 256 ? MVT::v8i32
                                 : MVT::v4i32;
        NeedsAVX512FCast = true;
      }
    }

    // Convert one scalar operand into a VecVT (or CastVT) vector. A
    // ZERO_EXTEND from a 128- or 256-bit value narrower than OpSize is looked
    // through: the narrow value is bitcast and then placed at index 0 of an
    // all-zero VecVT vector. The same zero-widening is applied whenever
    // NeedZExt is set.
    auto ScalarToVector = [&](SDValue X) -> SDValue {
      bool TmpZext = false;
      EVT TmpCastVT = CastVT;
      if (X.getOpcode() == ISD::ZERO_EXTEND) {
        SDValue OrigX = X.getOperand(0);
        unsigned OrigSize = OrigX.getScalarValueSizeInBits();
        if (OrigSize < OpSize) {
          if (OrigSize == 128) {
            TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
            X = OrigX;
            TmpZext = true;
          } else if (OrigSize == 256) {
            TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
            X = OrigX;
            TmpZext = true;
          }
        }
      }
      X = DAG.getBitcast(TmpCastVT, X);
      if (!NeedZExt && !TmpZext)
        return X;
      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
                         DAG.getConstant(0, DL, VecVT), X,
                         DAG.getVectorIdxConstant(0, DL));
    };

    SDValue Cmp;
    if (IsOrXorXorTreeCCZero) {
      // This is a bitwise-combined equality comparison of 2 pairs of vectors:
      // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
      // Use 2 vector equality compares and 'and' the results before doing a
      // MOVMSK.
      Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
    } else {
      SDValue VecX = ScalarToVector(X);
      SDValue VecY = ScalarToVector(Y);
      if (VecVT != CmpVT) {
        // Mask-register compare: a set bit marks a mismatching element.
        Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
      } else if (HasPT) {
        // PTEST path: the XOR is all-zero exactly when the operands match.
        Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
      } else {
        // MOVMSK path: compare for equality, checked against 0xFFFF below.
        Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
      }
    }
    // AVX512 should emit a setcc that will lower to kortest.
    if (VecVT != CmpVT) {
      // Reinterpret the vXi1 mask as a scalar integer of the same bit count
      // and compare that against zero with the original predicate.
      EVT KRegVT = CmpVT == MVT::v64i1   ? MVT::i64
                   : CmpVT == MVT::v32i1 ? MVT::i32
                                         : MVT::i16;
      return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
                          DAG.getConstant(0, DL, KRegVT), CC);
    }
    if (HasPT) {
      // PTEST of the XOR result with itself; COND_E/COND_NE then maps
      // directly onto SETEQ/SETNE.
      SDValue BCCmp =
          DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
      SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
      X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
      SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
      return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
    }
    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
    assert(Cmp.getValueType() == MVT::v16i8 &&
           "Non 128-bit vector on pre-SSE41 target");
    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
    SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
  }

  // Soft-float, noimplicitfloat, or no suitable vector ISA for this width.
  return SDValue();
}
24466
24467 /// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
24468 /// style scalarized (associative) reduction patterns. Partial reductions
24469 /// are supported when the pointer SrcMask is non-null.
24470 /// TODO - move this to SelectionDAG?
24471 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
24472                                  SmallVectorImpl<SDValue> &SrcOps,
24473                                  SmallVectorImpl<APInt> *SrcMask = nullptr) {
24474   SmallVector<SDValue, 8> Opnds;
24475   DenseMap<SDValue, APInt> SrcOpMap;
24476   EVT VT = MVT::Other;
24477
24478   // Recognize a special case where a vector is casted into wide integer to
24479   // test all 0s.
24480   assert(Op.getOpcode() == unsigned(BinOp) &&
24481          "Unexpected bit reduction opcode");
24482   Opnds.push_back(Op.getOperand(0));
24483   Opnds.push_back(Op.getOperand(1));
24484
24485   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
24486     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
24487     // BFS traverse all BinOp operands.
24488     if (I->getOpcode() == unsigned(BinOp)) {
24489       Opnds.push_back(I->getOperand(0));
24490       Opnds.push_back(I->getOperand(1));
24491       // Re-evaluate the number of nodes to be traversed.
24492       e += 2; // 2 more nodes (LHS and RHS) are pushed.
24493       continue;
24494     }
24495
24496     // Quit if a non-EXTRACT_VECTOR_ELT
24497     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24498       return false;
24499
24500     // Quit if without a constant index.
24501     auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
24502     if (!Idx)
24503       return false;
24504
24505     SDValue Src = I->getOperand(0);
24506     DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
24507     if (M == SrcOpMap.end()) {
24508       VT = Src.getValueType();
24509       // Quit if not the same type.
24510       if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
24511         return false;
24512       unsigned NumElts = VT.getVectorNumElements();
24513       APInt EltCount = APInt::getZero(NumElts);
24514       M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
24515       SrcOps.push_back(Src);
24516     }
24517
24518     // Quit if element already used.
24519     unsigned CIdx = Idx->getZExtValue();
24520     if (M->second[CIdx])
24521       return false;
24522     M->second.setBit(CIdx);
24523   }
24524
24525   if (SrcMask) {
24526     // Collect the source partial masks.
24527     for (SDValue &SrcOp : SrcOps)
24528       SrcMask->push_back(SrcOpMap[SrcOp]);
24529   } else {
24530     // Quit if not all elements are used.
24531     for (const auto &I : SrcOpMap)
24532       if (!I.second.isAllOnes())
24533         return false;
24534   }
24535
24536   return true;
24537 }
24538
24539 // Helper function for comparing all bits of two vectors.
24540 static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
24541                                    ISD::CondCode CC, const APInt &OriginalMask,
24542                                    const X86Subtarget &Subtarget,
24543                                    SelectionDAG &DAG, X86::CondCode &X86CC) {
24544   EVT VT = LHS.getValueType();
24545   unsigned ScalarSize = VT.getScalarSizeInBits();
24546   if (OriginalMask.getBitWidth() != ScalarSize) {
24547     assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
24548     return SDValue();
24549   }
24550
24551   // Quit if not convertable to legal scalar or 128/256-bit vector.
24552   if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24553     return SDValue();
24554
24555   // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
24556   if (VT.isFloatingPoint())
24557     return SDValue();
24558
24559   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24560   X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
24561
24562   APInt Mask = OriginalMask;
24563
24564   auto MaskBits = [&](SDValue Src) {
24565     if (Mask.isAllOnes())
24566       return Src;
24567     EVT SrcVT = Src.getValueType();
24568     SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
24569     return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
24570   };
24571
24572   // For sub-128-bit vector, cast to (legal) integer and compare with zero.
24573   if (VT.getSizeInBits() < 128) {
24574     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
24575     if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
24576       if (IntVT != MVT::i64)
24577         return SDValue();
24578       auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
24579                                       MVT::i32, MVT::i32);
24580       auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
24581                                       MVT::i32, MVT::i32);
24582       SDValue Lo =
24583           DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
24584       SDValue Hi =
24585           DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
24586       return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24587                          DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
24588                          DAG.getConstant(0, DL, MVT::i32));
24589     }
24590     return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24591                        DAG.getBitcast(IntVT, MaskBits(LHS)),
24592                        DAG.getBitcast(IntVT, MaskBits(RHS)));
24593   }
24594
24595   // Without PTEST, a masked v2i64 or-reduction is not faster than
24596   // scalarization.
24597   bool UseKORTEST = Subtarget.useAVX512Regs();
24598   bool UsePTEST = Subtarget.hasSSE41();
24599   if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
24600     return SDValue();
24601
24602   // Split down to 128/256/512-bit vector.
24603   unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
24604
24605   // If the input vector has vector elements wider than the target test size,
24606   // then cast to <X x i64> so it will safely split.
24607   if (ScalarSize > TestSize) {
24608     if (!Mask.isAllOnes())
24609       return SDValue();
24610     VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
24611     LHS = DAG.getBitcast(VT, LHS);
24612     RHS = DAG.getBitcast(VT, RHS);
24613     Mask = APInt::getAllOnes(64);
24614   }
24615
24616   if (VT.getSizeInBits() > TestSize) {
24617     KnownBits KnownRHS = DAG.computeKnownBits(RHS);
24618     if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
24619       // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
24620       while (VT.getSizeInBits() > TestSize) {
24621         auto Split = DAG.SplitVector(LHS, DL);
24622         VT = Split.first.getValueType();
24623         LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24624       }
24625       RHS = DAG.getAllOnesConstant(DL, VT);
24626     } else if (!UsePTEST && !KnownRHS.isZero()) {
24627       // MOVMSK Special Case:
24628       // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
24629       MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
24630       VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
24631       LHS = DAG.getBitcast(VT, MaskBits(LHS));
24632       RHS = DAG.getBitcast(VT, MaskBits(RHS));
24633       EVT BoolVT = VT.changeVectorElementType(MVT::i1);
24634       SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
24635       V = DAG.getSExtOrTrunc(V, DL, VT);
24636       while (VT.getSizeInBits() > TestSize) {
24637         auto Split = DAG.SplitVector(V, DL);
24638         VT = Split.first.getValueType();
24639         V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24640       }
24641       V = DAG.getNOT(DL, V, VT);
24642       V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24643       return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24644                          DAG.getConstant(0, DL, MVT::i32));
24645     } else {
24646       // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
24647       SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24648       while (VT.getSizeInBits() > TestSize) {
24649         auto Split = DAG.SplitVector(V, DL);
24650         VT = Split.first.getValueType();
24651         V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
24652       }
24653       LHS = V;
24654       RHS = DAG.getConstant(0, DL, VT);
24655     }
24656   }
24657
24658   if (UseKORTEST && VT.is512BitVector()) {
24659     MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
24660     MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
24661     LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24662     RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24663     SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
24664     return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
24665   }
24666
24667   if (UsePTEST) {
24668     MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
24669     LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24670     RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24671     SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
24672     return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
24673   }
24674
24675   assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
24676   MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
24677   LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
24678   RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
24679   SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
24680   V = DAG.getNOT(DL, V, MaskVT);
24681   V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24682   return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24683                      DAG.getConstant(0, DL, MVT::i32));
24684 }
24685
// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fallback
// to CMP(MOVMSK(PCMPEQB(X,Y))).
//
// LHS is the value being compared and RHS must be a zero or all-ones constant;
// any other RHS yields an empty SDValue. CC must be SETEQ or SETNE (asserted).
// Every successful match is delegated to LowerVectorAllEqual, to which X86CC
// is forwarded. An empty SDValue is returned when no pattern matches.
static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
                                       ISD::CondCode CC, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG,
                                       X86::CondCode &X86CC) {
  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");

  // Only comparisons against zero (OR / any-of reductions) or all-ones
  // (AND / all-of reductions) are handled.
  bool CmpNull = isNullConstant(RHS);
  bool CmpAllOnes = isAllOnesConstant(RHS);
  if (!CmpNull && !CmpAllOnes)
    return SDValue();

  // Requires SSE2, and the compared value must have exactly one use.
  SDValue Op = LHS;
  if (!Subtarget.hasSSE2() || !Op->hasOneUse())
    return SDValue();

  // Check whether we're masking/truncating an OR-reduction result, in which
  // case track the masked bits.
  // TODO: Add CmpAllOnes support.
  APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
  if (CmpNull) {
    switch (Op.getOpcode()) {
    case ISD::TRUNCATE: {
      // Look at the wider source, but only the low bits that survive the
      // truncation remain significant.
      SDValue Src = Op.getOperand(0);
      Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
                                  Op.getScalarValueSizeInBits());
      Op = Src;
      break;
    }
    case ISD::AND: {
      // A constant AND mask becomes the tracked mask; with a non-constant
      // second operand both Op and Mask are left untouched.
      if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
        Mask = Cst->getAPIntValue();
        Op = Op.getOperand(0);
      }
      break;
    }
    }
  }

  // The reduction operator that pairs with the constant being compared.
  ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;

  // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
  // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
  SmallVector<SDValue, 8> VecIns;
  if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
    EVT VT = VecIns[0].getValueType();
    assert(llvm::all_of(VecIns,
                        [VT](SDValue V) { return VT == V.getValueType(); }) &&
           "Reduction source vector mismatch");

    // Quit if not splittable to scalar/128/256/512-bit vector.
    if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
      return SDValue();

    // If more than one full vector is evaluated, AND/OR them first before
    // PTEST.
    // Slot advances past the two entries just consumed while e grows by the
    // one entry appended, so the loop ends once a single unconsumed entry
    // (the last element of VecIns) remains.
    for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
         Slot += 2, e += 1) {
      // Each iteration will AND/OR 2 nodes and append the result until there is
      // only 1 node left, i.e. the final value of all vectors.
      SDValue LHS = VecIns[Slot];
      SDValue RHS = VecIns[Slot + 1];
      VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
    }

    // Compare the combined vector against a zero / all-ones vector of the
    // same type, restricted to the tracked Mask bits.
    return LowerVectorAllEqual(DL, VecIns.back(),
                               CmpNull ? DAG.getConstant(0, DL, VT)
                                       : DAG.getAllOnesConstant(DL, VT),
                               CC, Mask, Subtarget, DAG, X86CC);
  }

  // Match icmp(reduce_or(X),0) anyof reduction patterns.
  // Match icmp(reduce_and(X),-1) allof reduction patterns.
  if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    ISD::NodeType BinOp;
    if (SDValue Match =
            DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
      EVT MatchVT = Match.getValueType();
      return LowerVectorAllEqual(DL, Match,
                                 CmpNull ? DAG.getConstant(0, DL, MatchVT)
                                         : DAG.getAllOnesConstant(DL, MatchVT),
                                 CC, Mask, Subtarget, DAG, X86CC);
    }
  }

  // The remaining patterns are only tried when every bit is significant,
  // i.e. Mask is still all-ones.
  if (Mask.isAllOnes()) {
    assert(!Op.getValueType().isVector() &&
           "Illegal vector type for reduction pattern");
    // Look for a scalar that is really a bitcast of a fixed-length vXi1 value.
    SDValue Src = peekThroughBitcasts(Op);
    if (Src.getValueType().isFixedLengthVector() &&
        Src.getValueType().getScalarType() == MVT::i1) {
      // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
      // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
      if (Src.getOpcode() == ISD::SETCC) {
        SDValue LHS = Src.getOperand(0);
        SDValue RHS = Src.getOperand(1);
        EVT LHSVT = LHS.getValueType();
        ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
        // The inner compare must be SETNE when testing against zero and SETEQ
        // when testing against all-ones, and its operand type must have a
        // power-of-two bit size.
        if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
            llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
          APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
          return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
                                     X86CC);
        }
      }
      // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
      // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
      // Peek through truncation, mask the LSB and compare against zero/LSB.
      if (Src.getOpcode() == ISD::TRUNCATE) {
        SDValue Inner = Src.getOperand(0);
        EVT InnerVT = Inner.getValueType();
        if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
          unsigned BW = InnerVT.getScalarSizeInBits();
          // Only bit 0 of each element is significant; the reference value is
          // 0 for the zero test and 1 (the same LSB mask) otherwise.
          APInt SrcMask = APInt(BW, 1);
          APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
          return LowerVectorAllEqual(DL, Inner,
                                     DAG.getConstant(Cmp, DL, InnerVT), CC,
                                     SrcMask, Subtarget, DAG, X86CC);
        }
      }
    }
  }

  return SDValue();
}
24813
24814 /// return true if \c Op has a use that doesn't just read flags.
24815 static bool hasNonFlagsUse(SDValue Op) {
24816   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
24817        ++UI) {
24818     SDNode *User = *UI;
24819     unsigned UOpNo = UI.getOperandNo();
24820     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
24821       // Look pass truncate.
24822       UOpNo = User->use_begin().getOperandNo();
24823       User = *User->use_begin();
24824     }
24825
24826     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
24827         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
24828       return true;
24829   }
24830   return false;
24831 }
24832
24833 // Transform to an x86-specific ALU node with flags if there is a chance of
24834 // using an RMW op or only the flags are used. Otherwise, leave
24835 // the node alone and emit a 'cmp' or 'test' instruction.
24836 static bool isProfitableToUseFlagOp(SDValue Op) {
24837   for (SDNode *U : Op->uses())
24838     if (U->getOpcode() != ISD::CopyToReg &&
24839         U->getOpcode() != ISD::SETCC &&
24840         U->getOpcode() != ISD::STORE)
24841       return false;
24842
24843   return true;
24844 }
24845
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
///
/// \p Op is the value whose flags are wanted and \p X86CC is the condition the
/// flags will be read with; it decides whether CF/OF must be valid. The
/// returned value is either a fresh X86ISD::CMP of \p Op against zero, or
/// result #1 of a flag-producing X86ISD arithmetic node. When a generic
/// arithmetic node is converted, all uses of its value result are redirected
/// to the new node. \p Subtarget is not consulted in this function body.
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
                        SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO: {
    // Check if we really need to set the
    // Overflow flag. If NoSignedWrap is present
    // that is not actually needed.
    switch (Op->getOpcode()) {
    case ISD::ADD:
    case ISD::SUB:
    case ISD::MUL:
    case ISD::SHL:
      if (Op.getNode()->getFlags().hasNoSignedWrap())
        break;
      [[fallthrough]];
    default:
      NeedOF = true;
      break;
    }
    break;
  }
  }
  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  // Opcode stays 0 unless one of the cases below picks a flag-producing
  // replacement; NumOperands is how many of Op's operands it will take.
  unsigned Opcode = 0;
  unsigned NumOperands = 0;

  SDValue ArithOp = Op;

  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
  // which may be the result of a CAST.  We use the variable 'Op', which is the
  // non-casted variable when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::AND:
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better.
    if (!hasNonFlagsUse(Op))
      break;

    [[fallthrough]];
  case ISD::ADD:
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    if (!isProfitableToUseFlagOp(Op))
      break;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::ADD: Opcode = X86ISD::ADD; break;
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR:  Opcode = X86ISD::OR;  break;
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // Op is already one of the X86ISD arithmetic nodes; reuse its result #1
    // instead of emitting anything new.
    return SDValue(Op.getNode(), 1);
  case ISD::SSUBO:
  case ISD::USUBO: {
    // USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag.
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
                       Op->getOperand(1)).getValue(1);
  }
  default:
    break;
  }

  if (Opcode == 0) {
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  // Rebuild the generic op as its X86ISD counterpart with two results (the
  // original value type plus an i32), using the first NumOperands operands.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
  // Point every user of the old value result at the new node, then hand back
  // the new node's second result.
  DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
  return SDValue(New.getNode(), 1);
}
24953
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
///
/// \p X86CC is the condition the resulting flags will be read with. A compare
/// against a zero constant is handed to EmitTest. Otherwise the operands must
/// be i8/i16/i32/i64 (asserted) and the result is value #1 of an X86ISD::ADD
/// or X86ISD::SUB node, after optionally widening i16 or narrowing i64
/// operands.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                       const SDLoc &dl, SelectionDAG &DAG,
                       const X86Subtarget &Subtarget) {
  if (isNullConstant(Op1))
    return EmitTest(Op0, X86CC, dl, DAG, Subtarget);

  EVT CmpVT = Op0.getValueType();

  assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
          CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");

  // Only promote the compare up to I32 if it is a 16 bit operation
  // with an immediate.  16 bit immediates are to be avoided.
  // The promotion is skipped on Atom subtargets and in minsize functions.
  if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
      !DAG.getMachineFunction().getFunction().hasMinSize()) {
    ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
    ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
    // Don't do this if the immediate can fit in 8-bits.
    if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
        (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
      // Default extension follows the signedness of the condition code.
      unsigned ExtendOp =
          isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
        // For equality comparisons try to use SIGN_EXTEND if the input was
        // truncate from something with enough sign bits.
        if (Op0.getOpcode() == ISD::TRUNCATE) {
          if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
            ExtendOp = ISD::SIGN_EXTEND;
        } else if (Op1.getOpcode() == ISD::TRUNCATE) {
          if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
            ExtendOp = ISD::SIGN_EXTEND;
        }
      }

      // Both operands are widened with the same extension kind.
      CmpVT = MVT::i32;
      Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
      Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
    }
  }

  // Try to shrink i64 compares if the input has enough zero bits.
  // FIXME: Do this for non-constant compares for constant on LHS?
  // Requires an unsigned/equality condition, a constant RHS with at most 32
  // active bits, and an Op0 whose upper 32 bits are known to be zero.
  if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
      Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
      cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
      DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
    CmpVT = MVT::i32;
    Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
    Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
  }

  // 0-x == y --> x+y == 0
  // 0-x != y --> x+y != 0
  // Only for (in)equality conditions and when the negation has a single use.
  if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
      Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
    SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
    SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
    return Add.getValue(1);
  }

  // x == 0-y --> x+y == 0
  // x != 0-y --> x+y != 0
  // Mirror image of the case above, with the negation on the right-hand side.
  if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
      Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
    SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
    SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
    return Add.getValue(1);
  }

  // Use SUB instead of CMP to enable CSE between SUB and CMP.
  SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
  SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
  return Sub.getValue(1);
}
25030
25031 /// Check if replacement of SQRT with RSQRT should be disabled.
25032 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
25033   EVT VT = Op.getValueType();
25034
25035   // We don't need to replace SQRT with RSQRT for half type.
25036   if (VT.getScalarType() == MVT::f16)
25037     return true;
25038
25039   // We never want to use both SQRT and RSQRT instructions for the same input.
25040   if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
25041     return false;
25042
25043   if (VT.isVector())
25044     return Subtarget.hasFastVectorFSQRT();
25045   return Subtarget.hasFastScalarFSQRT();
25046 }
25047
25048 /// The minimum architected relative accuracy is 2^-12. We need one
25049 /// Newton-Raphson step to have a good float result (24 bits of precision).
25050 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
25051                                            SelectionDAG &DAG, int Enabled,
25052                                            int &RefinementSteps,
25053                                            bool &UseOneConstNR,
25054                                            bool Reciprocal) const {
25055   SDLoc DL(Op);
25056   EVT VT = Op.getValueType();
25057
25058   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
25059   // It is likely not profitable to do this for f64 because a double-precision
25060   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
25061   // instructions: convert to single, rsqrtss, convert back to double, refine
25062   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
25063   // along with FMA, this could be a throughput win.
25064   // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
25065   // after legalize types.
25066   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
25067       (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
25068       (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
25069       (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
25070       (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
25071     if (RefinementSteps == ReciprocalEstimate::Unspecified)
25072       RefinementSteps = 1;
25073
25074     UseOneConstNR = false;
25075     // There is no FSQRT for 512-bits, but there is RSQRT14.
25076     unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
25077     SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
25078     if (RefinementSteps == 0 && !Reciprocal)
25079       Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
25080     return Estimate;
25081   }
25082
25083   if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
25084       Subtarget.hasFP16()) {
25085     assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
25086     if (RefinementSteps == ReciprocalEstimate::Unspecified)
25087       RefinementSteps = 0;
25088
25089     if (VT == MVT::f16) {
25090       SDValue Zero = DAG.getIntPtrConstant(0, DL);
25091       SDValue Undef = DAG.getUNDEF(MVT::v8f16);
25092       Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
25093       Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
25094       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
25095     }
25096
25097     return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
25098   }
25099   return SDValue();
25100 }
25101
25102 /// The minimum architected relative accuracy is 2^-12. We need one
25103 /// Newton-Raphson step to have a good float result (24 bits of precision).
25104 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
25105                                             int Enabled,
25106                                             int &RefinementSteps) const {
25107   SDLoc DL(Op);
25108   EVT VT = Op.getValueType();
25109
25110   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
25111   // It is likely not profitable to do this for f64 because a double-precision
25112   // reciprocal estimate with refinement on x86 prior to FMA requires
25113   // 15 instructions: convert to single, rcpss, convert back to double, refine
25114   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
25115   // along with FMA, this could be a throughput win.
25116
25117   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
25118       (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
25119       (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
25120       (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
25121     // Enable estimate codegen with 1 refinement step for vector division.
25122     // Scalar division estimates are disabled because they break too much
25123     // real-world code. These defaults are intended to match GCC behavior.
25124     if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
25125       return SDValue();
25126
25127     if (RefinementSteps == ReciprocalEstimate::Unspecified)
25128       RefinementSteps = 1;
25129
25130     // There is no FSQRT for 512-bits, but there is RCP14.
25131     unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
25132     return DAG.getNode(Opcode, DL, VT, Op);
25133   }
25134
25135   if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
25136       Subtarget.hasFP16()) {
25137     if (RefinementSteps == ReciprocalEstimate::Unspecified)
25138       RefinementSteps = 0;
25139
25140     if (VT == MVT::f16) {
25141       SDValue Zero = DAG.getIntPtrConstant(0, DL);
25142       SDValue Undef = DAG.getUNDEF(MVT::v8f16);
25143       Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
25144       Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
25145       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
25146     }
25147
25148     return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
25149   }
25150   return SDValue();
25151 }
25152
25153 /// If we have at least two divisions that use the same divisor, convert to
25154 /// multiplication by a reciprocal. This may need to be adjusted for a given
25155 /// CPU if a division's cost is not at least twice the cost of a multiplication.
25156 /// This is because we still need one division to calculate the reciprocal and
25157 /// then we need two multiplies by that reciprocal as replacements for the
25158 /// original divisions.
25159 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
25160   return 2;
25161 }
25162
25163 SDValue
25164 X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
25165                                  SelectionDAG &DAG,
25166                                  SmallVectorImpl<SDNode *> &Created) const {
25167   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
25168   if (isIntDivCheap(N->getValueType(0), Attr))
25169     return SDValue(N,0); // Lower SDIV as SDIV
25170
25171   assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
25172          "Unexpected divisor!");
25173
25174   // Only perform this transform if CMOV is supported otherwise the select
25175   // below will become a branch.
25176   if (!Subtarget.canUseCMOV())
25177     return SDValue();
25178
25179   // fold (sdiv X, pow2)
25180   EVT VT = N->getValueType(0);
25181   // FIXME: Support i8.
25182   if (VT != MVT::i16 && VT != MVT::i32 &&
25183       !(Subtarget.is64Bit() && VT == MVT::i64))
25184     return SDValue();
25185
25186   unsigned Lg2 = Divisor.countr_zero();
25187
25188   // If the divisor is 2 or -2, the default expansion is better.
25189   if (Lg2 == 1)
25190     return SDValue();
25191
25192   SDLoc DL(N);
25193   SDValue N0 = N->getOperand(0);
25194   SDValue Zero = DAG.getConstant(0, DL, VT);
25195   APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
25196   SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
25197
25198   // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
25199   SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
25200   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
25201   SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
25202
25203   Created.push_back(Cmp.getNode());
25204   Created.push_back(Add.getNode());
25205   Created.push_back(CMov.getNode());
25206
25207   // Divide by pow2.
25208   SDValue SRA =
25209       DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
25210
25211   // If we're dividing by a positive value, we're done.  Otherwise, we must
25212   // negate the result.
25213   if (Divisor.isNonNegative())
25214     return SRA;
25215
25216   Created.push_back(SRA.getNode());
25217   return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
25218 }
25219
/// Result of 'and' is compared against zero. Change to a BT node if possible.
/// Returns the BT node and the condition code needed to use it.
///
/// \p And must be an ISD::AND node (asserted). \p CC is compared only against
/// ISD::SETEQ here. On success \p X86CC is set to COND_AE or COND_B; on
/// failure an empty SDValue is returned and \p X86CC is left untouched.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
                            SelectionDAG &DAG, X86::CondCode &X86CC) {
  assert(And.getOpcode() == ISD::AND && "Expected AND node!");
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  // Look through a truncate on either operand of the AND.
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue Src, BitNo;
  // Canonicalize so that a SHL operand, if there is one, ends up as Op0.
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    // Pattern: and(shl(1, BitNo), Src).
    if (isOneConstant(Op0.getOperand(0))) {
      // If we looked past a truncate, check that it's only truncating away
      // known zeros.
      unsigned BitWidth = Op0.getValueSizeInBits();
      unsigned AndBitWidth = And.getValueSizeInBits();
      if (BitWidth > AndBitWidth) {
        KnownBits Known = DAG.computeKnownBits(Op0);
        if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
          return SDValue();
      }
      Src = Op1;
      BitNo = Op0.getOperand(1);
    }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    uint64_t AndRHSVal = AndRHS->getZExtValue();
    SDValue AndLHS = Op0;

    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
      // Pattern: and(srl(Src, BitNo), 1).
      Src = AndLHS.getOperand(0);
      BitNo = AndLHS.getOperand(1);
    } else {
      // Use BT if the immediate can't be encoded in a TEST instruction or we
      // are optimizing for size and the immediate won't fit in a byte.
      bool OptForSize = DAG.shouldOptForSize();
      if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
          isPowerOf2_64(AndRHSVal)) {
        // Single-bit mask: the bit index is log2 of the mask value.
        Src = AndLHS;
        BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
                                Src.getValueType());
      }
    }
  }

  // No patterns found, give up.
  if (!Src.getNode())
    return SDValue();

  // Remove any bit flip.
  // Testing a bit of NOT(X) is the inverse of testing that bit of X, so the
  // condition is inverted to compensate.
  if (isBitwiseNot(Src)) {
    Src = Src.getOperand(0);
    CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
  }

  // Attempt to create the X86ISD::BT node.
  if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
    // SETEQ maps to COND_AE and anything else to COND_B -- presumably because
    // BT reports the selected bit in the carry flag; confirm against getBT.
    X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
    return BT;
  }

  return SDValue();
}
25288
25289 // Check if pre-AVX condcode can be performed by a single FCMP op.
25290 static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
25291   return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
25292 }
25293
25294 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
25295 /// CMPs.
      ///
      /// \param SetCCOpcode the condition code to translate; any code not listed
      ///        in the first switch below reaches llvm_unreachable.
      /// \param Op0,Op1 the compare operands, taken by reference. They are swapped
      ///        in place for predicates that are only encoded in the opposite
      ///        direction (e.g. GT is emitted as LT with swapped operands).
      /// \param IsAlwaysSignaling [out] set to false for the EQ/NE/ORD/UNORD
      ///        family of predicates and to true for every other predicate.
      /// \returns the compare immediate. SETUEQ and SETONE yield 8 and 12, which
      ///        lie outside the 0-7 table documented below.
25296 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
25297                                    SDValue &Op1, bool &IsAlwaysSignaling) {
25298   unsigned SSECC;
25299   bool Swap = false;
25300
25301   // SSE Condition code mapping:
25302   //  0 - EQ
25303   //  1 - LT
25304   //  2 - LE
25305   //  3 - UNORD
25306   //  4 - NEQ
25307   //  5 - NLT
25308   //  6 - NLE
25309   //  7 - ORD
25310   switch (SetCCOpcode) {
25311   default: llvm_unreachable("Unexpected SETCC condition");
25312   case ISD::SETOEQ:
25313   case ISD::SETEQ:  SSECC = 0; break;
25314   case ISD::SETOGT:
25315   case ISD::SETGT:  Swap = true; [[fallthrough]];
25316   case ISD::SETLT:
25317   case ISD::SETOLT: SSECC = 1; break;
25318   case ISD::SETOGE:
25319   case ISD::SETGE:  Swap = true; [[fallthrough]];
25320   case ISD::SETLE:
25321   case ISD::SETOLE: SSECC = 2; break;
25322   case ISD::SETUO:  SSECC = 3; break;
25323   case ISD::SETUNE:
25324   case ISD::SETNE:  SSECC = 4; break;
25325   case ISD::SETULE: Swap = true; [[fallthrough]];
25326   case ISD::SETUGE: SSECC = 5; break;
25327   case ISD::SETULT: Swap = true; [[fallthrough]];
25328   case ISD::SETUGT: SSECC = 6; break;
25329   case ISD::SETO:   SSECC = 7; break;
        // NOTE(review): 8 and 12 are not part of the 0-7 table above; they look
        // like AVX-only predicate encodings. The pre-AVX path in LowerVSETCC
        // checks cheapX86FSETCC_SSE and never emits these two values.
25330   case ISD::SETUEQ: SSECC = 8; break;
25331   case ISD::SETONE: SSECC = 12; break;
25332   }
25333   if (Swap)
25334     std::swap(Op0, Op1);
25335
        // Classify the predicate for the caller: the equality and orderedness
        // predicates listed below report false, every relational predicate
        // (the default case) reports true.
25336   switch (SetCCOpcode) {
25337   default:
25338     IsAlwaysSignaling = true;
25339     break;
25340   case ISD::SETEQ:
25341   case ISD::SETOEQ:
25342   case ISD::SETUEQ:
25343   case ISD::SETNE:
25344   case ISD::SETONE:
25345   case ISD::SETUNE:
25346   case ISD::SETO:
25347   case ISD::SETUO:
25348     IsAlwaysSignaling = false;
25349     break;
25350   }
25351
25352   return SSECC;
25353 }
25354
25355 /// Split an integer vector SETCC into two half-width SETCCs and concatenate
25356 /// the results back.
      ///
      /// LowerVSETCC uses this both for 256-bit compares when the subtarget lacks
      /// 256-bit integer support and for 512-bit compares.
      ///
      /// \param VT   result type; must be an integer type and must equal the type
      ///             of both LHS and RHS (asserted).
      /// \param LHS,RHS the operands to split into low/high halves.
      /// \param Cond the condition code applied unchanged to both halves.
      /// \returns a CONCAT_VECTORS of the low-half and high-half SETCC nodes.
25357 static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
25358                               ISD::CondCode Cond, SelectionDAG &DAG,
25359                               const SDLoc &dl) {
25360   assert(VT.isInteger() && VT == LHS.getValueType() &&
25361          VT == RHS.getValueType() && "Unsupported VTs!");
25362
25363   SDValue CC = DAG.getCondCode(Cond);
25364
25365   // Extract the LHS Lo/Hi vectors
25366   SDValue LHS1, LHS2;
25367   std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
25368
25369   // Extract the RHS Lo/Hi vectors
25370   SDValue RHS1, RHS2;
25371   std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
25372
25373   // Issue the operation on the smaller types and concatenate the result back
25374   EVT LoVT, HiVT;
25375   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
25376   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
25377                      DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
25378                      DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
25379 }
25380
25381 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
        // Re-emits a vector SETCC whose result type has i1 elements (asserted
        // below). The only transformation is canonicalizing SETLT to SETGT by
        // swapping the two operands; every other condition code is passed
        // through unchanged to DAG.getSetCC.
25382
25383   SDValue Op0 = Op.getOperand(0);
25384   SDValue Op1 = Op.getOperand(1);
25385   SDValue CC = Op.getOperand(2);
25386   MVT VT = Op.getSimpleValueType();
25387   SDLoc dl(Op);
25388
25389   assert(VT.getVectorElementType() == MVT::i1 &&
25390          "Cannot set masked compare for this operation");
25391
25392   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
25393
25394   // Prefer SETGT over SETLT.
25395   if (SetCCOpcode == ISD::SETLT) {
25396     SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
25397     std::swap(Op0, Op1);
25398   }
25399
25400   return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
25401 }
25402
25403 /// Given a buildvector constant, return a new vector constant with each element
25404 /// incremented or decremented. If incrementing or decrementing would result in
25405 /// unsigned overflow or underflow or this is not a simple vector constant,
25406 /// return an empty value.
      ///
      /// \param V     the candidate vector; must be a BuildVectorSDNode with a
      ///              simple value type, otherwise an empty value is returned.
      /// \param IsInc true to add one to every element, false to subtract one.
      /// \param NSW   when true, additionally return an empty value if any element
      ///              would wrap in the signed sense (max signed value on
      ///              increment, min signed value on decrement).
      ///
      /// An empty value is also returned if any operand is not a ConstantSDNode,
      /// is opaque, or does not have the vector's element type.
25407 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
25408                                     bool NSW) {
25409   auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
25410   if (!BV || !V.getValueType().isSimple())
25411     return SDValue();
25412
25413   MVT VT = V.getSimpleValueType();
25414   MVT EltVT = VT.getVectorElementType();
25415   unsigned NumElts = VT.getVectorNumElements();
25416   SmallVector<SDValue, 8> NewVecC;
25417   SDLoc DL(V);
25418   for (unsigned i = 0; i < NumElts; ++i) {
25419     auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
25420     if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
25421       return SDValue();
25422
25423     // Avoid overflow/underflow.
25424     const APInt &EltC = Elt->getAPIntValue();
25425     if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
25426       return SDValue();
25427     if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
25428                 (!IsInc && EltC.isMinSignedValue())))
25429       return SDValue();
25430
25431     NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
25432   }
25433
25434   return DAG.getBuildVector(VT, DL, NewVecC);
25435 }
25436
25437 /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
25438 /// Op0 u<= Op1:
25439 ///   t = psubus Op0, Op1
25440 ///   pcmpeq t, <0..0>
      ///
      /// Returns an empty value when the subtarget lacks SSE2, when the element
      /// type is neither i8 nor i16, when Cond is not one of
      /// SETULT/SETUGT/SETUGE/SETULE, or when a required constant adjustment via
      /// incDecVectorConstant fails. Op0 and Op1 are by-value copies, so the
      /// rewrites below do not affect the caller's operands.
25441 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
25442                                     ISD::CondCode Cond, const SDLoc &dl,
25443                                     const X86Subtarget &Subtarget,
25444                                     SelectionDAG &DAG) {
25445   if (!Subtarget.hasSSE2())
25446     return SDValue();
25447
25448   MVT VET = VT.getVectorElementType();
25449   if (VET != MVT::i8 && VET != MVT::i16)
25450     return SDValue();
25451
        // Each case rewrites (Op0, Op1) so that the original predicate holds
        // exactly when usubsat(Op0, Op1) == 0, i.e. when Op0 u<= Op1.
25452   switch (Cond) {
25453   default:
25454     return SDValue();
25455   case ISD::SETULT: {
25456     // If the comparison is against a constant we can turn this into a
25457     // setule.  With psubus, setule does not require a swap.  This is
25458     // beneficial because the constant in the register is no longer
25459     // destructed as the destination so it can be hoisted out of a loop.
25460     // Only do this pre-AVX since vpcmp* is no longer destructive.
25461     if (Subtarget.hasAVX())
25462       return SDValue();
          // X u< C  -->  X u<= C-1 (fails if any element of C is zero).
25463     SDValue ULEOp1 =
25464         incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
25465     if (!ULEOp1)
25466       return SDValue();
25467     Op1 = ULEOp1;
25468     break;
25469   }
25470   case ISD::SETUGT: {
25471     // If the comparison is against a constant, we can turn this into a setuge.
25472     // This is beneficial because materializing a constant 0 for the PCMPEQ is
25473     // probably cheaper than XOR+PCMPGT using 2 different vector constants:
25474     // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
25475     SDValue UGEOp1 =
25476         incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
25477     if (!UGEOp1)
25478       return SDValue();
          // X u> C  -->  C+1 u<= X, so the incremented constant becomes Op0.
25479     Op1 = Op0;
25480     Op0 = UGEOp1;
25481     break;
25482   }
25483   // Psubus is better than flip-sign because it requires no inversion.
25484   case ISD::SETUGE:
          // Op0 u>= Op1  -->  Op1 u<= Op0.
25485     std::swap(Op0, Op1);
25486     break;
25487   case ISD::SETULE:
25488     break;
25489   }
25490
25491   SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
25492   return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
25493                      DAG.getConstant(0, dl, VT));
25494 }
25495
25496 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
25497                            SelectionDAG &DAG) {
        // Lowers a vector SETCC, STRICT_FSETCC or STRICT_FSETCCS node.
        //  - FP operands are emitted as CMPM (vXi1 mask result) or CMPP (result in
        //    the operand's FP vector type); the strict variants thread the chain
        //    and the result is returned merged with the output chain.
        //  - Integer operands use, in order of preference: the AVX-512 mask
        //    compare, XOP PCOM, a few pattern-specific shortcuts, splitting of
        //    wide vectors, min/max or saturating-subtract tricks for unsigned
        //    predicates, and finally PCMPEQ/PCMPGT with operand swaps, sign-bit
        //    flips and result inversion as required.
        // Returns an empty SDValue when it declines to lower the node (soft f16
        // elements, or a quiet strict compare that the pre-AVX encodings would
        // turn into a signaling one).
25498   bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25499                   Op.getOpcode() == ISD::STRICT_FSETCCS;
        // Strict nodes carry the chain as operand 0, shifting the rest by one.
25500   SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25501   SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25502   SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
25503   MVT VT = Op->getSimpleValueType(0);
25504   ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
25505   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
25506   SDLoc dl(Op);
25507
25508   if (isFP) {
25509     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
25510     assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
25511     if (isSoftFP16(EltVT, Subtarget))
25512       return SDValue();
25513
25514     bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
25515     SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25516
25517     // If we have a strict compare with a vXi1 result and the input is 128/256
25518     // bits we can't use a masked compare unless we have VLX. If we use a wider
25519     // compare like we do for non-strict, we might trigger spurious exceptions
25520     // from the upper elements. Instead emit a AVX compare and convert to mask.
25521     unsigned Opc;
25522     if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
25523         (!IsStrict || Subtarget.hasVLX() ||
25524          Op0.getSimpleValueType().is512BitVector())) {
25525 #ifndef NDEBUG
25526       unsigned Num = VT.getVectorNumElements();
25527       assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
25528 #endif
25529       Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
25530     } else {
25531       Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
25532       // The SSE/AVX packed FP comparison nodes are defined with a
25533       // floating-point vector result that matches the operand type. This allows
25534       // them to work with an SSE1 target (integer vector types are not legal).
25535       VT = Op0.getSimpleValueType();
25536     }
25537
25538     SDValue Cmp;
25539     bool IsAlwaysSignaling;
          // Note: this may swap Op0 and Op1 in place.
25540     unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
25541     if (!Subtarget.hasAVX()) {
25542       // TODO: We could use following steps to handle a quiet compare with
25543       // signaling encodings.
25544       // 1. Get ordered masks from a quiet ISD::SETO
25545       // 2. Use the masks to mask potential unordered elements in operand A, B
25546       // 3. Get the compare results of masked A, B
25547       // 4. Calculating final result using the mask and result from 3
25548       // But currently, we just fall back to scalar operations.
25549       if (IsStrict && IsAlwaysSignaling && !IsSignaling)
25550         return SDValue();
25551
25552       // Insert an extra signaling instruction to raise exception.
25553       if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
25554         SDValue SignalCmp = DAG.getNode(
25555             Opc, dl, {VT, MVT::Other},
25556             {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
25557         // FIXME: It seems we need to update the flags of all new strict nodes.
25558         // Otherwise, mayRaiseFPException in MI will return false due to
25559         // NoFPExcept = false by default. However, I didn't find it in other
25560         // patches.
25561         SignalCmp->setFlags(Op->getFlags());
25562         Chain = SignalCmp.getValue(1);
25563       }
25564
25565       // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
25566       // emit two comparisons and a logic op to tie them together.
25567       if (!cheapX86FSETCC_SSE(Cond)) {
25568         // LLVM predicate is SETUEQ or SETONE.
25569         unsigned CC0, CC1;
25570         unsigned CombineOpc;
25571         if (Cond == ISD::SETUEQ) {
25572           CC0 = 3; // UNORD
25573           CC1 = 0; // EQ
25574           CombineOpc = X86ISD::FOR;
25575         } else {
25576           assert(Cond == ISD::SETONE);
25577           CC0 = 7; // ORD
25578           CC1 = 4; // NEQ
25579           CombineOpc = X86ISD::FAND;
25580         }
25581
25582         SDValue Cmp0, Cmp1;
25583         if (IsStrict) {
25584           Cmp0 = DAG.getNode(
25585               Opc, dl, {VT, MVT::Other},
25586               {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
25587           Cmp1 = DAG.getNode(
25588               Opc, dl, {VT, MVT::Other},
25589               {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
                // Both compares hang off the same input chain; join their output
                // chains so later users depend on both.
25590           Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
25591                               Cmp1.getValue(1));
25592         } else {
25593           Cmp0 = DAG.getNode(
25594               Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
25595           Cmp1 = DAG.getNode(
25596               Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
25597         }
25598         Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
25599       } else {
25600         if (IsStrict) {
25601           Cmp = DAG.getNode(
25602               Opc, dl, {VT, MVT::Other},
25603               {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25604           Chain = Cmp.getValue(1);
25605         } else
25606           Cmp = DAG.getNode(
25607               Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25608       }
25609     } else {
25610       // Handle all other FP comparisons here.
25611       if (IsStrict) {
25612         // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
25613         SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
25614         Cmp = DAG.getNode(
25615             Opc, dl, {VT, MVT::Other},
25616             {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25617         Chain = Cmp.getValue(1);
25618       } else
25619         Cmp = DAG.getNode(
25620             Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25621     }
25622
          // VT may have been widened to the operand type above; Op's own type is
          // still the requested result type.
25623     if (VT.getFixedSizeInBits() >
25624         Op.getSimpleValueType().getFixedSizeInBits()) {
25625       // We emitted a compare with an XMM/YMM result. Finish converting to a
25626       // mask register using a vptestm.
25627       EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
25628       Cmp = DAG.getBitcast(CastVT, Cmp);
25629       Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
25630                          DAG.getConstant(0, dl, CastVT), ISD::SETNE);
25631     } else {
25632       // If this is SSE/AVX CMPP, bitcast the result back to integer to match
25633       // the result type of SETCC. The bitcast is expected to be optimized
25634       // away during combining/isel.
25635       Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
25636     }
25637
25638     if (IsStrict)
25639       return DAG.getMergeValues({Cmp, Chain}, dl);
25640
25641     return Cmp;
25642   }
25643
25644   assert(!IsStrict && "Strict SETCC only handles FP operands.");
25645
25646   MVT VTOp0 = Op0.getSimpleValueType();
        // VTOp0 is only read inside asserts; silence unused-variable warnings in
        // builds where asserts compile away.
25647   (void)VTOp0;
25648   assert(VTOp0 == Op1.getSimpleValueType() &&
25649          "Expected operands with same type!");
25650   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
25651          "Invalid number of packed elements for source and destination!");
25652
25653   // The non-AVX512 code below works under the assumption that source and
25654   // destination types are the same.
25655   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
25656          "Value types for source and destination must be the same!");
25657
25658   // The result is boolean, but operands are int/float
25659   if (VT.getVectorElementType() == MVT::i1) {
25660     // In AVX-512 architecture setcc returns mask with i1 elements,
25661     // But there is no compare instruction for i8 and i16 elements in KNL.
25662     assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
25663            "Unexpected operand type");
25664     return LowerIntVSETCC_AVX512(Op, DAG);
25665   }
25666
25667   // Lower using XOP integer comparisons.
25668   if (VT.is128BitVector() && Subtarget.hasXOP()) {
25669     // Translate compare code to XOP PCOM compare mode.
25670     unsigned CmpMode = 0;
25671     switch (Cond) {
25672     default: llvm_unreachable("Unexpected SETCC condition");
25673     case ISD::SETULT:
25674     case ISD::SETLT: CmpMode = 0x00; break;
25675     case ISD::SETULE:
25676     case ISD::SETLE: CmpMode = 0x01; break;
25677     case ISD::SETUGT:
25678     case ISD::SETGT: CmpMode = 0x02; break;
25679     case ISD::SETUGE:
25680     case ISD::SETGE: CmpMode = 0x03; break;
25681     case ISD::SETEQ: CmpMode = 0x04; break;
25682     case ISD::SETNE: CmpMode = 0x05; break;
25683     }
25684
25685     // Are we comparing unsigned or signed integers?
25686     unsigned Opc =
25687         ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
25688
25689     return DAG.getNode(Opc, dl, VT, Op0, Op1,
25690                        DAG.getTargetConstant(CmpMode, dl, MVT::i8));
25691   }
25692
25693   // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
25694   // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
25695   if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
25696     SDValue BC0 = peekThroughBitcasts(Op0);
25697     if (BC0.getOpcode() == ISD::AND) {
25698       APInt UndefElts;
25699       SmallVector<APInt, 64> EltBits;
25700       if (getTargetConstantBitsFromNode(BC0.getOperand(1),
25701                                         VT.getScalarSizeInBits(), UndefElts,
25702                                         EltBits, false, false)) {
25703         if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
25704           Cond = ISD::SETEQ;
25705           Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
25706         }
25707       }
25708     }
25709   }
25710
25711   // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
        // The SHL amount actually used is BW-1-LOG2(C): it moves the tested bit
        // into the sign bit, and the SRA by BW-1 then replicates that bit across
        // the whole element.
25712   if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
25713       Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
25714     ConstantSDNode *C1 = isConstOrConstSplat(Op1);
25715     if (C1 && C1->getAPIntValue().isPowerOf2()) {
25716       unsigned BitWidth = VT.getScalarSizeInBits();
25717       unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
25718
25719       SDValue Result = Op0.getOperand(0);
25720       Result = DAG.getNode(ISD::SHL, dl, VT, Result,
25721                            DAG.getConstant(ShiftAmt, dl, VT));
25722       Result = DAG.getNode(ISD::SRA, dl, VT, Result,
25723                            DAG.getConstant(BitWidth - 1, dl, VT));
25724       return Result;
25725     }
25726   }
25727
25728   // Break 256-bit integer vector compare into smaller ones.
25729   if (VT.is256BitVector() && !Subtarget.hasInt256())
25730     return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25731
25732   // Break 512-bit integer vector compare into smaller ones.
25733   // TODO: Try harder to use VPCMPx + VPMOV2x?
25734   if (VT.is512BitVector())
25735     return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25736
25737   // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
25738   // not-of-PCMPEQ:
25739   // X != INT_MIN --> X >s INT_MIN
25740   // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
25741   // +X != 0 --> +X >s 0
25742   APInt ConstValue;
25743   if (Cond == ISD::SETNE &&
25744       ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
25745     if (ConstValue.isMinSignedValue())
25746       Cond = ISD::SETGT;
25747     else if (ConstValue.isMaxSignedValue())
25748       Cond = ISD::SETLT;
25749     else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
25750       Cond = ISD::SETGT;
25751   }
25752
25753   // If both operands are known non-negative, then an unsigned compare is the
25754   // same as a signed compare and there's no need to flip signbits.
25755   // TODO: We could check for more general simplifications here since we're
25756   // computing known bits.
25757   bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
25758                    !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
25759
25760   // Special case: Use min/max operations for unsigned compares.
25761   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25762   if (ISD::isUnsignedIntSetCC(Cond) &&
25763       (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
25764       TLI.isOperationLegal(ISD::UMIN, VT)) {
25765     // If we have a constant operand, increment/decrement it and change the
25766     // condition to avoid an invert.
25767     if (Cond == ISD::SETUGT) {
25768       // X > C --> X >= (C+1) --> X == umax(X, C+1)
25769       if (SDValue UGTOp1 =
25770               incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
25771         Op1 = UGTOp1;
25772         Cond = ISD::SETUGE;
25773       }
25774     }
25775     if (Cond == ISD::SETULT) {
25776       // X < C --> X <= (C-1) --> X == umin(X, C-1)
25777       if (SDValue ULTOp1 =
25778               incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
25779         Op1 = ULTOp1;
25780         Cond = ISD::SETULE;
25781       }
25782     }
25783     bool Invert = false;
25784     unsigned Opc;
          // X u<= Y iff X == umin(X, Y); X u>= Y iff X == umax(X, Y). The strict
          // predicates are the logical-not of the opposite non-strict one.
25785     switch (Cond) {
25786     default: llvm_unreachable("Unexpected condition code");
25787     case ISD::SETUGT: Invert = true; [[fallthrough]];
25788     case ISD::SETULE: Opc = ISD::UMIN; break;
25789     case ISD::SETULT: Invert = true; [[fallthrough]];
25790     case ISD::SETUGE: Opc = ISD::UMAX; break;
25791     }
25792
25793     SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25794     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
25795
25796     // If the logical-not of the result is required, perform that now.
25797     if (Invert)
25798       Result = DAG.getNOT(dl, Result, VT);
25799
25800     return Result;
25801   }
25802
25803   // Try to use SUBUS and PCMPEQ.
25804   if (FlipSigns)
25805     if (SDValue V =
25806             LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
25807       return V;
25808
25809   // We are handling one of the integer comparisons here. Since SSE only has
25810   // GT and EQ comparisons for integer, swapping operands and multiple
25811   // operations may be required for some comparisons.
25812   unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
25813                                                             : X86ISD::PCMPGT;
25814   bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
25815               Cond == ISD::SETGE || Cond == ISD::SETUGE;
25816   bool Invert = Cond == ISD::SETNE ||
25817                 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
25818
25819   if (Swap)
25820     std::swap(Op0, Op1);
25821
25822   // Check that the operation in question is available (most are plain SSE2,
25823   // but PCMPGTQ and PCMPEQQ have different requirements).
25824   if (VT == MVT::v2i64) {
25825     if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
25826       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
25827
25828       // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
25829       // the odd elements over the even elements.
25830       if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
25831         Op0 = DAG.getConstant(0, dl, MVT::v4i32);
25832         Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25833
25834         SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25835         static const int MaskHi[] = { 1, 1, 3, 3 };
25836         SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25837
25838         return DAG.getBitcast(VT, Result);
25839       }
25840
            // Same trick for a compare against all-ones on the right-hand side.
25841       if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
25842         Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25843         Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
25844
25845         SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25846         static const int MaskHi[] = { 1, 1, 3, 3 };
25847         SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25848
25849         return DAG.getBitcast(VT, Result);
25850       }
25851
25852       // Since SSE has no unsigned integer comparisons, we need to flip the sign
25853       // bits of the inputs before performing those operations. The lower
25854       // compare is always unsigned.
25855       SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
25856                                              : 0x0000000080000000ULL,
25857                                    dl, MVT::v2i64);
25858
25859       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
25860       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
25861
25862       // Cast everything to the right type.
25863       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25864       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25865
25866       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
25867       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25868       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
25869
25870       // Create masks for only the low parts/high parts of the 64 bit integers.
25871       static const int MaskHi[] = { 1, 1, 3, 3 };
25872       static const int MaskLo[] = { 0, 0, 2, 2 };
25873       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
25874       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
25875       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25876
25877       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
25878       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
25879
25880       if (Invert)
25881         Result = DAG.getNOT(dl, Result, MVT::v4i32);
25882
25883       return DAG.getBitcast(VT, Result);
25884     }
25885
25886     if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
25887       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
25888       // pcmpeqd + pshufd + pand.
25889       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
25890
25891       // First cast everything to the right type.
25892       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25893       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25894
25895       // Do the compare.
25896       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
25897
25898       // Make sure the lower and upper halves are both all-ones.
25899       static const int Mask[] = { 1, 0, 3, 2 };
25900       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
25901       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
25902
25903       if (Invert)
25904         Result = DAG.getNOT(dl, Result, MVT::v4i32);
25905
25906       return DAG.getBitcast(VT, Result);
25907     }
25908   }
25909
25910   // Since SSE has no unsigned integer comparisons, we need to flip the sign
25911   // bits of the inputs before performing those operations.
25912   if (FlipSigns) {
25913     MVT EltVT = VT.getVectorElementType();
25914     SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
25915                                  VT);
25916     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
25917     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
25918   }
25919
25920   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25921
25922   // If the logical-not of the result is required, perform that now.
25923   if (Invert)
25924     Result = DAG.getNOT(dl, Result, VT);
25925
25926   return Result;
25927 }
25928
25929 // Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
      //
      // Succeeds only when Op0 is a bitcast from a vXi1 type the subtarget can
      // test (v16i1 with AVX512, v8i1 with DQI, v32i1/v64i1 with BWI) and Op1 is
      // the constant 0 or all-ones. On success returns the KTEST/KORTEST node and
      // stores the matching X86 condition code in X86CC; otherwise returns an
      // empty SDValue and leaves X86CC untouched. CC must be SETEQ or SETNE
      // (asserted).
25930 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
25931                               const SDLoc &dl, SelectionDAG &DAG,
25932                               const X86Subtarget &Subtarget,
25933                               SDValue &X86CC) {
25934   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
25935
25936   // Must be a bitcast from vXi1.
25937   if (Op0.getOpcode() != ISD::BITCAST)
25938     return SDValue();
25939
25940   Op0 = Op0.getOperand(0);
25941   MVT VT = Op0.getSimpleValueType();
25942   if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
25943       !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
25944       !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
25945     return SDValue();
25946
25947   X86::CondCode X86Cond;
25948   if (isNullConstant(Op1)) {
25949     X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
25950   } else if (isAllOnesConstant(Op1)) {
25951     // C flag is set for all ones.
25952     X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
25953   } else
25954     return SDValue();
25955
25956   // If the input is an AND, we can combine its operands into the KTEST.
        // KTEST is only considered for a compare against zero, and only for mask
        // widths covered by DQI (v8i1/v16i1) or BWI (v32i1/v64i1).
25957   bool KTestable = false;
25958   if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
25959     KTestable = true;
25960   if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
25961     KTestable = true;
25962   if (!isNullConstant(Op1))
25963     KTestable = false;
25964   if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
25965     SDValue LHS = Op0.getOperand(0);
25966     SDValue RHS = Op0.getOperand(1);
25967     X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25968     return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
25969   }
25970
25971   // If the input is an OR, we can combine its operands into the KORTEST.
        // Otherwise the mask is tested against itself.
25972   SDValue LHS = Op0;
25973   SDValue RHS = Op0;
25974   if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
25975     LHS = Op0.getOperand(0);
25976     RHS = Op0.getOperand(1);
25977   }
25978
25979   X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25980   return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
25981 }
25982
25983 /// Emit flags for the given setcc condition and operands. Also returns the
25984 /// corresponding X86 condition code constant in X86CC.
      ///
      /// For SETEQ/SETNE a series of cheaper forms is tried first (BT, vector
      /// all-equal tests, KORTEST/KTEST, reuse of an existing X86ISD::SETCC, and
      /// the flag result of an ADD with -1); if none applies, or for any other
      /// condition code, the condition is translated with TranslateX86CC and a
      /// compare is emitted with EmitCmp.
      ///
      /// Side effect: the ADD case replaces all uses of the original ADD value
      /// with the value result of a newly created X86ISD::ADD node.
25985 SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
25986                                              ISD::CondCode CC, const SDLoc &dl,
25987                                              SelectionDAG &DAG,
25988                                              SDValue &X86CC) const {
25989   // Equality Combines.
25990   if (CC == ISD::SETEQ || CC == ISD::SETNE) {
25991     X86::CondCode X86CondCode;
25992
25993     // Optimize to BT if possible.
25994     // Lower (X & (1 << N)) == 0 to BT(X, N).
25995     // Lower ((X >>u N) & 1) != 0 to BT(X, N).
25996     // Lower ((X >>s N) & 1) != 0 to BT(X, N).
25997     if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
25998       if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
25999         X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
26000         return BT;
26001       }
26002     }
26003
26004     // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
26005     if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
26006                                                X86CondCode)) {
26007       X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
26008       return CmpZ;
26009     }
26010
26011     // Try to lower using KORTEST or KTEST.
26012     if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
26013       return Test;
26014
26015     // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms
26016     // of these.
26017     if (isOneConstant(Op1) || isNullConstant(Op1)) {
26018       // If the input is a setcc, then reuse the input setcc or use a new one
26019       // with the inverted condition.
26020       if (Op0.getOpcode() == X86ISD::SETCC) {
              // (setcc == 0) and (setcc != 1) need the opposite condition;
              // (setcc != 0) and (setcc == 1) reuse the inner condition as-is.
26021         bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
26022
26023         X86CC = Op0.getOperand(0);
26024         if (Invert) {
26025           X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
26026           X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
26027           X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
26028         }
26029
              // Operand 1 of the inner SETCC is the value it tests.
26030         return Op0.getOperand(1);
26031       }
26032     }
26033
26034     // Try to use the carry flag from the add in place of a separate CMP for:
26035     // (seteq (add X, -1), -1). Similar for setne.
26036     if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
26037         Op0.getOperand(1) == Op1) {
26038       if (isProfitableToUseFlagOp(Op0)) {
26039         SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
26040
26041         SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
26042                                   Op0.getOperand(1));
              // Redirect every user of the old ADD to the new node's value
              // result; result 1 of the new node is returned as the flags.
26043         DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
26044         X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
26045         X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
26046         return SDValue(New.getNode(), 1);
26047       }
26048     }
26049   }
26050
26051   X86::CondCode CondCode =
26052       TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
26053   assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
26054
26055   SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
26056   X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
26057   return EFLAGS;
26058 }
26059
26060 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
26061
26062   bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
26063                   Op.getOpcode() == ISD::STRICT_FSETCCS;
26064   MVT VT = Op->getSimpleValueType(0);
26065
26066   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
26067
26068   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
26069   SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
26070   SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
26071   SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
26072   SDLoc dl(Op);
26073   ISD::CondCode CC =
26074       cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
26075
26076   if (isSoftFP16(Op0.getValueType()))
26077     return SDValue();
26078
26079   // Handle f128 first, since one possible outcome is a normal integer
26080   // comparison which gets handled by emitFlagsForSetcc.
26081   if (Op0.getValueType() == MVT::f128) {
26082     softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
26083                         Op.getOpcode() == ISD::STRICT_FSETCCS);
26084
26085     // If softenSetCCOperands returned a scalar, use it.
26086     if (!Op1.getNode()) {
26087       assert(Op0.getValueType() == Op.getValueType() &&
26088              "Unexpected setcc expansion!");
26089       if (IsStrict)
26090         return DAG.getMergeValues({Op0, Chain}, dl);
26091       return Op0;
26092     }
26093   }
26094
26095   if (Op0.getSimpleValueType().isInteger()) {
26096     // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
26097     // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
26098     // this may translate to less uops depending on uarch implementation. The
26099     // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
26100     // canonicalize to that CondCode.
26101     // NOTE: Only do this if incrementing the constant doesn't increase the bit
26102     // encoding size - so it must either already be a i8 or i32 immediate, or it
26103     // shrinks down to that. We don't do this for any i64's to avoid additional
26104     // constant materializations.
26105     // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
26106     if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
26107       const APInt &Op1Val = Op1C->getAPIntValue();
26108       if (!Op1Val.isZero()) {
26109         // Ensure the constant+1 doesn't overflow.
26110         if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
26111             (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
26112           APInt Op1ValPlusOne = Op1Val + 1;
26113           if (Op1ValPlusOne.isSignedIntN(32) &&
26114               (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
26115             Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
26116             CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
26117                                             : ISD::CondCode::SETUGE;
26118           }
26119         }
26120       }
26121     }
26122
26123     SDValue X86CC;
26124     SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
26125     SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
26126     return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
26127   }
26128
26129   // Handle floating point.
26130   X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
26131   if (CondCode == X86::COND_INVALID)
26132     return SDValue();
26133
26134   SDValue EFLAGS;
26135   if (IsStrict) {
26136     bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
26137     EFLAGS =
26138         DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
26139                     dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
26140     Chain = EFLAGS.getValue(1);
26141   } else {
26142     EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
26143   }
26144
26145   SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
26146   SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
26147   return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
26148 }
26149
26150 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
26151   SDValue LHS = Op.getOperand(0);
26152   SDValue RHS = Op.getOperand(1);
26153   SDValue Carry = Op.getOperand(2);
26154   SDValue Cond = Op.getOperand(3);
26155   SDLoc DL(Op);
26156
26157   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
26158   X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
26159
26160   // Recreate the carry if needed.
26161   EVT CarryVT = Carry.getValueType();
26162   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
26163                       Carry, DAG.getAllOnesConstant(DL, CarryVT));
26164
26165   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
26166   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
26167   return getSETCC(CC, Cmp.getValue(1), DL, DAG);
26168 }
26169
26170 // This function returns three things: the arithmetic computation itself
26171 // (Value), an EFLAGS result (Overflow), and a condition code (Cond).  The
26172 // flag and the condition code define the case in which the arithmetic
26173 // computation overflows.
26174 static std::pair<SDValue, SDValue>
26175 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
26176   assert(Op.getResNo() == 0 && "Unexpected result number!");
26177   SDValue Value, Overflow;
26178   SDValue LHS = Op.getOperand(0);
26179   SDValue RHS = Op.getOperand(1);
26180   unsigned BaseOp = 0;
26181   SDLoc DL(Op);
26182   switch (Op.getOpcode()) {
26183   default: llvm_unreachable("Unknown ovf instruction!");
26184   case ISD::SADDO:
26185     BaseOp = X86ISD::ADD;
26186     Cond = X86::COND_O;
26187     break;
26188   case ISD::UADDO:
26189     BaseOp = X86ISD::ADD;
26190     Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
26191     break;
26192   case ISD::SSUBO:
26193     BaseOp = X86ISD::SUB;
26194     Cond = X86::COND_O;
26195     break;
26196   case ISD::USUBO:
26197     BaseOp = X86ISD::SUB;
26198     Cond = X86::COND_B;
26199     break;
26200   case ISD::SMULO:
26201     BaseOp = X86ISD::SMUL;
26202     Cond = X86::COND_O;
26203     break;
26204   case ISD::UMULO:
26205     BaseOp = X86ISD::UMUL;
26206     Cond = X86::COND_O;
26207     break;
26208   }
26209
26210   if (BaseOp) {
26211     // Also sets EFLAGS.
26212     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
26213     Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
26214     Overflow = Value.getValue(1);
26215   }
26216
26217   return std::make_pair(Value, Overflow);
26218 }
26219
26220 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
26221   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
26222   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
26223   // looks for this combo and may remove the "setcc" instruction if the "setcc"
26224   // has only one use.
26225   SDLoc DL(Op);
26226   X86::CondCode Cond;
26227   SDValue Value, Overflow;
26228   std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
26229
26230   SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
26231   assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
26232   return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
26233 }
26234
26235 /// Return true if opcode is a X86 logical comparison.
26236 static bool isX86LogicalCmp(SDValue Op) {
26237   unsigned Opc = Op.getOpcode();
26238   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
26239       Opc == X86ISD::FCMP)
26240     return true;
26241   if (Op.getResNo() == 1 &&
26242       (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
26243        Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
26244        Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
26245     return true;
26246
26247   return false;
26248 }
26249
26250 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
26251   if (V.getOpcode() != ISD::TRUNCATE)
26252     return false;
26253
26254   SDValue VOp0 = V.getOperand(0);
26255   unsigned InBits = VOp0.getValueSizeInBits();
26256   unsigned Bits = V.getValueSizeInBits();
26257   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
26258 }
26259
/// Lower a select node: operand 0 is the condition, operand 1 is the value
/// produced when the condition is true and operand 2 the value produced when
/// it is false. The strategies are tried in this order:
///  1. Soft-FP16 values are selected as integers of the same width.
///  2. Scalar SSE floating point selects on a single-use ISD::SETCC of the same
///     type become a compare mask plus SELECTS (AVX512), a vector select
///     (AVX), or an FAND/FANDN/FOR sequence.
///  3. Remaining scalar SSE floating point selects with AVX512 use SELECTS.
///  4. An ISD::SETCC condition is lowered through LowerSETCC, after which a
///     few select-on-compare-with-zero patterns are turned into plain
///     arithmetic/logic.
///  5. Otherwise an EFLAGS producer is found (an existing compare, an
///     overflow op, a BT, or a freshly emitted test) and an X86ISD::CMOV or
///     X86ISD::SETCC_CARRY is built from it, widening i8/i16 selects to i32
///     where the conditions below allow.
26260 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
        // AddTest stays true until some EFLAGS-producing node has been chosen
        // as Cond; if it is still set at the end, a test against zero is emitted.
26261   bool AddTest = true;
26262   SDValue Cond  = Op.getOperand(0);
26263   SDValue Op1 = Op.getOperand(1);
26264   SDValue Op2 = Op.getOperand(2);
26265   SDLoc DL(Op);
26266   MVT VT = Op1.getSimpleValueType();
26267   SDValue CC;
26268
        // Soft-FP16: select on the integer bit pattern and bitcast back.
26269   if (isSoftFP16(VT)) {
26270     MVT NVT = VT.changeTypeToInteger();
26271     return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
26272                                           DAG.getBitcast(NVT, Op1),
26273                                           DAG.getBitcast(NVT, Op2)));
26274   }
26275
26276   // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
26277   // are available or VBLENDV if AVX is available.
26278   // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
26279   if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
26280       VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
26281     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
26282     bool IsAlwaysSignaling;
26283     unsigned SSECC =
26284         translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
26285                            CondOp0, CondOp1, IsAlwaysSignaling);
26286
26287     if (Subtarget.hasAVX512()) {
26288       SDValue Cmp =
26289           DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
26290                       DAG.getTargetConstant(SSECC, DL, MVT::i8));
26291       assert(!VT.isVector() && "Not a scalar type?");
26292       return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26293     }
26294
26295     if (SSECC < 8 || Subtarget.hasAVX()) {
26296       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
26297                                 DAG.getTargetConstant(SSECC, DL, MVT::i8));
26298
26299       // If we have AVX, we can use a variable vector select (VBLENDV) instead
26300       // of 3 logic instructions for size savings and potentially speed.
26301       // Unfortunately, there is no scalar form of VBLENDV.
26302
26303       // If either operand is a +0.0 constant, don't try this. We can expect to
26304       // optimize away at least one of the logic instructions later in that
26305       // case, so that sequence would be faster than a variable blend.
26306
26307       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
26308       // uses XMM0 as the selection register. That may need just as many
26309       // instructions as the AND/ANDN/OR sequence due to register moves, so
26310       // don't bother.
26311       if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
26312           !isNullFPConstant(Op2)) {
26313         // Convert to vectors, do a VSELECT, and convert back to scalar.
26314         // All of the conversions should be optimized away.
26315         MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
26316         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
26317         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
26318         SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
26319
26320         MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
26321         VCmp = DAG.getBitcast(VCmpVT, VCmp);
26322
26323         SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
26324
26325         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
26326                            VSel, DAG.getIntPtrConstant(0, DL));
26327       }
              // Logic-op fallback: (Cmp & Op1) | (~Cmp & Op2).
26328       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
26329       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
26330       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
26331     }
26332   }
26333
26334   // AVX512 fallback is to lower selects of scalar floats to masked moves.
26335   if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
26336     SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
26337     return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26338   }
26339
26340   if (Cond.getOpcode() == ISD::SETCC &&
26341       !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) {
26342     if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
26343       Cond = NewCond;
26344       // If the condition was updated, it's possible that the operands of the
26345       // select were also updated (for example, EmitTest has a RAUW). Refresh
26346       // the local references to the select operands in case they got stale.
26347       Op1 = Op.getOperand(1);
26348       Op2 = Op.getOperand(2);
26349     }
26350   }
26351
26352   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
26353   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
26354   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
26355   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
26356   // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
26357   // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
26358   // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
26359   // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26360   if (Cond.getOpcode() == X86ISD::SETCC &&
26361       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
26362       isNullConstant(Cond.getOperand(1).getOperand(1))) {
26363     SDValue Cmp = Cond.getOperand(1);
26364     SDValue CmpOp0 = Cmp.getOperand(0);
26365     unsigned CondCode = Cond.getConstantOperandVal(0);
26366
26367     // Special handling for __builtin_ffs(X) - 1 pattern which looks like
26368     // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
26369     // handle to keep the CMP with 0. This should be removed by
26370     // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
26371     // cttz_zero_undef.
26372     auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
26373       return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
26374               Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
26375     };
26376     if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
26377         ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
26378          (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
26379       // Keep Cmp.
26380     } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26381         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
            // Y is whichever select operand is not the all-ones constant.
26382       SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
26383       SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
26384
26385       // 'X - 1' sets the carry flag if X == 0.
26386       // '0 - X' sets the carry flag if X != 0.
26387       // Convert the carry flag to a -1/0 mask with sbb:
26388       // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
26389       // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
26390       // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
26391       // select (X == 0), -1, Y --> X - 1; or (sbb), Y
26392       SDValue Sub;
26393       if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
26394         SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
26395         Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
26396       } else {
26397         SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
26398         Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
26399       }
26400       SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
26401                                 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
26402                                 Sub.getValue(1));
26403       return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
26404     } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
26405                CmpOp0.getOpcode() == ISD::AND &&
26406                isOneConstant(CmpOp0.getOperand(1))) {
26407       SDValue Src1, Src2;
26408       // true if Op2 is XOR or OR operator and one of its operands
26409       // is equal to Op1
26410       // ( a , a op b) || ( b , a op b)
            // On a match, Src2 is set to Op1 and Src1 to the other operand of Op2.
26411       auto isOrXorPattern = [&]() {
26412         if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
26413             (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
26414           Src1 =
26415               Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
26416           Src2 = Op1;
26417           return true;
26418         }
26419         return false;
26420       };
26421
26422       if (isOrXorPattern()) {
26423         SDValue Neg;
26424         unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
26425         // we need mask of all zeros or ones with same size of the other
26426         // operands.
26427         if (CmpSz > VT.getSizeInBits())
26428           Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
26429         else if (CmpSz < VT.getSizeInBits())
26430           Neg = DAG.getNode(ISD::AND, DL, VT,
26431               DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
26432               DAG.getConstant(1, DL, VT));
26433         else
26434           Neg = CmpOp0;
26435         SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
26436                                    Neg); // -(and (x, 0x1))
26437         SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
26438         return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
26439       }
26440     } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
26441                Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
26442                ((CondCode == X86::COND_S) ||                    // smin(x, 0)
26443                 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
26444       // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26445       //
26446       // If the comparison is testing for a positive value, we have to invert
26447       // the sign bit mask, so only do that transform if the target has a
26448       // bitwise 'and not' instruction (the invert is free).
26449       // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
26450       unsigned ShCt = VT.getSizeInBits() - 1;
26451       SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
26452       SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
26453       if (CondCode == X86::COND_G)
26454         Shift = DAG.getNOT(DL, Shift, VT);
26455       return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
26456     }
26457   }
26458
26459   // Look past (and (setcc_carry (cmp ...)), 1).
26460   if (Cond.getOpcode() == ISD::AND &&
26461       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
26462       isOneConstant(Cond.getOperand(1)))
26463     Cond = Cond.getOperand(0);
26464
26465   // If condition flag is set by a X86ISD::CMP, then use it as the condition
26466   // setting operand in place of the X86ISD::SETCC.
26467   unsigned CondOpcode = Cond.getOpcode();
26468   if (CondOpcode == X86ISD::SETCC ||
26469       CondOpcode == X86ISD::SETCC_CARRY) {
26470     CC = Cond.getOperand(0);
26471
26472     SDValue Cmp = Cond.getOperand(1);
26473     bool IllegalFPCMov = false;
26474     if (VT.isFloatingPoint() && !VT.isVector() &&
26475         !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV())  // FPStack?
26476       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
26477
26478     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
26479         Cmp.getOpcode() == X86ISD::BT) { // FIXME
26480       Cond = Cmp;
26481       AddTest = false;
26482     }
26483   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
26484              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
26485              CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
          // The condition is the overflow result of an arithmetic-with-overflow
          // node: build the flag-setting X86 node and use its flags directly.
26486     SDValue Value;
26487     X86::CondCode X86Cond;
26488     std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
26489
26490     CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
26491     AddTest = false;
26492   }
26493
26494   if (AddTest) {
26495     // Look past the truncate if the high bits are known zero.
26496     if (isTruncWithZeroHighBitsInput(Cond, DAG))
26497       Cond = Cond.getOperand(0);
26498
26499     // We know the result of AND is compared against zero. Try to match
26500     // it to BT.
26501     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
26502       X86::CondCode X86CondCode;
26503       if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
26504         CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
26505         Cond = BT;
26506         AddTest = false;
26507       }
26508     }
26509   }
26510
        // No reusable flag producer was found: test the condition value against
        // zero and select on "not equal".
26511   if (AddTest) {
26512     CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
26513     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
26514   }
26515
        // Here setcc_carry denotes SETCC_CARRY(COND_B) on the flags of (a - b),
        // i.e. the -1/0 mask derived from the carry flag as in the sbb pattern
        // earlier in this function, so it is all-ones exactly when a < b
        // (unsigned).
26516   // a <  b ? -1 :  0 -> RES = setcc_carry
26517   // a <  b ?  0 : -1 -> RES = ~setcc_carry
26518   // a >= b ? -1 :  0 -> RES = ~setcc_carry
26519   // a >= b ?  0 : -1 -> RES = setcc_carry
26520   if (Cond.getOpcode() == X86ISD::SUB) {
26521     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
26522
26523     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
26524         (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26525         (isNullConstant(Op1) || isNullConstant(Op2))) {
26526       SDValue Res =
26527           DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
26528                       DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
            // Invert unless the all-ones value is the one chosen when a < b.
26529       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
26530         return DAG.getNOT(DL, Res, Res.getValueType());
26531       return Res;
26532     }
26533   }
26534
26535   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
26536   // widen the cmov and push the truncate through. This avoids introducing a new
26537   // branch during isel and doesn't add any extensions.
26538   if (Op.getValueType() == MVT::i8 &&
26539       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
26540     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
26541     if (T1.getValueType() == T2.getValueType() &&
26542         // Exclude CopyFromReg to avoid partial register stalls.
26543         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
26544       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
26545                                  CC, Cond);
26546       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26547     }
26548   }
26549
26550   // Or finally, promote i8 cmovs if we have CMOV,
26551   //                 or i16 cmovs if it won't prevent folding a load.
26552   // FIXME: we should not limit promotion of i8 case to only when the CMOV is
26553   //        legal, but EmitLoweredSelect() can not deal with these extensions
26554   //        being inserted between two CMOV's. (in i16 case too TBN)
26555   //        https://bugs.llvm.org/show_bug.cgi?id=40974
26556   if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
26557       (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
26558        !X86::mayFoldLoad(Op2, Subtarget))) {
26559     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
26560     Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
26561     SDValue Ops[] = { Op2, Op1, CC, Cond };
26562     SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
26563     return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26564   }
26565
26566   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
26567   // condition is true.
26568   SDValue Ops[] = { Op2, Op1, CC, Cond };
26569   return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
26570 }
26571
26572 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
26573                                      const X86Subtarget &Subtarget,
26574                                      SelectionDAG &DAG) {
26575   MVT VT = Op->getSimpleValueType(0);
26576   SDValue In = Op->getOperand(0);
26577   MVT InVT = In.getSimpleValueType();
26578   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
26579   MVT VTElt = VT.getVectorElementType();
26580   SDLoc dl(Op);
26581
26582   unsigned NumElts = VT.getVectorNumElements();
26583
26584   // Extend VT if the scalar type is i8/i16 and BWI is not supported.
26585   MVT ExtVT = VT;
26586   if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
26587     // If v16i32 is to be avoided, we'll need to split and concatenate.
26588     if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
26589       return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
26590
26591     ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
26592   }
26593
26594   // Widen to 512-bits if VLX is not supported.
26595   MVT WideVT = ExtVT;
26596   if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
26597     NumElts *= 512 / ExtVT.getSizeInBits();
26598     InVT = MVT::getVectorVT(MVT::i1, NumElts);
26599     In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
26600                      In, DAG.getIntPtrConstant(0, dl));
26601     WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
26602   }
26603
26604   SDValue V;
26605   MVT WideEltVT = WideVT.getVectorElementType();
26606   if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
26607       (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
26608     V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
26609   } else {
26610     SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
26611     SDValue Zero = DAG.getConstant(0, dl, WideVT);
26612     V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
26613   }
26614
26615   // Truncate if we had to extend i16/i8 above.
26616   if (VT != ExtVT) {
26617     WideVT = MVT::getVectorVT(VTElt, NumElts);
26618     V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
26619   }
26620
26621   // Extract back to 128/256-bit if we widened.
26622   if (WideVT != VT)
26623     V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
26624                     DAG.getIntPtrConstant(0, dl));
26625
26626   return V;
26627 }
26628
26629 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26630                                SelectionDAG &DAG) {
26631   SDValue In = Op->getOperand(0);
26632   MVT InVT = In.getSimpleValueType();
26633
26634   if (InVT.getVectorElementType() == MVT::i1)
26635     return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26636
26637   assert(Subtarget.hasAVX() && "Expected AVX support");
26638   return LowerAVXExtend(Op, DAG, Subtarget);
26639 }
26640
26641 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
26642 // For sign extend this needs to handle all vector sizes and SSE4.1 and
26643 // non-SSE4.1 targets. For zero extend this should only handle inputs of
26644 // MVT::v64i8 when BWI is not supported, but AVX512 is.
26645 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
26646                                         const X86Subtarget &Subtarget,
26647                                         SelectionDAG &DAG) {
26648   SDValue In = Op->getOperand(0);
26649   MVT VT = Op->getSimpleValueType(0);
26650   MVT InVT = In.getSimpleValueType();
26651
26652   MVT SVT = VT.getVectorElementType();
26653   MVT InSVT = InVT.getVectorElementType();
26654   assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
26655
26656   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
26657     return SDValue();
26658   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
26659     return SDValue();
26660   if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
26661       !(VT.is256BitVector() && Subtarget.hasAVX()) &&
26662       !(VT.is512BitVector() && Subtarget.hasAVX512()))
26663     return SDValue();
26664
26665   SDLoc dl(Op);
26666   unsigned Opc = Op.getOpcode();
26667   unsigned NumElts = VT.getVectorNumElements();
26668
26669   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
26670   // For 512-bit vectors, we need 128-bits or 256-bits.
26671   if (InVT.getSizeInBits() > 128) {
26672     // Input needs to be at least the same number of elements as output, and
26673     // at least 128-bits.
26674     int InSize = InSVT.getSizeInBits() * NumElts;
26675     In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
26676     InVT = In.getSimpleValueType();
26677   }
26678
26679   // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
26680   // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
26681   // need to be handled here for 256/512-bit results.
26682   if (Subtarget.hasInt256()) {
26683     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
26684
26685     if (InVT.getVectorNumElements() != NumElts)
26686       return DAG.getNode(Op.getOpcode(), dl, VT, In);
26687
26688     // FIXME: Apparently we create inreg operations that could be regular
26689     // extends.
26690     unsigned ExtOpc =
26691         Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
26692                                              : ISD::ZERO_EXTEND;
26693     return DAG.getNode(ExtOpc, dl, VT, In);
26694   }
26695
26696   // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
26697   if (Subtarget.hasAVX()) {
26698     assert(VT.is256BitVector() && "256-bit vector expected");
26699     MVT HalfVT = VT.getHalfNumVectorElementsVT();
26700     int HalfNumElts = HalfVT.getVectorNumElements();
26701
26702     unsigned NumSrcElts = InVT.getVectorNumElements();
26703     SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
26704     for (int i = 0; i != HalfNumElts; ++i)
26705       HiMask[i] = HalfNumElts + i;
26706
26707     SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
26708     SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
26709     Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
26710     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
26711   }
26712
26713   // We should only get here for sign extend.
26714   assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
26715   assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
26716
26717   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
26718   SDValue Curr = In;
26719   SDValue SignExt = Curr;
26720
26721   // As SRAI is only available on i16/i32 types, we expand only up to i32
26722   // and handle i64 separately.
26723   if (InVT != MVT::v4i32) {
26724     MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
26725
26726     unsigned DestWidth = DestVT.getScalarSizeInBits();
26727     unsigned Scale = DestWidth / InSVT.getSizeInBits();
26728
26729     unsigned InNumElts = InVT.getVectorNumElements();
26730     unsigned DestElts = DestVT.getVectorNumElements();
26731
26732     // Build a shuffle mask that takes each input element and places it in the
26733     // MSBs of the new element size.
26734     SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
26735     for (unsigned i = 0; i != DestElts; ++i)
26736       Mask[i * Scale + (Scale - 1)] = i;
26737
26738     Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
26739     Curr = DAG.getBitcast(DestVT, Curr);
26740
26741     unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
26742     SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
26743                           DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
26744   }
26745
26746   if (VT == MVT::v2i64) {
26747     assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
26748     SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
26749     SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
26750     SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
26751     SignExt = DAG.getBitcast(VT, SignExt);
26752   }
26753
26754   return SignExt;
26755 }
26756
26757 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26758                                 SelectionDAG &DAG) {
26759   MVT VT = Op->getSimpleValueType(0);
26760   SDValue In = Op->getOperand(0);
26761   MVT InVT = In.getSimpleValueType();
26762   SDLoc dl(Op);
26763
26764   if (InVT.getVectorElementType() == MVT::i1)
26765     return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26766
26767   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
26768   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
26769          "Expected same number of elements");
26770   assert((VT.getVectorElementType() == MVT::i16 ||
26771           VT.getVectorElementType() == MVT::i32 ||
26772           VT.getVectorElementType() == MVT::i64) &&
26773          "Unexpected element type");
26774   assert((InVT.getVectorElementType() == MVT::i8 ||
26775           InVT.getVectorElementType() == MVT::i16 ||
26776           InVT.getVectorElementType() == MVT::i32) &&
26777          "Unexpected element type");
26778
26779   if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
26780     assert(InVT == MVT::v32i8 && "Unexpected VT!");
26781     return splitVectorIntUnary(Op, DAG);
26782   }
26783
26784   if (Subtarget.hasInt256())
26785     return Op;
26786
26787   // Optimize vectors in AVX mode
26788   // Sign extend  v8i16 to v8i32 and
26789   //              v4i32 to v4i64
26790   //
26791   // Divide input vector into two parts
26792   // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
26793   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
26794   // concat the vectors to original VT
26795   MVT HalfVT = VT.getHalfNumVectorElementsVT();
26796   SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
26797
26798   unsigned NumElems = InVT.getVectorNumElements();
26799   SmallVector<int,8> ShufMask(NumElems, -1);
26800   for (unsigned i = 0; i != NumElems/2; ++i)
26801     ShufMask[i] = i + NumElems/2;
26802
26803   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
26804   OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
26805
26806   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
26807 }
26808
26809 /// Change a vector store into a pair of half-size vector stores.
26810 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
26811   SDValue StoredVal = Store->getValue();
26812   assert((StoredVal.getValueType().is256BitVector() ||
26813           StoredVal.getValueType().is512BitVector()) &&
26814          "Expecting 256/512-bit op");
26815
26816   // Splitting volatile memory ops is not allowed unless the operation was not
26817   // legal to begin with. Assume the input store is legal (this transform is
26818   // only used for targets with AVX). Note: It is possible that we have an
26819   // illegal type like v2i128, and so we could allow splitting a volatile store
26820   // in that case if that is important.
26821   if (!Store->isSimple())
26822     return SDValue();
26823
26824   SDLoc DL(Store);
26825   SDValue Value0, Value1;
26826   std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
26827   unsigned HalfOffset = Value0.getValueType().getStoreSize();
26828   SDValue Ptr0 = Store->getBasePtr();
26829   SDValue Ptr1 =
26830       DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
26831   SDValue Ch0 =
26832       DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
26833                    Store->getOriginalAlign(),
26834                    Store->getMemOperand()->getFlags());
26835   SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
26836                              Store->getPointerInfo().getWithOffset(HalfOffset),
26837                              Store->getOriginalAlign(),
26838                              Store->getMemOperand()->getFlags());
26839   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
26840 }
26841
26842 /// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
26843 /// type.
26844 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
26845                                     SelectionDAG &DAG) {
26846   SDValue StoredVal = Store->getValue();
26847   assert(StoreVT.is128BitVector() &&
26848          StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
26849   StoredVal = DAG.getBitcast(StoreVT, StoredVal);
26850
26851   // Splitting volatile memory ops is not allowed unless the operation was not
26852   // legal to begin with. We are assuming the input op is legal (this transform
26853   // is only used for targets with AVX).
26854   if (!Store->isSimple())
26855     return SDValue();
26856
26857   MVT StoreSVT = StoreVT.getScalarType();
26858   unsigned NumElems = StoreVT.getVectorNumElements();
26859   unsigned ScalarSize = StoreSVT.getStoreSize();
26860
26861   SDLoc DL(Store);
26862   SmallVector<SDValue, 4> Stores;
26863   for (unsigned i = 0; i != NumElems; ++i) {
26864     unsigned Offset = i * ScalarSize;
26865     SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
26866                                            TypeSize::Fixed(Offset), DL);
26867     SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
26868                               DAG.getIntPtrConstant(i, DL));
26869     SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
26870                               Store->getPointerInfo().getWithOffset(Offset),
26871                               Store->getOriginalAlign(),
26872                               Store->getMemOperand()->getFlags());
26873     Stores.push_back(Ch);
26874   }
26875   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
26876 }
26877
26878 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
26879                           SelectionDAG &DAG) {
26880   StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
26881   SDLoc dl(St);
26882   SDValue StoredVal = St->getValue();
26883
26884   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
26885   if (StoredVal.getValueType().isVector() &&
26886       StoredVal.getValueType().getVectorElementType() == MVT::i1) {
26887     unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
26888     assert(NumElts <= 8 && "Unexpected VT");
26889     assert(!St->isTruncatingStore() && "Expected non-truncating store");
26890     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26891            "Expected AVX512F without AVX512DQI");
26892
26893     // We must pad with zeros to ensure we store zeroes to any unused bits.
26894     StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26895                             DAG.getUNDEF(MVT::v16i1), StoredVal,
26896                             DAG.getIntPtrConstant(0, dl));
26897     StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
26898     StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
26899     // Make sure we store zeros in the extra bits.
26900     if (NumElts < 8)
26901       StoredVal = DAG.getZeroExtendInReg(
26902           StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
26903
26904     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26905                         St->getPointerInfo(), St->getOriginalAlign(),
26906                         St->getMemOperand()->getFlags());
26907   }
26908
26909   if (St->isTruncatingStore())
26910     return SDValue();
26911
26912   // If this is a 256-bit store of concatenated ops, we are better off splitting
26913   // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
26914   // and each half can execute independently. Some cores would split the op into
26915   // halves anyway, so the concat (vinsertf128) is purely an extra op.
26916   MVT StoreVT = StoredVal.getSimpleValueType();
26917   if (StoreVT.is256BitVector() ||
26918       ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
26919        !Subtarget.hasBWI())) {
26920     if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal.getNode(), DAG))
26921       return splitVectorStore(St, DAG);
26922     return SDValue();
26923   }
26924
26925   if (StoreVT.is32BitVector())
26926     return SDValue();
26927
26928   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26929   assert(StoreVT.is64BitVector() && "Unexpected VT");
26930   assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
26931              TargetLowering::TypeWidenVector &&
26932          "Unexpected type action!");
26933
26934   EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
26935   StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
26936                           DAG.getUNDEF(StoreVT));
26937
26938   if (Subtarget.hasSSE2()) {
26939     // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
26940     // and store it.
26941     MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
26942     MVT CastVT = MVT::getVectorVT(StVT, 2);
26943     StoredVal = DAG.getBitcast(CastVT, StoredVal);
26944     StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
26945                             DAG.getIntPtrConstant(0, dl));
26946
26947     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26948                         St->getPointerInfo(), St->getOriginalAlign(),
26949                         St->getMemOperand()->getFlags());
26950   }
26951   assert(Subtarget.hasSSE1() && "Expected SSE");
26952   SDVTList Tys = DAG.getVTList(MVT::Other);
26953   SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
26954   return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
26955                                  St->getMemOperand());
26956 }
26957
26958 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
26959 // may emit an illegal shuffle but the expansion is still better than scalar
26960 // code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
26961 // we'll emit a shuffle and a arithmetic shift.
26962 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
26963 // TODO: It is possible to support ZExt by zeroing the undef values during
26964 // the shuffle phase or after the shuffle.
26965 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
26966                                  SelectionDAG &DAG) {
26967   MVT RegVT = Op.getSimpleValueType();
26968   assert(RegVT.isVector() && "We only custom lower vector loads.");
26969   assert(RegVT.isInteger() &&
26970          "We only custom lower integer vector loads.");
26971
26972   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
26973   SDLoc dl(Ld);
26974
26975   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
26976   if (RegVT.getVectorElementType() == MVT::i1) {
26977     assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
26978     assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
26979     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26980            "Expected AVX512F without AVX512DQI");
26981
26982     SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
26983                                 Ld->getPointerInfo(), Ld->getOriginalAlign(),
26984                                 Ld->getMemOperand()->getFlags());
26985
26986     // Replace chain users with the new chain.
26987     assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
26988
26989     SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
26990     Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
26991                       DAG.getBitcast(MVT::v16i1, Val),
26992                       DAG.getIntPtrConstant(0, dl));
26993     return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
26994   }
26995
26996   return SDValue();
26997 }
26998
26999 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
27000 /// each of which has no other use apart from the AND / OR.
27001 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
27002   Opc = Op.getOpcode();
27003   if (Opc != ISD::OR && Opc != ISD::AND)
27004     return false;
27005   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
27006           Op.getOperand(0).hasOneUse() &&
27007           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
27008           Op.getOperand(1).hasOneUse());
27009 }
27010
27011 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
27012   SDValue Chain = Op.getOperand(0);
27013   SDValue Cond  = Op.getOperand(1);
27014   SDValue Dest  = Op.getOperand(2);
27015   SDLoc dl(Op);
27016
27017   // Bail out when we don't have native compare instructions.
27018   if (Cond.getOpcode() == ISD::SETCC &&
27019       Cond.getOperand(0).getValueType() != MVT::f128 &&
27020       !isSoftFP16(Cond.getOperand(0).getValueType())) {
27021     SDValue LHS = Cond.getOperand(0);
27022     SDValue RHS = Cond.getOperand(1);
27023     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
27024
27025     // Special case for
27026     // setcc([su]{add,sub,mul}o == 0)
27027     // setcc([su]{add,sub,mul}o != 1)
27028     if (ISD::isOverflowIntrOpRes(LHS) &&
27029         (CC == ISD::SETEQ || CC == ISD::SETNE) &&
27030         (isNullConstant(RHS) || isOneConstant(RHS))) {
27031       SDValue Value, Overflow;
27032       X86::CondCode X86Cond;
27033       std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
27034
27035       if ((CC == ISD::SETEQ) == isNullConstant(RHS))
27036         X86Cond = X86::GetOppositeBranchCondition(X86Cond);
27037
27038       SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
27039       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
27040                          Overflow);
27041     }
27042
27043     if (LHS.getSimpleValueType().isInteger()) {
27044       SDValue CCVal;
27045       SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
27046       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
27047                          EFLAGS);
27048     }
27049
27050     if (CC == ISD::SETOEQ) {
27051       // For FCMP_OEQ, we can emit
27052       // two branches instead of an explicit AND instruction with a
27053       // separate test. However, we only do this if this block doesn't
27054       // have a fall-through edge, because this requires an explicit
27055       // jmp when the condition is false.
27056       if (Op.getNode()->hasOneUse()) {
27057         SDNode *User = *Op.getNode()->use_begin();
27058         // Look for an unconditional branch following this conditional branch.
27059         // We need this because we need to reverse the successors in order
27060         // to implement FCMP_OEQ.
27061         if (User->getOpcode() == ISD::BR) {
27062           SDValue FalseBB = User->getOperand(1);
27063           SDNode *NewBR =
27064             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
27065           assert(NewBR == User);
27066           (void)NewBR;
27067           Dest = FalseBB;
27068
27069           SDValue Cmp =
27070               DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
27071           SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
27072           Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
27073                               CCVal, Cmp);
27074           CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
27075           return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
27076                              Cmp);
27077         }
27078       }
27079     } else if (CC == ISD::SETUNE) {
27080       // For FCMP_UNE, we can emit
27081       // two branches instead of an explicit OR instruction with a
27082       // separate test.
27083       SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
27084       SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
27085       Chain =
27086           DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
27087       CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
27088       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
27089                          Cmp);
27090     } else {
27091       X86::CondCode X86Cond =
27092           TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
27093       SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
27094       SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
27095       return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
27096                          Cmp);
27097     }
27098   }
27099
27100   if (ISD::isOverflowIntrOpRes(Cond)) {
27101     SDValue Value, Overflow;
27102     X86::CondCode X86Cond;
27103     std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
27104
27105     SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
27106     return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
27107                        Overflow);
27108   }
27109
27110   // Look past the truncate if the high bits are known zero.
27111   if (isTruncWithZeroHighBitsInput(Cond, DAG))
27112     Cond = Cond.getOperand(0);
27113
27114   EVT CondVT = Cond.getValueType();
27115
27116   // Add an AND with 1 if we don't already have one.
27117   if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
27118     Cond =
27119         DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
27120
27121   SDValue LHS = Cond;
27122   SDValue RHS = DAG.getConstant(0, dl, CondVT);
27123
27124   SDValue CCVal;
27125   SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
27126   return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
27127                      EFLAGS);
27128 }
27129
27130 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
27131 // Calls to _alloca are needed to probe the stack when allocating more than 4k
27132 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
27133 // that the guard pages used by the OS virtual memory manager are allocated in
27134 // correct sequence.
27135 SDValue
27136 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
27137                                            SelectionDAG &DAG) const {
27138   MachineFunction &MF = DAG.getMachineFunction();
27139   bool SplitStack = MF.shouldSplitStack();
27140   bool EmitStackProbeCall = hasStackProbeSymbol(MF);
27141   bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
27142                SplitStack || EmitStackProbeCall;
27143   SDLoc dl(Op);
27144
27145   // Get the inputs.
27146   SDNode *Node = Op.getNode();
27147   SDValue Chain = Op.getOperand(0);
27148   SDValue Size  = Op.getOperand(1);
27149   MaybeAlign Alignment(Op.getConstantOperandVal(2));
27150   EVT VT = Node->getValueType(0);
27151
27152   // Chain the dynamic stack allocation so that it doesn't modify the stack
27153   // pointer when other instructions are using the stack.
27154   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
27155
27156   bool Is64Bit = Subtarget.is64Bit();
27157   MVT SPTy = getPointerTy(DAG.getDataLayout());
27158
27159   SDValue Result;
27160   if (!Lower) {
27161     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27162     Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
27163     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
27164                     " not tell us which reg is the stack pointer!");
27165
27166     const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27167     const Align StackAlign = TFI.getStackAlign();
27168     if (hasInlineStackProbe(MF)) {
27169       MachineRegisterInfo &MRI = MF.getRegInfo();
27170
27171       const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27172       Register Vreg = MRI.createVirtualRegister(AddrRegClass);
27173       Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
27174       Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
27175                            DAG.getRegister(Vreg, SPTy));
27176     } else {
27177       SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
27178       Chain = SP.getValue(1);
27179       Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
27180     }
27181     if (Alignment && *Alignment > StackAlign)
27182       Result =
27183           DAG.getNode(ISD::AND, dl, VT, Result,
27184                       DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
27185     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
27186   } else if (SplitStack) {
27187     MachineRegisterInfo &MRI = MF.getRegInfo();
27188
27189     if (Is64Bit) {
27190       // The 64 bit implementation of segmented stacks needs to clobber both r10
27191       // r11. This makes it impossible to use it along with nested parameters.
27192       const Function &F = MF.getFunction();
27193       for (const auto &A : F.args()) {
27194         if (A.hasNestAttr())
27195           report_fatal_error("Cannot use segmented stacks with functions that "
27196                              "have nested arguments.");
27197       }
27198     }
27199
27200     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27201     Register Vreg = MRI.createVirtualRegister(AddrRegClass);
27202     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
27203     Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
27204                                 DAG.getRegister(Vreg, SPTy));
27205   } else {
27206     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
27207     Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
27208     MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
27209
27210     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27211     Register SPReg = RegInfo->getStackRegister();
27212     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
27213     Chain = SP.getValue(1);
27214
27215     if (Alignment) {
27216       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
27217                        DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
27218       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
27219     }
27220
27221     Result = SP;
27222   }
27223
27224   Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
27225
27226   SDValue Ops[2] = {Result, Chain};
27227   return DAG.getMergeValues(Ops, dl);
27228 }
27229
27230 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
27231   MachineFunction &MF = DAG.getMachineFunction();
27232   auto PtrVT = getPointerTy(MF.getDataLayout());
27233   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27234
27235   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27236   SDLoc DL(Op);
27237
27238   if (!Subtarget.is64Bit() ||
27239       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
27240     // vastart just stores the address of the VarArgsFrameIndex slot into the
27241     // memory location argument.
27242     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27243     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
27244                         MachinePointerInfo(SV));
27245   }
27246
27247   // __va_list_tag:
27248   //   gp_offset         (0 - 6 * 8)
27249   //   fp_offset         (48 - 48 + 8 * 16)
27250   //   overflow_arg_area (point to parameters coming in memory).
27251   //   reg_save_area
27252   SmallVector<SDValue, 8> MemOps;
27253   SDValue FIN = Op.getOperand(1);
27254   // Store gp_offset
27255   SDValue Store = DAG.getStore(
27256       Op.getOperand(0), DL,
27257       DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
27258       MachinePointerInfo(SV));
27259   MemOps.push_back(Store);
27260
27261   // Store fp_offset
27262   FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
27263   Store = DAG.getStore(
27264       Op.getOperand(0), DL,
27265       DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
27266       MachinePointerInfo(SV, 4));
27267   MemOps.push_back(Store);
27268
27269   // Store ptr to overflow_arg_area
27270   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
27271   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27272   Store =
27273       DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
27274   MemOps.push_back(Store);
27275
27276   // Store ptr to reg_save_area.
27277   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
27278       Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
27279   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
27280   Store = DAG.getStore(
27281       Op.getOperand(0), DL, RSFIN, FIN,
27282       MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
27283   MemOps.push_back(Store);
27284   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
27285 }
27286
27287 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
27288   assert(Subtarget.is64Bit() &&
27289          "LowerVAARG only handles 64-bit va_arg!");
27290   assert(Op.getNumOperands() == 4);
27291
27292   MachineFunction &MF = DAG.getMachineFunction();
27293   if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
27294     // The Win64 ABI uses char* instead of a structure.
27295     return DAG.expandVAArg(Op.getNode());
27296
27297   SDValue Chain = Op.getOperand(0);
27298   SDValue SrcPtr = Op.getOperand(1);
27299   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27300   unsigned Align = Op.getConstantOperandVal(3);
27301   SDLoc dl(Op);
27302
27303   EVT ArgVT = Op.getNode()->getValueType(0);
27304   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
27305   uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
27306   uint8_t ArgMode;
27307
27308   // Decide which area this value should be read from.
27309   // TODO: Implement the AMD64 ABI in its entirety. This simple
27310   // selection mechanism works only for the basic types.
27311   assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
27312   if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
27313     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
27314   } else {
27315     assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
27316            "Unhandled argument type in LowerVAARG");
27317     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
27318   }
27319
27320   if (ArgMode == 2) {
27321     // Make sure using fp_offset makes sense.
27322     assert(!Subtarget.useSoftFloat() &&
27323            !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
27324            Subtarget.hasSSE1());
27325   }
27326
27327   // Insert VAARG node into the DAG
27328   // VAARG returns two values: Variable Argument Address, Chain
27329   SDValue InstOps[] = {Chain, SrcPtr,
27330                        DAG.getTargetConstant(ArgSize, dl, MVT::i32),
27331                        DAG.getTargetConstant(ArgMode, dl, MVT::i8),
27332                        DAG.getTargetConstant(Align, dl, MVT::i32)};
27333   SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
27334   SDValue VAARG = DAG.getMemIntrinsicNode(
27335       Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
27336       VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
27337       /*Alignment=*/std::nullopt,
27338       MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
27339   Chain = VAARG.getValue(1);
27340
27341   // Load the next argument and return it
27342   return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
27343 }
27344
27345 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
27346                            SelectionDAG &DAG) {
27347   // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
27348   // where a va_list is still an i8*.
27349   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
27350   if (Subtarget.isCallingConvWin64(
27351         DAG.getMachineFunction().getFunction().getCallingConv()))
27352     // Probably a Win64 va_copy.
27353     return DAG.expandVACopy(Op.getNode());
27354
27355   SDValue Chain = Op.getOperand(0);
27356   SDValue DstPtr = Op.getOperand(1);
27357   SDValue SrcPtr = Op.getOperand(2);
27358   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
27359   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27360   SDLoc DL(Op);
27361
27362   return DAG.getMemcpy(
27363       Chain, DL, DstPtr, SrcPtr,
27364       DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
27365       Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
27366       false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
27367 }
27368
27369 // Helper to get immediate/variable SSE shift opcode from other shift opcodes.
27370 static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
27371   switch (Opc) {
27372   case ISD::SHL:
27373   case X86ISD::VSHL:
27374   case X86ISD::VSHLI:
27375     return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
27376   case ISD::SRL:
27377   case X86ISD::VSRL:
27378   case X86ISD::VSRLI:
27379     return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
27380   case ISD::SRA:
27381   case X86ISD::VSRA:
27382   case X86ISD::VSRAI:
27383     return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
27384   }
27385   llvm_unreachable("Unknown target vector shift node");
27386 }
27387
27388 /// Handle vector element shifts where the shift amount is a constant.
27389 /// Takes immediate version of shift as input.
27390 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
27391                                           SDValue SrcOp, uint64_t ShiftAmt,
27392                                           SelectionDAG &DAG) {
27393   MVT ElementType = VT.getVectorElementType();
27394
27395   // Bitcast the source vector to the output type, this is mainly necessary for
27396   // vXi8/vXi64 shifts.
27397   if (VT != SrcOp.getSimpleValueType())
27398     SrcOp = DAG.getBitcast(VT, SrcOp);
27399
27400   // Fold this packed shift into its first operand if ShiftAmt is 0.
27401   if (ShiftAmt == 0)
27402     return SrcOp;
27403
27404   // Check for ShiftAmt >= element width
27405   if (ShiftAmt >= ElementType.getSizeInBits()) {
27406     if (Opc == X86ISD::VSRAI)
27407       ShiftAmt = ElementType.getSizeInBits() - 1;
27408     else
27409       return DAG.getConstant(0, dl, VT);
27410   }
27411
27412   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
27413          && "Unknown target vector shift-by-constant node");
27414
27415   // Fold this packed vector shift into a build vector if SrcOp is a
27416   // vector of Constants or UNDEFs.
27417   if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
27418     unsigned ShiftOpc;
27419     switch (Opc) {
27420     default: llvm_unreachable("Unknown opcode!");
27421     case X86ISD::VSHLI:
27422       ShiftOpc = ISD::SHL;
27423       break;
27424     case X86ISD::VSRLI:
27425       ShiftOpc = ISD::SRL;
27426       break;
27427     case X86ISD::VSRAI:
27428       ShiftOpc = ISD::SRA;
27429       break;
27430     }
27431
27432     SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
27433     if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
27434       return C;
27435   }
27436
27437   return DAG.getNode(Opc, dl, VT, SrcOp,
27438                      DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
27439 }
27440
27441 /// Handle vector element shifts by a splat shift amount
27442 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
27443                                    SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
27444                                    const X86Subtarget &Subtarget,
27445                                    SelectionDAG &DAG) {
27446   MVT AmtVT = ShAmt.getSimpleValueType();
27447   assert(AmtVT.isVector() && "Vector shift type mismatch");
27448   assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
27449          "Illegal vector splat index");
27450
27451   // Move the splat element to the bottom element.
27452   if (ShAmtIdx != 0) {
27453     SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
27454     Mask[0] = ShAmtIdx;
27455     ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
27456   }
27457
27458   // Peek through any zext node if we can get back to a 128-bit source.
27459   if (AmtVT.getScalarSizeInBits() == 64 &&
27460       (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
27461        ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
27462       ShAmt.getOperand(0).getValueType().isSimple() &&
27463       ShAmt.getOperand(0).getValueType().is128BitVector()) {
27464     ShAmt = ShAmt.getOperand(0);
27465     AmtVT = ShAmt.getSimpleValueType();
27466   }
27467
27468   // See if we can mask off the upper elements using the existing source node.
27469   // The shift uses the entire lower 64-bits of the amount vector, so no need to
27470   // do this for vXi64 types.
27471   bool IsMasked = false;
27472   if (AmtVT.getScalarSizeInBits() < 64) {
27473     if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
27474         ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
27475       // If the shift amount has come from a scalar, then zero-extend the scalar
27476       // before moving to the vector.
27477       ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
27478       ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
27479       ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
27480       AmtVT = MVT::v4i32;
27481       IsMasked = true;
27482     } else if (ShAmt.getOpcode() == ISD::AND) {
27483       // See if the shift amount is already masked (e.g. for rotation modulo),
27484       // then we can zero-extend it by setting all the other mask elements to
27485       // zero.
27486       SmallVector<SDValue> MaskElts(
27487           AmtVT.getVectorNumElements(),
27488           DAG.getConstant(0, dl, AmtVT.getScalarType()));
27489       MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
27490       SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
27491       if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
27492                                              {ShAmt.getOperand(1), Mask}))) {
27493         ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
27494         IsMasked = true;
27495       }
27496     }
27497   }
27498
27499   // Extract if the shift amount vector is larger than 128-bits.
27500   if (AmtVT.getSizeInBits() > 128) {
27501     ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
27502     AmtVT = ShAmt.getSimpleValueType();
27503   }
27504
27505   // Zero-extend bottom element to v2i64 vector type, either by extension or
27506   // shuffle masking.
27507   if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
27508     if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
27509                                 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
27510       ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
27511     } else if (Subtarget.hasSSE41()) {
27512       ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
27513                           MVT::v2i64, ShAmt);
27514     } else {
27515       SDValue ByteShift = DAG.getTargetConstant(
27516           (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
27517       ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
27518       ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27519                           ByteShift);
27520       ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27521                           ByteShift);
27522     }
27523   }
27524
27525   // Change opcode to non-immediate version.
27526   Opc = getTargetVShiftUniformOpcode(Opc, true);
27527
27528   // The return type has to be a 128-bit type with the same element
27529   // type as the input type.
27530   MVT EltVT = VT.getVectorElementType();
27531   MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
27532
27533   ShAmt = DAG.getBitcast(ShVT, ShAmt);
27534   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
27535 }
27536
27537 /// Return Mask with the necessary casting or extending
27538 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
27539 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
27540                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
27541                            const SDLoc &dl) {
27542
27543   if (isAllOnesConstant(Mask))
27544     return DAG.getConstant(1, dl, MaskVT);
27545   if (X86::isZeroNode(Mask))
27546     return DAG.getConstant(0, dl, MaskVT);
27547
27548   assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
27549
27550   if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
27551     assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
27552     assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
27553     // In case 32bit mode, bitcast i64 is illegal, extend/split it.
27554     SDValue Lo, Hi;
27555     std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
27556     Lo = DAG.getBitcast(MVT::v32i1, Lo);
27557     Hi = DAG.getBitcast(MVT::v32i1, Hi);
27558     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
27559   } else {
27560     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
27561                                      Mask.getSimpleValueType().getSizeInBits());
27562     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
27563     // are extracted by EXTRACT_SUBVECTOR.
27564     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
27565                        DAG.getBitcast(BitcastVT, Mask),
27566                        DAG.getIntPtrConstant(0, dl));
27567   }
27568 }
27569
27570 /// Return (and \p Op, \p Mask) for compare instructions or
27571 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
27572 /// necessary casting or extending for \p Mask when lowering masking intrinsics
27573 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
27574                                     SDValue PreservedSrc,
27575                                     const X86Subtarget &Subtarget,
27576                                     SelectionDAG &DAG) {
27577   MVT VT = Op.getSimpleValueType();
27578   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
27579   unsigned OpcodeSelect = ISD::VSELECT;
27580   SDLoc dl(Op);
27581
27582   if (isAllOnesConstant(Mask))
27583     return Op;
27584
27585   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27586
27587   if (PreservedSrc.isUndef())
27588     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27589   return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
27590 }
27591
27592 /// Creates an SDNode for a predicated scalar operation.
27593 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
27594 /// The mask is coming as MVT::i8 and it should be transformed
27595 /// to MVT::v1i1 while lowering masking intrinsics.
27596 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
27597 /// "X86select" instead of "vselect". We just can't create the "vselect" node
27598 /// for a scalar instruction.
27599 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
27600                                     SDValue PreservedSrc,
27601                                     const X86Subtarget &Subtarget,
27602                                     SelectionDAG &DAG) {
27603
27604   if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
27605     if (MaskConst->getZExtValue() & 0x1)
27606       return Op;
27607
27608   MVT VT = Op.getSimpleValueType();
27609   SDLoc dl(Op);
27610
27611   assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
27612   SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
27613                               DAG.getBitcast(MVT::v8i1, Mask),
27614                               DAG.getIntPtrConstant(0, dl));
27615   if (Op.getOpcode() == X86ISD::FSETCCM ||
27616       Op.getOpcode() == X86ISD::FSETCCM_SAE ||
27617       Op.getOpcode() == X86ISD::VFPCLASSS)
27618     return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
27619
27620   if (PreservedSrc.isUndef())
27621     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27622   return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
27623 }
27624
27625 static int getSEHRegistrationNodeSize(const Function *Fn) {
27626   if (!Fn->hasPersonalityFn())
27627     report_fatal_error(
27628         "querying registration node size for function without personality");
27629   // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
27630   // WinEHStatePass for the full struct definition.
27631   switch (classifyEHPersonality(Fn->getPersonalityFn())) {
27632   case EHPersonality::MSVC_X86SEH: return 24;
27633   case EHPersonality::MSVC_CXX: return 16;
27634   default: break;
27635   }
27636   report_fatal_error(
27637       "can only recover FP for 32-bit MSVC EH personality functions");
27638 }
27639
27640 /// When the MSVC runtime transfers control to us, either to an outlined
27641 /// function or when returning to a parent frame after catching an exception, we
27642 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
27643 /// Here's the math:
27644 ///   RegNodeBase = EntryEBP - RegNodeSize
27645 ///   ParentFP = RegNodeBase - ParentFrameOffset
27646 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
27647 /// subtracting the offset (negative on x86) takes us back to the parent FP.
27648 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
27649                                    SDValue EntryEBP) {
27650   MachineFunction &MF = DAG.getMachineFunction();
27651   SDLoc dl;
27652
27653   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27654   MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27655
27656   // It's possible that the parent function no longer has a personality function
27657   // if the exceptional code was optimized away, in which case we just return
27658   // the incoming EBP.
27659   if (!Fn->hasPersonalityFn())
27660     return EntryEBP;
27661
27662   // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
27663   // registration, or the .set_setframe offset.
27664   MCSymbol *OffsetSym =
27665       MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
27666           GlobalValue::dropLLVMManglingEscape(Fn->getName()));
27667   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
27668   SDValue ParentFrameOffset =
27669       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
27670
27671   // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
27672   // prologue to RBP in the parent function.
27673   const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
27674   if (Subtarget.is64Bit())
27675     return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
27676
27677   int RegNodeSize = getSEHRegistrationNodeSize(Fn);
27678   // RegNodeBase = EntryEBP - RegNodeSize
27679   // ParentFP = RegNodeBase - ParentFrameOffset
27680   SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
27681                                     DAG.getConstant(RegNodeSize, dl, PtrVT));
27682   return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
27683 }
27684
27685 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
27686                                                    SelectionDAG &DAG) const {
27687   // Helper to detect if the operand is CUR_DIRECTION rounding mode.
27688   auto isRoundModeCurDirection = [](SDValue Rnd) {
27689     if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
27690       return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
27691
27692     return false;
27693   };
27694   auto isRoundModeSAE = [](SDValue Rnd) {
27695     if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27696       unsigned RC = C->getZExtValue();
27697       if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27698         // Clear the NO_EXC bit and check remaining bits.
27699         RC ^= X86::STATIC_ROUNDING::NO_EXC;
27700         // As a convenience we allow no other bits or explicitly
27701         // current direction.
27702         return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
27703       }
27704     }
27705
27706     return false;
27707   };
27708   auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
27709     if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27710       RC = C->getZExtValue();
27711       if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27712         // Clear the NO_EXC bit and check remaining bits.
27713         RC ^= X86::STATIC_ROUNDING::NO_EXC;
27714         return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
27715                RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
27716                RC == X86::STATIC_ROUNDING::TO_POS_INF ||
27717                RC == X86::STATIC_ROUNDING::TO_ZERO;
27718       }
27719     }
27720
27721     return false;
27722   };
27723
27724   SDLoc dl(Op);
27725   unsigned IntNo = Op.getConstantOperandVal(0);
27726   MVT VT = Op.getSimpleValueType();
27727   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
27728
27729   // Propagate flags from original node to transformed node(s).
27730   SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
27731
27732   if (IntrData) {
27733     switch(IntrData->Type) {
27734     case INTR_TYPE_1OP: {
27735       // We specify 2 possible opcodes for intrinsics with rounding modes.
27736       // First, we check if the intrinsic may have non-default rounding mode,
27737       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27738       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27739       if (IntrWithRoundingModeOpcode != 0) {
27740         SDValue Rnd = Op.getOperand(2);
27741         unsigned RC = 0;
27742         if (isRoundModeSAEToX(Rnd, RC))
27743           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27744                              Op.getOperand(1),
27745                              DAG.getTargetConstant(RC, dl, MVT::i32));
27746         if (!isRoundModeCurDirection(Rnd))
27747           return SDValue();
27748       }
27749       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27750                          Op.getOperand(1));
27751     }
27752     case INTR_TYPE_1OP_SAE: {
27753       SDValue Sae = Op.getOperand(2);
27754
27755       unsigned Opc;
27756       if (isRoundModeCurDirection(Sae))
27757         Opc = IntrData->Opc0;
27758       else if (isRoundModeSAE(Sae))
27759         Opc = IntrData->Opc1;
27760       else
27761         return SDValue();
27762
27763       return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
27764     }
27765     case INTR_TYPE_2OP: {
27766       SDValue Src2 = Op.getOperand(2);
27767
27768       // We specify 2 possible opcodes for intrinsics with rounding modes.
27769       // First, we check if the intrinsic may have non-default rounding mode,
27770       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27771       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27772       if (IntrWithRoundingModeOpcode != 0) {
27773         SDValue Rnd = Op.getOperand(3);
27774         unsigned RC = 0;
27775         if (isRoundModeSAEToX(Rnd, RC))
27776           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27777                              Op.getOperand(1), Src2,
27778                              DAG.getTargetConstant(RC, dl, MVT::i32));
27779         if (!isRoundModeCurDirection(Rnd))
27780           return SDValue();
27781       }
27782
27783       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27784                          Op.getOperand(1), Src2);
27785     }
27786     case INTR_TYPE_2OP_SAE: {
27787       SDValue Sae = Op.getOperand(3);
27788
27789       unsigned Opc;
27790       if (isRoundModeCurDirection(Sae))
27791         Opc = IntrData->Opc0;
27792       else if (isRoundModeSAE(Sae))
27793         Opc = IntrData->Opc1;
27794       else
27795         return SDValue();
27796
27797       return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
27798                          Op.getOperand(2));
27799     }
27800     case INTR_TYPE_3OP:
27801     case INTR_TYPE_3OP_IMM8: {
27802       SDValue Src1 = Op.getOperand(1);
27803       SDValue Src2 = Op.getOperand(2);
27804       SDValue Src3 = Op.getOperand(3);
27805
27806       if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
27807           Src3.getValueType() != MVT::i8) {
27808         Src3 = DAG.getTargetConstant(
27809             cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
27810       }
27811
27812       // We specify 2 possible opcodes for intrinsics with rounding modes.
27813       // First, we check if the intrinsic may have non-default rounding mode,
27814       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27815       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27816       if (IntrWithRoundingModeOpcode != 0) {
27817         SDValue Rnd = Op.getOperand(4);
27818         unsigned RC = 0;
27819         if (isRoundModeSAEToX(Rnd, RC))
27820           return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27821                              Src1, Src2, Src3,
27822                              DAG.getTargetConstant(RC, dl, MVT::i32));
27823         if (!isRoundModeCurDirection(Rnd))
27824           return SDValue();
27825       }
27826
27827       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27828                          {Src1, Src2, Src3});
27829     }
27830     case INTR_TYPE_4OP_IMM8: {
27831       assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
27832       SDValue Src4 = Op.getOperand(4);
27833       if (Src4.getValueType() != MVT::i8) {
27834         Src4 = DAG.getTargetConstant(
27835             cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
27836       }
27837
27838       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27839                          Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
27840                          Src4);
27841     }
27842     case INTR_TYPE_1OP_MASK: {
27843       SDValue Src = Op.getOperand(1);
27844       SDValue PassThru = Op.getOperand(2);
27845       SDValue Mask = Op.getOperand(3);
27846       // We add rounding mode to the Node when
27847       //   - RC Opcode is specified and
27848       //   - RC is not "current direction".
27849       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27850       if (IntrWithRoundingModeOpcode != 0) {
27851         SDValue Rnd = Op.getOperand(4);
27852         unsigned RC = 0;
27853         if (isRoundModeSAEToX(Rnd, RC))
27854           return getVectorMaskingNode(
27855               DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27856                           Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
27857               Mask, PassThru, Subtarget, DAG);
27858         if (!isRoundModeCurDirection(Rnd))
27859           return SDValue();
27860       }
27861       return getVectorMaskingNode(
27862           DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
27863           Subtarget, DAG);
27864     }
27865     case INTR_TYPE_1OP_MASK_SAE: {
27866       SDValue Src = Op.getOperand(1);
27867       SDValue PassThru = Op.getOperand(2);
27868       SDValue Mask = Op.getOperand(3);
27869       SDValue Rnd = Op.getOperand(4);
27870
27871       unsigned Opc;
27872       if (isRoundModeCurDirection(Rnd))
27873         Opc = IntrData->Opc0;
27874       else if (isRoundModeSAE(Rnd))
27875         Opc = IntrData->Opc1;
27876       else
27877         return SDValue();
27878
27879       return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
27880                                   Subtarget, DAG);
27881     }
27882     case INTR_TYPE_SCALAR_MASK: {
27883       SDValue Src1 = Op.getOperand(1);
27884       SDValue Src2 = Op.getOperand(2);
27885       SDValue passThru = Op.getOperand(3);
27886       SDValue Mask = Op.getOperand(4);
27887       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27888       // There are 2 kinds of intrinsics in this group:
27889       // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
27890       // (2) With rounding mode and sae - 7 operands.
27891       bool HasRounding = IntrWithRoundingModeOpcode != 0;
27892       if (Op.getNumOperands() == (5U + HasRounding)) {
27893         if (HasRounding) {
27894           SDValue Rnd = Op.getOperand(5);
27895           unsigned RC = 0;
27896           if (isRoundModeSAEToX(Rnd, RC))
27897             return getScalarMaskingNode(
27898                 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
27899                             DAG.getTargetConstant(RC, dl, MVT::i32)),
27900                 Mask, passThru, Subtarget, DAG);
27901           if (!isRoundModeCurDirection(Rnd))
27902             return SDValue();
27903         }
27904         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
27905                                                 Src2),
27906                                     Mask, passThru, Subtarget, DAG);
27907       }
27908
27909       assert(Op.getNumOperands() == (6U + HasRounding) &&
27910              "Unexpected intrinsic form");
27911       SDValue RoundingMode = Op.getOperand(5);
27912       unsigned Opc = IntrData->Opc0;
27913       if (HasRounding) {
27914         SDValue Sae = Op.getOperand(6);
27915         if (isRoundModeSAE(Sae))
27916           Opc = IntrWithRoundingModeOpcode;
27917         else if (!isRoundModeCurDirection(Sae))
27918           return SDValue();
27919       }
27920       return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
27921                                               Src2, RoundingMode),
27922                                   Mask, passThru, Subtarget, DAG);
27923     }
27924     case INTR_TYPE_SCALAR_MASK_RND: {
27925       SDValue Src1 = Op.getOperand(1);
27926       SDValue Src2 = Op.getOperand(2);
27927       SDValue passThru = Op.getOperand(3);
27928       SDValue Mask = Op.getOperand(4);
27929       SDValue Rnd = Op.getOperand(5);
27930
27931       SDValue NewOp;
27932       unsigned RC = 0;
27933       if (isRoundModeCurDirection(Rnd))
27934         NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27935       else if (isRoundModeSAEToX(Rnd, RC))
27936         NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27937                             DAG.getTargetConstant(RC, dl, MVT::i32));
27938       else
27939         return SDValue();
27940
27941       return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
27942     }
27943     case INTR_TYPE_SCALAR_MASK_SAE: {
27944       SDValue Src1 = Op.getOperand(1);
27945       SDValue Src2 = Op.getOperand(2);
27946       SDValue passThru = Op.getOperand(3);
27947       SDValue Mask = Op.getOperand(4);
27948       SDValue Sae = Op.getOperand(5);
27949       unsigned Opc;
27950       if (isRoundModeCurDirection(Sae))
27951         Opc = IntrData->Opc0;
27952       else if (isRoundModeSAE(Sae))
27953         Opc = IntrData->Opc1;
27954       else
27955         return SDValue();
27956
27957       return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27958                                   Mask, passThru, Subtarget, DAG);
27959     }
27960     case INTR_TYPE_2OP_MASK: {
27961       SDValue Src1 = Op.getOperand(1);
27962       SDValue Src2 = Op.getOperand(2);
27963       SDValue PassThru = Op.getOperand(3);
27964       SDValue Mask = Op.getOperand(4);
27965       SDValue NewOp;
27966       if (IntrData->Opc1 != 0) {
27967         SDValue Rnd = Op.getOperand(5);
27968         unsigned RC = 0;
27969         if (isRoundModeSAEToX(Rnd, RC))
27970           NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27971                               DAG.getTargetConstant(RC, dl, MVT::i32));
27972         else if (!isRoundModeCurDirection(Rnd))
27973           return SDValue();
27974       }
27975       if (!NewOp)
27976         NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27977       return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27978     }
27979     case INTR_TYPE_2OP_MASK_SAE: {
27980       SDValue Src1 = Op.getOperand(1);
27981       SDValue Src2 = Op.getOperand(2);
27982       SDValue PassThru = Op.getOperand(3);
27983       SDValue Mask = Op.getOperand(4);
27984
27985       unsigned Opc = IntrData->Opc0;
27986       if (IntrData->Opc1 != 0) {
27987         SDValue Sae = Op.getOperand(5);
27988         if (isRoundModeSAE(Sae))
27989           Opc = IntrData->Opc1;
27990         else if (!isRoundModeCurDirection(Sae))
27991           return SDValue();
27992       }
27993
27994       return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27995                                   Mask, PassThru, Subtarget, DAG);
27996     }
27997     case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
27998       SDValue Src1 = Op.getOperand(1);
27999       SDValue Src2 = Op.getOperand(2);
28000       SDValue Src3 = Op.getOperand(3);
28001       SDValue PassThru = Op.getOperand(4);
28002       SDValue Mask = Op.getOperand(5);
28003       SDValue Sae = Op.getOperand(6);
28004       unsigned Opc;
28005       if (isRoundModeCurDirection(Sae))
28006         Opc = IntrData->Opc0;
28007       else if (isRoundModeSAE(Sae))
28008         Opc = IntrData->Opc1;
28009       else
28010         return SDValue();
28011
28012       return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
28013                                   Mask, PassThru, Subtarget, DAG);
28014     }
28015     case INTR_TYPE_3OP_MASK_SAE: {
28016       SDValue Src1 = Op.getOperand(1);
28017       SDValue Src2 = Op.getOperand(2);
28018       SDValue Src3 = Op.getOperand(3);
28019       SDValue PassThru = Op.getOperand(4);
28020       SDValue Mask = Op.getOperand(5);
28021
28022       unsigned Opc = IntrData->Opc0;
28023       if (IntrData->Opc1 != 0) {
28024         SDValue Sae = Op.getOperand(6);
28025         if (isRoundModeSAE(Sae))
28026           Opc = IntrData->Opc1;
28027         else if (!isRoundModeCurDirection(Sae))
28028           return SDValue();
28029       }
28030       return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
28031                                   Mask, PassThru, Subtarget, DAG);
28032     }
28033     case BLENDV: {
28034       SDValue Src1 = Op.getOperand(1);
28035       SDValue Src2 = Op.getOperand(2);
28036       SDValue Src3 = Op.getOperand(3);
28037
28038       EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
28039       Src3 = DAG.getBitcast(MaskVT, Src3);
28040
28041       // Reverse the operands to match VSELECT order.
28042       return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
28043     }
28044     case VPERM_2OP : {
28045       SDValue Src1 = Op.getOperand(1);
28046       SDValue Src2 = Op.getOperand(2);
28047
28048       // Swap Src1 and Src2 in the node creation
28049       return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
28050     }
28051     case CFMA_OP_MASKZ:
28052     case CFMA_OP_MASK: {
28053       SDValue Src1 = Op.getOperand(1);
28054       SDValue Src2 = Op.getOperand(2);
28055       SDValue Src3 = Op.getOperand(3);
28056       SDValue Mask = Op.getOperand(4);
28057       MVT VT = Op.getSimpleValueType();
28058
28059       SDValue PassThru = Src3;
28060       if (IntrData->Type == CFMA_OP_MASKZ)
28061         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
28062
28063       // We add rounding mode to the Node when
28064       //   - RC Opcode is specified and
28065       //   - RC is not "current direction".
28066       SDValue NewOp;
28067       if (IntrData->Opc1 != 0) {
28068         SDValue Rnd = Op.getOperand(5);
28069         unsigned RC = 0;
28070         if (isRoundModeSAEToX(Rnd, RC))
28071           NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
28072                               DAG.getTargetConstant(RC, dl, MVT::i32));
28073         else if (!isRoundModeCurDirection(Rnd))
28074           return SDValue();
28075       }
28076       if (!NewOp)
28077         NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
28078       return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
28079     }
28080     case IFMA_OP:
28081       // NOTE: We need to swizzle the operands to pass the multiply operands
28082       // first.
28083       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28084                          Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
28085     case FPCLASSS: {
28086       SDValue Src1 = Op.getOperand(1);
28087       SDValue Imm = Op.getOperand(2);
28088       SDValue Mask = Op.getOperand(3);
28089       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
28090       SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
28091                                                  Subtarget, DAG);
28092       // Need to fill with zeros to ensure the bitcast will produce zeroes
28093       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
28094       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
28095                                 DAG.getConstant(0, dl, MVT::v8i1),
28096                                 FPclassMask, DAG.getIntPtrConstant(0, dl));
28097       return DAG.getBitcast(MVT::i8, Ins);
28098     }
28099
28100     case CMP_MASK_CC: {
28101       MVT MaskVT = Op.getSimpleValueType();
28102       SDValue CC = Op.getOperand(3);
28103       SDValue Mask = Op.getOperand(4);
28104       // We specify 2 possible opcodes for intrinsics with rounding modes.
28105       // First, we check if the intrinsic may have non-default rounding mode,
28106       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
28107       if (IntrData->Opc1 != 0) {
28108         SDValue Sae = Op.getOperand(5);
28109         if (isRoundModeSAE(Sae))
28110           return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
28111                              Op.getOperand(2), CC, Mask, Sae);
28112         if (!isRoundModeCurDirection(Sae))
28113           return SDValue();
28114       }
28115       //default rounding mode
28116       return DAG.getNode(IntrData->Opc0, dl, MaskVT,
28117                          {Op.getOperand(1), Op.getOperand(2), CC, Mask});
28118     }
28119     case CMP_MASK_SCALAR_CC: {
28120       SDValue Src1 = Op.getOperand(1);
28121       SDValue Src2 = Op.getOperand(2);
28122       SDValue CC = Op.getOperand(3);
28123       SDValue Mask = Op.getOperand(4);
28124
28125       SDValue Cmp;
28126       if (IntrData->Opc1 != 0) {
28127         SDValue Sae = Op.getOperand(5);
28128         if (isRoundModeSAE(Sae))
28129           Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
28130         else if (!isRoundModeCurDirection(Sae))
28131           return SDValue();
28132       }
28133       //default rounding mode
28134       if (!Cmp.getNode())
28135         Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
28136
28137       SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
28138                                              Subtarget, DAG);
28139       // Need to fill with zeros to ensure the bitcast will produce zeroes
28140       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
28141       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
28142                                 DAG.getConstant(0, dl, MVT::v8i1),
28143                                 CmpMask, DAG.getIntPtrConstant(0, dl));
28144       return DAG.getBitcast(MVT::i8, Ins);
28145     }
28146     case COMI: { // Comparison intrinsics
28147       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
28148       SDValue LHS = Op.getOperand(1);
28149       SDValue RHS = Op.getOperand(2);
28150       // Some conditions require the operands to be swapped.
28151       if (CC == ISD::SETLT || CC == ISD::SETLE)
28152         std::swap(LHS, RHS);
28153
28154       SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
28155       SDValue SetCC;
28156       switch (CC) {
28157       case ISD::SETEQ: { // (ZF = 0 and PF = 0)
28158         SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
28159         SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
28160         SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
28161         break;
28162       }
28163       case ISD::SETNE: { // (ZF = 1 or PF = 1)
28164         SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
28165         SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
28166         SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
28167         break;
28168       }
28169       case ISD::SETGT: // (CF = 0 and ZF = 0)
28170       case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
28171         SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
28172         break;
28173       }
28174       case ISD::SETGE: // CF = 0
28175       case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
28176         SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
28177         break;
28178       default:
28179         llvm_unreachable("Unexpected illegal condition!");
28180       }
28181       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28182     }
28183     case COMI_RM: { // Comparison intrinsics with Sae
28184       SDValue LHS = Op.getOperand(1);
28185       SDValue RHS = Op.getOperand(2);
28186       unsigned CondVal = Op.getConstantOperandVal(3);
28187       SDValue Sae = Op.getOperand(4);
28188
28189       SDValue FCmp;
28190       if (isRoundModeCurDirection(Sae))
28191         FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
28192                            DAG.getTargetConstant(CondVal, dl, MVT::i8));
28193       else if (isRoundModeSAE(Sae))
28194         FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
28195                            DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
28196       else
28197         return SDValue();
28198       // Need to fill with zeros to ensure the bitcast will produce zeroes
28199       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
28200       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
28201                                 DAG.getConstant(0, dl, MVT::v16i1),
28202                                 FCmp, DAG.getIntPtrConstant(0, dl));
28203       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
28204                          DAG.getBitcast(MVT::i16, Ins));
28205     }
28206     case VSHIFT: {
28207       SDValue SrcOp = Op.getOperand(1);
28208       SDValue ShAmt = Op.getOperand(2);
28209       assert(ShAmt.getValueType() == MVT::i32 &&
28210              "Unexpected VSHIFT amount type");
28211
28212       // Catch shift-by-constant.
28213       if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
28214         return getTargetVShiftByConstNode(IntrData->Opc0, dl,
28215                                           Op.getSimpleValueType(), SrcOp,
28216                                           CShAmt->getZExtValue(), DAG);
28217
28218       ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
28219       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
28220                                  SrcOp, ShAmt, 0, Subtarget, DAG);
28221     }
28222     case COMPRESS_EXPAND_IN_REG: {
28223       SDValue Mask = Op.getOperand(3);
28224       SDValue DataToCompress = Op.getOperand(1);
28225       SDValue PassThru = Op.getOperand(2);
28226       if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
28227         return Op.getOperand(1);
28228
28229       // Avoid false dependency.
28230       if (PassThru.isUndef())
28231         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
28232
28233       return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
28234                          Mask);
28235     }
28236     case FIXUPIMM:
28237     case FIXUPIMM_MASKZ: {
28238       SDValue Src1 = Op.getOperand(1);
28239       SDValue Src2 = Op.getOperand(2);
28240       SDValue Src3 = Op.getOperand(3);
28241       SDValue Imm = Op.getOperand(4);
28242       SDValue Mask = Op.getOperand(5);
28243       SDValue Passthru = (IntrData->Type == FIXUPIMM)
28244                              ? Src1
28245                              : getZeroVector(VT, Subtarget, DAG, dl);
28246
28247       unsigned Opc = IntrData->Opc0;
28248       if (IntrData->Opc1 != 0) {
28249         SDValue Sae = Op.getOperand(6);
28250         if (isRoundModeSAE(Sae))
28251           Opc = IntrData->Opc1;
28252         else if (!isRoundModeCurDirection(Sae))
28253           return SDValue();
28254       }
28255
28256       SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
28257
28258       if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
28259         return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28260
28261       return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28262     }
28263     case ROUNDP: {
28264       assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
28265       // Clear the upper bits of the rounding immediate so that the legacy
28266       // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28267       auto Round = cast<ConstantSDNode>(Op.getOperand(2));
28268       SDValue RoundingMode =
28269           DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28270       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28271                          Op.getOperand(1), RoundingMode);
28272     }
28273     case ROUNDS: {
28274       assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
28275       // Clear the upper bits of the rounding immediate so that the legacy
28276       // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28277       auto Round = cast<ConstantSDNode>(Op.getOperand(3));
28278       SDValue RoundingMode =
28279           DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28280       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28281                          Op.getOperand(1), Op.getOperand(2), RoundingMode);
28282     }
28283     case BEXTRI: {
28284       assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
28285
28286       uint64_t Imm = Op.getConstantOperandVal(2);
28287       SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
28288                                               Op.getValueType());
28289       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28290                          Op.getOperand(1), Control);
28291     }
28292     // ADC/ADCX/SBB
28293     case ADX: {
28294       SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
28295       SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
28296
28297       SDValue Res;
28298       // If the carry in is zero, then we should just use ADD/SUB instead of
28299       // ADC/SBB.
28300       if (isNullConstant(Op.getOperand(1))) {
28301         Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
28302                           Op.getOperand(3));
28303       } else {
28304         SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
28305                                     DAG.getConstant(-1, dl, MVT::i8));
28306         Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
28307                           Op.getOperand(3), GenCF.getValue(1));
28308       }
28309       SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
28310       SDValue Results[] = { SetCC, Res };
28311       return DAG.getMergeValues(Results, dl);
28312     }
28313     case CVTPD2PS_MASK:
28314     case CVTPD2DQ_MASK:
28315     case CVTQQ2PS_MASK:
28316     case TRUNCATE_TO_REG: {
28317       SDValue Src = Op.getOperand(1);
28318       SDValue PassThru = Op.getOperand(2);
28319       SDValue Mask = Op.getOperand(3);
28320
28321       if (isAllOnesConstant(Mask))
28322         return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28323
28324       MVT SrcVT = Src.getSimpleValueType();
28325       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28326       Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28327       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
28328                          {Src, PassThru, Mask});
28329     }
28330     case CVTPS2PH_MASK: {
28331       SDValue Src = Op.getOperand(1);
28332       SDValue Rnd = Op.getOperand(2);
28333       SDValue PassThru = Op.getOperand(3);
28334       SDValue Mask = Op.getOperand(4);
28335
28336       unsigned RC = 0;
28337       unsigned Opc = IntrData->Opc0;
28338       bool SAE = Src.getValueType().is512BitVector() &&
28339                  (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
28340       if (SAE) {
28341         Opc = X86ISD::CVTPS2PH_SAE;
28342         Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
28343       }
28344
28345       if (isAllOnesConstant(Mask))
28346         return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
28347
28348       if (SAE)
28349         Opc = X86ISD::MCVTPS2PH_SAE;
28350       else
28351         Opc = IntrData->Opc1;
28352       MVT SrcVT = Src.getSimpleValueType();
28353       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28354       Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28355       return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
28356     }
28357     case CVTNEPS2BF16_MASK: {
28358       SDValue Src = Op.getOperand(1);
28359       SDValue PassThru = Op.getOperand(2);
28360       SDValue Mask = Op.getOperand(3);
28361
28362       if (ISD::isBuildVectorAllOnes(Mask.getNode()))
28363         return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28364
28365       // Break false dependency.
28366       if (PassThru.isUndef())
28367         PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
28368
28369       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
28370                          Mask);
28371     }
28372     default:
28373       break;
28374     }
28375   }
28376
28377   switch (IntNo) {
28378   default: return SDValue();    // Don't custom lower most intrinsics.
28379
28380   // ptest and testp intrinsics. The intrinsic these come from are designed to
28381   // return an integer value, not just an instruction so lower it to the ptest
28382   // or testp pattern and a setcc for the result.
28383   case Intrinsic::x86_avx512_ktestc_b:
28384   case Intrinsic::x86_avx512_ktestc_w:
28385   case Intrinsic::x86_avx512_ktestc_d:
28386   case Intrinsic::x86_avx512_ktestc_q:
28387   case Intrinsic::x86_avx512_ktestz_b:
28388   case Intrinsic::x86_avx512_ktestz_w:
28389   case Intrinsic::x86_avx512_ktestz_d:
28390   case Intrinsic::x86_avx512_ktestz_q:
28391   case Intrinsic::x86_sse41_ptestz:
28392   case Intrinsic::x86_sse41_ptestc:
28393   case Intrinsic::x86_sse41_ptestnzc:
28394   case Intrinsic::x86_avx_ptestz_256:
28395   case Intrinsic::x86_avx_ptestc_256:
28396   case Intrinsic::x86_avx_ptestnzc_256:
28397   case Intrinsic::x86_avx_vtestz_ps:
28398   case Intrinsic::x86_avx_vtestc_ps:
28399   case Intrinsic::x86_avx_vtestnzc_ps:
28400   case Intrinsic::x86_avx_vtestz_pd:
28401   case Intrinsic::x86_avx_vtestc_pd:
28402   case Intrinsic::x86_avx_vtestnzc_pd:
28403   case Intrinsic::x86_avx_vtestz_ps_256:
28404   case Intrinsic::x86_avx_vtestc_ps_256:
28405   case Intrinsic::x86_avx_vtestnzc_ps_256:
28406   case Intrinsic::x86_avx_vtestz_pd_256:
28407   case Intrinsic::x86_avx_vtestc_pd_256:
28408   case Intrinsic::x86_avx_vtestnzc_pd_256: {
28409     unsigned TestOpc = X86ISD::PTEST;
28410     X86::CondCode X86CC;
28411     switch (IntNo) {
28412     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
28413     case Intrinsic::x86_avx512_ktestc_b:
28414     case Intrinsic::x86_avx512_ktestc_w:
28415     case Intrinsic::x86_avx512_ktestc_d:
28416     case Intrinsic::x86_avx512_ktestc_q:
28417       // CF = 1
28418       TestOpc = X86ISD::KTEST;
28419       X86CC = X86::COND_B;
28420       break;
28421     case Intrinsic::x86_avx512_ktestz_b:
28422     case Intrinsic::x86_avx512_ktestz_w:
28423     case Intrinsic::x86_avx512_ktestz_d:
28424     case Intrinsic::x86_avx512_ktestz_q:
28425       TestOpc = X86ISD::KTEST;
28426       X86CC = X86::COND_E;
28427       break;
28428     case Intrinsic::x86_avx_vtestz_ps:
28429     case Intrinsic::x86_avx_vtestz_pd:
28430     case Intrinsic::x86_avx_vtestz_ps_256:
28431     case Intrinsic::x86_avx_vtestz_pd_256:
28432       TestOpc = X86ISD::TESTP;
28433       [[fallthrough]];
28434     case Intrinsic::x86_sse41_ptestz:
28435     case Intrinsic::x86_avx_ptestz_256:
28436       // ZF = 1
28437       X86CC = X86::COND_E;
28438       break;
28439     case Intrinsic::x86_avx_vtestc_ps:
28440     case Intrinsic::x86_avx_vtestc_pd:
28441     case Intrinsic::x86_avx_vtestc_ps_256:
28442     case Intrinsic::x86_avx_vtestc_pd_256:
28443       TestOpc = X86ISD::TESTP;
28444       [[fallthrough]];
28445     case Intrinsic::x86_sse41_ptestc:
28446     case Intrinsic::x86_avx_ptestc_256:
28447       // CF = 1
28448       X86CC = X86::COND_B;
28449       break;
28450     case Intrinsic::x86_avx_vtestnzc_ps:
28451     case Intrinsic::x86_avx_vtestnzc_pd:
28452     case Intrinsic::x86_avx_vtestnzc_ps_256:
28453     case Intrinsic::x86_avx_vtestnzc_pd_256:
28454       TestOpc = X86ISD::TESTP;
28455       [[fallthrough]];
28456     case Intrinsic::x86_sse41_ptestnzc:
28457     case Intrinsic::x86_avx_ptestnzc_256:
28458       // ZF and CF = 0
28459       X86CC = X86::COND_A;
28460       break;
28461     }
28462
28463     SDValue LHS = Op.getOperand(1);
28464     SDValue RHS = Op.getOperand(2);
28465     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
28466     SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
28467     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28468   }
28469
28470   case Intrinsic::x86_sse42_pcmpistria128:
28471   case Intrinsic::x86_sse42_pcmpestria128:
28472   case Intrinsic::x86_sse42_pcmpistric128:
28473   case Intrinsic::x86_sse42_pcmpestric128:
28474   case Intrinsic::x86_sse42_pcmpistrio128:
28475   case Intrinsic::x86_sse42_pcmpestrio128:
28476   case Intrinsic::x86_sse42_pcmpistris128:
28477   case Intrinsic::x86_sse42_pcmpestris128:
28478   case Intrinsic::x86_sse42_pcmpistriz128:
28479   case Intrinsic::x86_sse42_pcmpestriz128: {
28480     unsigned Opcode;
28481     X86::CondCode X86CC;
28482     switch (IntNo) {
28483     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
28484     case Intrinsic::x86_sse42_pcmpistria128:
28485       Opcode = X86ISD::PCMPISTR;
28486       X86CC = X86::COND_A;
28487       break;
28488     case Intrinsic::x86_sse42_pcmpestria128:
28489       Opcode = X86ISD::PCMPESTR;
28490       X86CC = X86::COND_A;
28491       break;
28492     case Intrinsic::x86_sse42_pcmpistric128:
28493       Opcode = X86ISD::PCMPISTR;
28494       X86CC = X86::COND_B;
28495       break;
28496     case Intrinsic::x86_sse42_pcmpestric128:
28497       Opcode = X86ISD::PCMPESTR;
28498       X86CC = X86::COND_B;
28499       break;
28500     case Intrinsic::x86_sse42_pcmpistrio128:
28501       Opcode = X86ISD::PCMPISTR;
28502       X86CC = X86::COND_O;
28503       break;
28504     case Intrinsic::x86_sse42_pcmpestrio128:
28505       Opcode = X86ISD::PCMPESTR;
28506       X86CC = X86::COND_O;
28507       break;
28508     case Intrinsic::x86_sse42_pcmpistris128:
28509       Opcode = X86ISD::PCMPISTR;
28510       X86CC = X86::COND_S;
28511       break;
28512     case Intrinsic::x86_sse42_pcmpestris128:
28513       Opcode = X86ISD::PCMPESTR;
28514       X86CC = X86::COND_S;
28515       break;
28516     case Intrinsic::x86_sse42_pcmpistriz128:
28517       Opcode = X86ISD::PCMPISTR;
28518       X86CC = X86::COND_E;
28519       break;
28520     case Intrinsic::x86_sse42_pcmpestriz128:
28521       Opcode = X86ISD::PCMPESTR;
28522       X86CC = X86::COND_E;
28523       break;
28524     }
28525     SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28526     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28527     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
28528     SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
28529     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28530   }
28531
28532   case Intrinsic::x86_sse42_pcmpistri128:
28533   case Intrinsic::x86_sse42_pcmpestri128: {
28534     unsigned Opcode;
28535     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
28536       Opcode = X86ISD::PCMPISTR;
28537     else
28538       Opcode = X86ISD::PCMPESTR;
28539
28540     SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28541     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28542     return DAG.getNode(Opcode, dl, VTs, NewOps);
28543   }
28544
28545   case Intrinsic::x86_sse42_pcmpistrm128:
28546   case Intrinsic::x86_sse42_pcmpestrm128: {
28547     unsigned Opcode;
28548     if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
28549       Opcode = X86ISD::PCMPISTR;
28550     else
28551       Opcode = X86ISD::PCMPESTR;
28552
28553     SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28554     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28555     return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
28556   }
28557
28558   case Intrinsic::eh_sjlj_lsda: {
28559     MachineFunction &MF = DAG.getMachineFunction();
28560     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28561     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
28562     auto &Context = MF.getMMI().getContext();
28563     MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
28564                                             Twine(MF.getFunctionNumber()));
28565     return DAG.getNode(getGlobalWrapperKind(), dl, VT,
28566                        DAG.getMCSymbol(S, PtrVT));
28567   }
28568
28569   case Intrinsic::x86_seh_lsda: {
28570     // Compute the symbol for the LSDA. We know it'll get emitted later.
28571     MachineFunction &MF = DAG.getMachineFunction();
28572     SDValue Op1 = Op.getOperand(1);
28573     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
28574     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
28575         GlobalValue::dropLLVMManglingEscape(Fn->getName()));
28576
28577     // Generate a simple absolute symbol reference. This intrinsic is only
28578     // supported on 32-bit Windows, which isn't PIC.
28579     SDValue Result = DAG.getMCSymbol(LSDASym, VT);
28580     return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
28581   }
28582
28583   case Intrinsic::eh_recoverfp: {
28584     SDValue FnOp = Op.getOperand(1);
28585     SDValue IncomingFPOp = Op.getOperand(2);
28586     GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
28587     auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
28588     if (!Fn)
28589       report_fatal_error(
28590           "llvm.eh.recoverfp must take a function as the first argument");
28591     return recoverFramePointer(DAG, Fn, IncomingFPOp);
28592   }
28593
28594   case Intrinsic::localaddress: {
28595     // Returns one of the stack, base, or frame pointer registers, depending on
28596     // which is used to reference local variables.
28597     MachineFunction &MF = DAG.getMachineFunction();
28598     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28599     unsigned Reg;
28600     if (RegInfo->hasBasePointer(MF))
28601       Reg = RegInfo->getBaseRegister();
28602     else { // Handles the SP or FP case.
28603       bool CantUseFP = RegInfo->hasStackRealignment(MF);
28604       if (CantUseFP)
28605         Reg = RegInfo->getPtrSizedStackRegister(MF);
28606       else
28607         Reg = RegInfo->getPtrSizedFrameRegister(MF);
28608     }
28609     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
28610   }
28611   case Intrinsic::x86_avx512_vp2intersect_q_512:
28612   case Intrinsic::x86_avx512_vp2intersect_q_256:
28613   case Intrinsic::x86_avx512_vp2intersect_q_128:
28614   case Intrinsic::x86_avx512_vp2intersect_d_512:
28615   case Intrinsic::x86_avx512_vp2intersect_d_256:
28616   case Intrinsic::x86_avx512_vp2intersect_d_128: {
28617     MVT MaskVT = Op.getSimpleValueType();
28618
28619     SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
28620     SDLoc DL(Op);
28621
28622     SDValue Operation =
28623         DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
28624                     Op->getOperand(1), Op->getOperand(2));
28625
28626     SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
28627                                                  MaskVT, Operation);
28628     SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
28629                                                  MaskVT, Operation);
28630     return DAG.getMergeValues({Result0, Result1}, DL);
28631   }
28632   case Intrinsic::x86_mmx_pslli_w:
28633   case Intrinsic::x86_mmx_pslli_d:
28634   case Intrinsic::x86_mmx_pslli_q:
28635   case Intrinsic::x86_mmx_psrli_w:
28636   case Intrinsic::x86_mmx_psrli_d:
28637   case Intrinsic::x86_mmx_psrli_q:
28638   case Intrinsic::x86_mmx_psrai_w:
28639   case Intrinsic::x86_mmx_psrai_d: {
28640     SDLoc DL(Op);
28641     SDValue ShAmt = Op.getOperand(2);
28642     // If the argument is a constant, convert it to a target constant.
28643     if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
28644       // Clamp out of bounds shift amounts since they will otherwise be masked
28645       // to 8-bits which may make it no longer out of bounds.
28646       unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
28647       if (ShiftAmount == 0)
28648         return Op.getOperand(1);
28649
28650       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28651                          Op.getOperand(0), Op.getOperand(1),
28652                          DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
28653     }
28654
28655     unsigned NewIntrinsic;
28656     switch (IntNo) {
28657     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
28658     case Intrinsic::x86_mmx_pslli_w:
28659       NewIntrinsic = Intrinsic::x86_mmx_psll_w;
28660       break;
28661     case Intrinsic::x86_mmx_pslli_d:
28662       NewIntrinsic = Intrinsic::x86_mmx_psll_d;
28663       break;
28664     case Intrinsic::x86_mmx_pslli_q:
28665       NewIntrinsic = Intrinsic::x86_mmx_psll_q;
28666       break;
28667     case Intrinsic::x86_mmx_psrli_w:
28668       NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
28669       break;
28670     case Intrinsic::x86_mmx_psrli_d:
28671       NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
28672       break;
28673     case Intrinsic::x86_mmx_psrli_q:
28674       NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
28675       break;
28676     case Intrinsic::x86_mmx_psrai_w:
28677       NewIntrinsic = Intrinsic::x86_mmx_psra_w;
28678       break;
28679     case Intrinsic::x86_mmx_psrai_d:
28680       NewIntrinsic = Intrinsic::x86_mmx_psra_d;
28681       break;
28682     }
28683
28684     // The vector shift intrinsics with scalars uses 32b shift amounts but
28685     // the sse2/mmx shift instructions reads 64 bits. Copy the 32 bits to an
28686     // MMX register.
28687     ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
28688     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28689                        DAG.getTargetConstant(NewIntrinsic, DL,
28690                                              getPointerTy(DAG.getDataLayout())),
28691                        Op.getOperand(1), ShAmt);
28692   }
28693   case Intrinsic::thread_pointer: {
28694     if (Subtarget.isTargetELF()) {
28695       SDLoc dl(Op);
28696       EVT PtrVT = getPointerTy(DAG.getDataLayout());
28697       // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
28698       Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
28699           *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
28700       return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28701                          DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
28702     }
28703     report_fatal_error(
28704         "Target OS doesn't support __builtin_thread_pointer() yet.");
28705   }
28706   }
28707 }
28708
28709 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28710                                  SDValue Src, SDValue Mask, SDValue Base,
28711                                  SDValue Index, SDValue ScaleOp, SDValue Chain,
28712                                  const X86Subtarget &Subtarget) {
28713   SDLoc dl(Op);
28714   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28715   // Scale must be constant.
28716   if (!C)
28717     return SDValue();
28718   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28719   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28720                                         TLI.getPointerTy(DAG.getDataLayout()));
28721   EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
28722   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28723   // If source is undef or we know it won't be used, use a zero vector
28724   // to break register dependency.
28725   // TODO: use undef instead and let BreakFalseDeps deal with it?
28726   if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28727     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28728
28729   // Cast mask to an integer type.
28730   Mask = DAG.getBitcast(MaskVT, Mask);
28731
28732   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28733
28734   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28735   SDValue Res =
28736       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28737                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28738   return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28739 }
28740
28741 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
28742                              SDValue Src, SDValue Mask, SDValue Base,
28743                              SDValue Index, SDValue ScaleOp, SDValue Chain,
28744                              const X86Subtarget &Subtarget) {
28745   MVT VT = Op.getSimpleValueType();
28746   SDLoc dl(Op);
28747   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28748   // Scale must be constant.
28749   if (!C)
28750     return SDValue();
28751   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28752   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28753                                         TLI.getPointerTy(DAG.getDataLayout()));
28754   unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28755                               VT.getVectorNumElements());
28756   MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28757
28758   // We support two versions of the gather intrinsics. One with scalar mask and
28759   // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28760   if (Mask.getValueType() != MaskVT)
28761     Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28762
28763   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28764   // If source is undef or we know it won't be used, use a zero vector
28765   // to break register dependency.
28766   // TODO: use undef instead and let BreakFalseDeps deal with it?
28767   if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28768     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28769
28770   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28771
28772   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28773   SDValue Res =
28774       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28775                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28776   return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28777 }
28778
28779 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28780                                SDValue Src, SDValue Mask, SDValue Base,
28781                                SDValue Index, SDValue ScaleOp, SDValue Chain,
28782                                const X86Subtarget &Subtarget) {
28783   SDLoc dl(Op);
28784   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28785   // Scale must be constant.
28786   if (!C)
28787     return SDValue();
28788   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28789   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28790                                         TLI.getPointerTy(DAG.getDataLayout()));
28791   unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28792                               Src.getSimpleValueType().getVectorNumElements());
28793   MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28794
28795   // We support two versions of the scatter intrinsics. One with scalar mask and
28796   // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28797   if (Mask.getValueType() != MaskVT)
28798     Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28799
28800   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28801
28802   SDVTList VTs = DAG.getVTList(MVT::Other);
28803   SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
28804   SDValue Res =
28805       DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
28806                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28807   return Res;
28808 }
28809
28810 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28811                                SDValue Mask, SDValue Base, SDValue Index,
28812                                SDValue ScaleOp, SDValue Chain,
28813                                const X86Subtarget &Subtarget) {
28814   SDLoc dl(Op);
28815   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28816   // Scale must be constant.
28817   if (!C)
28818     return SDValue();
28819   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28820   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28821                                         TLI.getPointerTy(DAG.getDataLayout()));
28822   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
28823   SDValue Segment = DAG.getRegister(0, MVT::i32);
28824   MVT MaskVT =
28825     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
28826   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28827   SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
28828   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
28829   return SDValue(Res, 0);
28830 }
28831
28832 /// Handles the lowering of builtin intrinsics with chain that return their
28833 /// value into registers EDX:EAX.
28834 /// If operand ScrReg is a valid register identifier, then operand 2 of N is
28835 /// copied to SrcReg. The assumption is that SrcReg is an implicit input to
28836 /// TargetOpcode.
28837 /// Returns a Glue value which can be used to add extra copy-from-reg if the
28838 /// expanded intrinsics implicitly defines extra registers (i.e. not just
28839 /// EDX:EAX).
28840 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
28841                                         SelectionDAG &DAG,
28842                                         unsigned TargetOpcode,
28843                                         unsigned SrcReg,
28844                                         const X86Subtarget &Subtarget,
28845                                         SmallVectorImpl<SDValue> &Results) {
28846   SDValue Chain = N->getOperand(0);
28847   SDValue Glue;
28848
28849   if (SrcReg) {
28850     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
28851     Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
28852     Glue = Chain.getValue(1);
28853   }
28854
28855   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
28856   SDValue N1Ops[] = {Chain, Glue};
28857   SDNode *N1 = DAG.getMachineNode(
28858       TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
28859   Chain = SDValue(N1, 0);
28860
28861   // Reads the content of XCR and returns it in registers EDX:EAX.
28862   SDValue LO, HI;
28863   if (Subtarget.is64Bit()) {
28864     LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
28865     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
28866                             LO.getValue(2));
28867   } else {
28868     LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
28869     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
28870                             LO.getValue(2));
28871   }
28872   Chain = HI.getValue(1);
28873   Glue = HI.getValue(2);
28874
28875   if (Subtarget.is64Bit()) {
28876     // Merge the two 32-bit values into a 64-bit one.
28877     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
28878                               DAG.getConstant(32, DL, MVT::i8));
28879     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
28880     Results.push_back(Chain);
28881     return Glue;
28882   }
28883
28884   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
28885   SDValue Ops[] = { LO, HI };
28886   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
28887   Results.push_back(Pair);
28888   Results.push_back(Chain);
28889   return Glue;
28890 }
28891
28892 /// Handles the lowering of builtin intrinsics that read the time stamp counter
28893 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
28894 /// READCYCLECOUNTER nodes.
28895 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
28896                                     SelectionDAG &DAG,
28897                                     const X86Subtarget &Subtarget,
28898                                     SmallVectorImpl<SDValue> &Results) {
28899   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
28900   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
28901   // and the EAX register is loaded with the low-order 32 bits.
28902   SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
28903                                              /* NoRegister */0, Subtarget,
28904                                              Results);
28905   if (Opcode != X86::RDTSCP)
28906     return;
28907
28908   SDValue Chain = Results[1];
28909   // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
28910   // the ECX register. Add 'ecx' explicitly to the chain.
28911   SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
28912   Results[1] = ecx;
28913   Results.push_back(ecx.getValue(1));
28914 }
28915
28916 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
28917                                      SelectionDAG &DAG) {
28918   SmallVector<SDValue, 3> Results;
28919   SDLoc DL(Op);
28920   getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
28921                           Results);
28922   return DAG.getMergeValues(Results, DL);
28923 }
28924
28925 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
28926   MachineFunction &MF = DAG.getMachineFunction();
28927   SDValue Chain = Op.getOperand(0);
28928   SDValue RegNode = Op.getOperand(2);
28929   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28930   if (!EHInfo)
28931     report_fatal_error("EH registrations only live in functions using WinEH");
28932
28933   // Cast the operand to an alloca, and remember the frame index.
28934   auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
28935   if (!FINode)
28936     report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
28937   EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
28938
28939   // Return the chain operand without making any DAG nodes.
28940   return Chain;
28941 }
28942
28943 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
28944   MachineFunction &MF = DAG.getMachineFunction();
28945   SDValue Chain = Op.getOperand(0);
28946   SDValue EHGuard = Op.getOperand(2);
28947   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28948   if (!EHInfo)
28949     report_fatal_error("EHGuard only live in functions using WinEH");
28950
28951   // Cast the operand to an alloca, and remember the frame index.
28952   auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
28953   if (!FINode)
28954     report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
28955   EHInfo->EHGuardFrameIndex = FINode->getIndex();
28956
28957   // Return the chain operand without making any DAG nodes.
28958   return Chain;
28959 }
28960
28961 /// Emit Truncating Store with signed or unsigned saturation.
28962 static SDValue
28963 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
28964                 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
28965                 SelectionDAG &DAG) {
28966   SDVTList VTs = DAG.getVTList(MVT::Other);
28967   SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
28968   SDValue Ops[] = { Chain, Val, Ptr, Undef };
28969   unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
28970   return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28971 }
28972
28973 /// Emit Masked Truncating Store with signed or unsigned saturation.
28974 static SDValue
28975 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
28976                       SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
28977                       MachineMemOperand *MMO, SelectionDAG &DAG) {
28978   SDVTList VTs = DAG.getVTList(MVT::Other);
28979   SDValue Ops[] = { Chain, Val, Ptr, Mask };
28980   unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
28981   return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28982 }
28983
28984 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
28985                                       SelectionDAG &DAG) {
28986   unsigned IntNo = Op.getConstantOperandVal(1);
28987   const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
28988   if (!IntrData) {
28989     switch (IntNo) {
28990
28991     case Intrinsic::swift_async_context_addr: {
28992       SDLoc dl(Op);
28993       auto &MF = DAG.getMachineFunction();
28994       auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
28995       if (Subtarget.is64Bit()) {
28996         MF.getFrameInfo().setFrameAddressIsTaken(true);
28997         X86FI->setHasSwiftAsyncContext(true);
28998         SDValue Chain = Op->getOperand(0);
28999         SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
29000         SDValue Result =
29001             SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
29002                                        DAG.getTargetConstant(8, dl, MVT::i32)),
29003                     0);
29004         // Return { result, chain }.
29005         return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
29006                            CopyRBP.getValue(1));
29007       } else {
29008         // 32-bit so no special extended frame, create or reuse an existing
29009         // stack slot.
29010         if (!X86FI->getSwiftAsyncContextFrameIdx())
29011           X86FI->setSwiftAsyncContextFrameIdx(
29012               MF.getFrameInfo().CreateStackObject(4, Align(4), false));
29013         SDValue Result =
29014             DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
29015         // Return { result, chain }.
29016         return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
29017                            Op->getOperand(0));
29018       }
29019     }
29020
29021     case llvm::Intrinsic::x86_seh_ehregnode:
29022       return MarkEHRegistrationNode(Op, DAG);
29023     case llvm::Intrinsic::x86_seh_ehguard:
29024       return MarkEHGuard(Op, DAG);
29025     case llvm::Intrinsic::x86_rdpkru: {
29026       SDLoc dl(Op);
29027       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
29028       // Create a RDPKRU node and pass 0 to the ECX parameter.
29029       return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
29030                          DAG.getConstant(0, dl, MVT::i32));
29031     }
29032     case llvm::Intrinsic::x86_wrpkru: {
29033       SDLoc dl(Op);
29034       // Create a WRPKRU node, pass the input to the EAX parameter,  and pass 0
29035       // to the EDX and ECX parameters.
29036       return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
29037                          Op.getOperand(0), Op.getOperand(2),
29038                          DAG.getConstant(0, dl, MVT::i32),
29039                          DAG.getConstant(0, dl, MVT::i32));
29040     }
29041     case llvm::Intrinsic::asan_check_memaccess: {
29042       // Mark this as adjustsStack because it will be lowered to a call.
29043       DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
29044       // Don't do anything here, we will expand these intrinsics out later.
29045       return Op;
29046     }
29047     case llvm::Intrinsic::x86_flags_read_u32:
29048     case llvm::Intrinsic::x86_flags_read_u64:
29049     case llvm::Intrinsic::x86_flags_write_u32:
29050     case llvm::Intrinsic::x86_flags_write_u64: {
29051       // We need a frame pointer because this will get lowered to a PUSH/POP
29052       // sequence.
29053       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
29054       MFI.setHasCopyImplyingStackAdjustment(true);
29055       // Don't do anything here, we will expand these intrinsics out later
29056       // during FinalizeISel in EmitInstrWithCustomInserter.
29057       return Op;
29058     }
29059     case Intrinsic::x86_lwpins32:
29060     case Intrinsic::x86_lwpins64:
29061     case Intrinsic::x86_umwait:
29062     case Intrinsic::x86_tpause: {
29063       SDLoc dl(Op);
29064       SDValue Chain = Op->getOperand(0);
29065       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
29066       unsigned Opcode;
29067
29068       switch (IntNo) {
29069       default: llvm_unreachable("Impossible intrinsic");
29070       case Intrinsic::x86_umwait:
29071         Opcode = X86ISD::UMWAIT;
29072         break;
29073       case Intrinsic::x86_tpause:
29074         Opcode = X86ISD::TPAUSE;
29075         break;
29076       case Intrinsic::x86_lwpins32:
29077       case Intrinsic::x86_lwpins64:
29078         Opcode = X86ISD::LWPINS;
29079         break;
29080       }
29081
29082       SDValue Operation =
29083           DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
29084                       Op->getOperand(3), Op->getOperand(4));
29085       SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
29086       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
29087                          Operation.getValue(1));
29088     }
29089     case Intrinsic::x86_enqcmd:
29090     case Intrinsic::x86_enqcmds: {
29091       SDLoc dl(Op);
29092       SDValue Chain = Op.getOperand(0);
29093       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
29094       unsigned Opcode;
29095       switch (IntNo) {
29096       default: llvm_unreachable("Impossible intrinsic!");
29097       case Intrinsic::x86_enqcmd:
29098         Opcode = X86ISD::ENQCMD;
29099         break;
29100       case Intrinsic::x86_enqcmds:
29101         Opcode = X86ISD::ENQCMDS;
29102         break;
29103       }
29104       SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
29105                                       Op.getOperand(3));
29106       SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
29107       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
29108                          Operation.getValue(1));
29109     }
29110     case Intrinsic::x86_aesenc128kl:
29111     case Intrinsic::x86_aesdec128kl:
29112     case Intrinsic::x86_aesenc256kl:
29113     case Intrinsic::x86_aesdec256kl: {
29114       SDLoc DL(Op);
29115       SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
29116       SDValue Chain = Op.getOperand(0);
29117       unsigned Opcode;
29118
29119       switch (IntNo) {
29120       default: llvm_unreachable("Impossible intrinsic");
29121       case Intrinsic::x86_aesenc128kl:
29122         Opcode = X86ISD::AESENC128KL;
29123         break;
29124       case Intrinsic::x86_aesdec128kl:
29125         Opcode = X86ISD::AESDEC128KL;
29126         break;
29127       case Intrinsic::x86_aesenc256kl:
29128         Opcode = X86ISD::AESENC256KL;
29129         break;
29130       case Intrinsic::x86_aesdec256kl:
29131         Opcode = X86ISD::AESDEC256KL;
29132         break;
29133       }
29134
29135       MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
29136       MachineMemOperand *MMO = MemIntr->getMemOperand();
29137       EVT MemVT = MemIntr->getMemoryVT();
29138       SDValue Operation = DAG.getMemIntrinsicNode(
29139           Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
29140           MMO);
29141       SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
29142
29143       return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29144                          {ZF, Operation.getValue(0), Operation.getValue(2)});
29145     }
29146     case Intrinsic::x86_aesencwide128kl:
29147     case Intrinsic::x86_aesdecwide128kl:
29148     case Intrinsic::x86_aesencwide256kl:
29149     case Intrinsic::x86_aesdecwide256kl: {
29150       SDLoc DL(Op);
29151       SDVTList VTs = DAG.getVTList(
29152           {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
29153            MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
29154       SDValue Chain = Op.getOperand(0);
29155       unsigned Opcode;
29156
29157       switch (IntNo) {
29158       default: llvm_unreachable("Impossible intrinsic");
29159       case Intrinsic::x86_aesencwide128kl:
29160         Opcode = X86ISD::AESENCWIDE128KL;
29161         break;
29162       case Intrinsic::x86_aesdecwide128kl:
29163         Opcode = X86ISD::AESDECWIDE128KL;
29164         break;
29165       case Intrinsic::x86_aesencwide256kl:
29166         Opcode = X86ISD::AESENCWIDE256KL;
29167         break;
29168       case Intrinsic::x86_aesdecwide256kl:
29169         Opcode = X86ISD::AESDECWIDE256KL;
29170         break;
29171       }
29172
29173       MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
29174       MachineMemOperand *MMO = MemIntr->getMemOperand();
29175       EVT MemVT = MemIntr->getMemoryVT();
29176       SDValue Operation = DAG.getMemIntrinsicNode(
29177           Opcode, DL, VTs,
29178           {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
29179            Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
29180            Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
29181           MemVT, MMO);
29182       SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
29183
29184       return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29185                          {ZF, Operation.getValue(1), Operation.getValue(2),
29186                           Operation.getValue(3), Operation.getValue(4),
29187                           Operation.getValue(5), Operation.getValue(6),
29188                           Operation.getValue(7), Operation.getValue(8),
29189                           Operation.getValue(9)});
29190     }
29191     case Intrinsic::x86_testui: {
29192       SDLoc dl(Op);
29193       SDValue Chain = Op.getOperand(0);
29194       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
29195       SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
29196       SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
29197       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
29198                          Operation.getValue(1));
29199     }
29200     case Intrinsic::x86_atomic_bts_rm:
29201     case Intrinsic::x86_atomic_btc_rm:
29202     case Intrinsic::x86_atomic_btr_rm: {
29203       SDLoc DL(Op);
29204       MVT VT = Op.getSimpleValueType();
29205       SDValue Chain = Op.getOperand(0);
29206       SDValue Op1 = Op.getOperand(2);
29207       SDValue Op2 = Op.getOperand(3);
29208       unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm   ? X86ISD::LBTS_RM
29209                      : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
29210                                                              : X86ISD::LBTR_RM;
29211       MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29212       SDValue Res =
29213           DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29214                                   {Chain, Op1, Op2}, VT, MMO);
29215       Chain = Res.getValue(1);
29216       Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29217       return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29218     }
29219     case Intrinsic::x86_atomic_bts:
29220     case Intrinsic::x86_atomic_btc:
29221     case Intrinsic::x86_atomic_btr: {
29222       SDLoc DL(Op);
29223       MVT VT = Op.getSimpleValueType();
29224       SDValue Chain = Op.getOperand(0);
29225       SDValue Op1 = Op.getOperand(2);
29226       SDValue Op2 = Op.getOperand(3);
29227       unsigned Opc = IntNo == Intrinsic::x86_atomic_bts   ? X86ISD::LBTS
29228                      : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
29229                                                           : X86ISD::LBTR;
29230       SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
29231       MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29232       SDValue Res =
29233           DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29234                                   {Chain, Op1, Op2, Size}, VT, MMO);
29235       Chain = Res.getValue(1);
29236       Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29237       unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
29238       if (Imm)
29239         Res = DAG.getNode(ISD::SHL, DL, VT, Res,
29240                           DAG.getShiftAmountConstant(Imm, VT, DL));
29241       return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29242     }
29243     case Intrinsic::x86_cmpccxadd32:
29244     case Intrinsic::x86_cmpccxadd64: {
29245       SDLoc DL(Op);
29246       SDValue Chain = Op.getOperand(0);
29247       SDValue Addr = Op.getOperand(2);
29248       SDValue Src1 = Op.getOperand(3);
29249       SDValue Src2 = Op.getOperand(4);
29250       SDValue CC = Op.getOperand(5);
29251       MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29252       SDValue Operation = DAG.getMemIntrinsicNode(
29253           X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
29254           MVT::i32, MMO);
29255       return Operation;
29256     }
29257     case Intrinsic::x86_aadd32:
29258     case Intrinsic::x86_aadd64:
29259     case Intrinsic::x86_aand32:
29260     case Intrinsic::x86_aand64:
29261     case Intrinsic::x86_aor32:
29262     case Intrinsic::x86_aor64:
29263     case Intrinsic::x86_axor32:
29264     case Intrinsic::x86_axor64: {
29265       SDLoc DL(Op);
29266       SDValue Chain = Op.getOperand(0);
29267       SDValue Op1 = Op.getOperand(2);
29268       SDValue Op2 = Op.getOperand(3);
29269       MVT VT = Op2.getSimpleValueType();
29270       unsigned Opc = 0;
29271       switch (IntNo) {
29272       default:
29273         llvm_unreachable("Unknown Intrinsic");
29274       case Intrinsic::x86_aadd32:
29275       case Intrinsic::x86_aadd64:
29276         Opc = X86ISD::AADD;
29277         break;
29278       case Intrinsic::x86_aand32:
29279       case Intrinsic::x86_aand64:
29280         Opc = X86ISD::AAND;
29281         break;
29282       case Intrinsic::x86_aor32:
29283       case Intrinsic::x86_aor64:
29284         Opc = X86ISD::AOR;
29285         break;
29286       case Intrinsic::x86_axor32:
29287       case Intrinsic::x86_axor64:
29288         Opc = X86ISD::AXOR;
29289         break;
29290       }
29291       MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
29292       return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
29293                                      {Chain, Op1, Op2}, VT, MMO);
29294     }
29295     case Intrinsic::x86_atomic_add_cc:
29296     case Intrinsic::x86_atomic_sub_cc:
29297     case Intrinsic::x86_atomic_or_cc:
29298     case Intrinsic::x86_atomic_and_cc:
29299     case Intrinsic::x86_atomic_xor_cc: {
29300       SDLoc DL(Op);
29301       SDValue Chain = Op.getOperand(0);
29302       SDValue Op1 = Op.getOperand(2);
29303       SDValue Op2 = Op.getOperand(3);
29304       X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
29305       MVT VT = Op2.getSimpleValueType();
29306       unsigned Opc = 0;
29307       switch (IntNo) {
29308       default:
29309         llvm_unreachable("Unknown Intrinsic");
29310       case Intrinsic::x86_atomic_add_cc:
29311         Opc = X86ISD::LADD;
29312         break;
29313       case Intrinsic::x86_atomic_sub_cc:
29314         Opc = X86ISD::LSUB;
29315         break;
29316       case Intrinsic::x86_atomic_or_cc:
29317         Opc = X86ISD::LOR;
29318         break;
29319       case Intrinsic::x86_atomic_and_cc:
29320         Opc = X86ISD::LAND;
29321         break;
29322       case Intrinsic::x86_atomic_xor_cc:
29323         Opc = X86ISD::LXOR;
29324         break;
29325       }
29326       MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29327       SDValue LockArith =
29328           DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29329                                   {Chain, Op1, Op2}, VT, MMO);
29330       Chain = LockArith.getValue(1);
29331       return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
29332     }
29333     }
29334     return SDValue();
29335   }
29336
29337   SDLoc dl(Op);
29338   switch(IntrData->Type) {
29339   default: llvm_unreachable("Unknown Intrinsic Type");
29340   case RDSEED:
29341   case RDRAND: {
29342     // Emit the node with the right value type.
29343     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
29344     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29345
29346     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
29347     // Otherwise return the value from Rand, which is always 0, casted to i32.
29348     SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
29349                      DAG.getConstant(1, dl, Op->getValueType(1)),
29350                      DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
29351                      SDValue(Result.getNode(), 1)};
29352     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
29353
29354     // Return { result, isValid, chain }.
29355     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
29356                        SDValue(Result.getNode(), 2));
29357   }
29358   case GATHER_AVX2: {
29359     SDValue Chain = Op.getOperand(0);
29360     SDValue Src   = Op.getOperand(2);
29361     SDValue Base  = Op.getOperand(3);
29362     SDValue Index = Op.getOperand(4);
29363     SDValue Mask  = Op.getOperand(5);
29364     SDValue Scale = Op.getOperand(6);
29365     return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29366                              Scale, Chain, Subtarget);
29367   }
29368   case GATHER: {
29369   //gather(v1, mask, index, base, scale);
29370     SDValue Chain = Op.getOperand(0);
29371     SDValue Src   = Op.getOperand(2);
29372     SDValue Base  = Op.getOperand(3);
29373     SDValue Index = Op.getOperand(4);
29374     SDValue Mask  = Op.getOperand(5);
29375     SDValue Scale = Op.getOperand(6);
29376     return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
29377                          Chain, Subtarget);
29378   }
29379   case SCATTER: {
29380   //scatter(base, mask, index, v1, scale);
29381     SDValue Chain = Op.getOperand(0);
29382     SDValue Base  = Op.getOperand(2);
29383     SDValue Mask  = Op.getOperand(3);
29384     SDValue Index = Op.getOperand(4);
29385     SDValue Src   = Op.getOperand(5);
29386     SDValue Scale = Op.getOperand(6);
29387     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29388                           Scale, Chain, Subtarget);
29389   }
29390   case PREFETCH: {
29391     const APInt &HintVal = Op.getConstantOperandAPInt(6);
29392     assert((HintVal == 2 || HintVal == 3) &&
29393            "Wrong prefetch hint in intrinsic: should be 2 or 3");
29394     unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
29395     SDValue Chain = Op.getOperand(0);
29396     SDValue Mask  = Op.getOperand(2);
29397     SDValue Index = Op.getOperand(3);
29398     SDValue Base  = Op.getOperand(4);
29399     SDValue Scale = Op.getOperand(5);
29400     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
29401                            Subtarget);
29402   }
29403   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
29404   case RDTSC: {
29405     SmallVector<SDValue, 2> Results;
29406     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
29407                             Results);
29408     return DAG.getMergeValues(Results, dl);
29409   }
29410   // Read Performance Monitoring Counters.
29411   case RDPMC:
29412   // Read Processor Register.
29413   case RDPRU:
29414   // GetExtended Control Register.
29415   case XGETBV: {
29416     SmallVector<SDValue, 2> Results;
29417
29418     // RDPMC uses ECX to select the index of the performance counter to read.
29419     // RDPRU uses ECX to select the processor register to read.
29420     // XGETBV uses ECX to select the index of the XCR register to return.
29421     // The result is stored into registers EDX:EAX.
29422     expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
29423                                 Subtarget, Results);
29424     return DAG.getMergeValues(Results, dl);
29425   }
29426   // XTEST intrinsics.
29427   case XTEST: {
29428     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
29429     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29430
29431     SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
29432     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
29433     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
29434                        Ret, SDValue(InTrans.getNode(), 1));
29435   }
29436   case TRUNCATE_TO_MEM_VI8:
29437   case TRUNCATE_TO_MEM_VI16:
29438   case TRUNCATE_TO_MEM_VI32: {
29439     SDValue Mask = Op.getOperand(4);
29440     SDValue DataToTruncate = Op.getOperand(3);
29441     SDValue Addr = Op.getOperand(2);
29442     SDValue Chain = Op.getOperand(0);
29443
29444     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
29445     assert(MemIntr && "Expected MemIntrinsicSDNode!");
29446
29447     EVT MemVT  = MemIntr->getMemoryVT();
29448
29449     uint16_t TruncationOp = IntrData->Opc0;
29450     switch (TruncationOp) {
29451     case X86ISD::VTRUNC: {
29452       if (isAllOnesConstant(Mask)) // return just a truncate store
29453         return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
29454                                  MemIntr->getMemOperand());
29455
29456       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29457       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29458       SDValue Offset = DAG.getUNDEF(VMask.getValueType());
29459
29460       return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
29461                                 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
29462                                 true /* truncating */);
29463     }
29464     case X86ISD::VTRUNCUS:
29465     case X86ISD::VTRUNCS: {
29466       bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
29467       if (isAllOnesConstant(Mask))
29468         return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
29469                                MemIntr->getMemOperand(), DAG);
29470
29471       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29472       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29473
29474       return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
29475                                    VMask, MemVT, MemIntr->getMemOperand(), DAG);
29476     }
29477     default:
29478       llvm_unreachable("Unsupported truncstore intrinsic");
29479     }
29480   }
29481   }
29482 }
29483
29484 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
29485                                            SelectionDAG &DAG) const {
29486   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
29487   MFI.setReturnAddressIsTaken(true);
29488
29489   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
29490     return SDValue();
29491
29492   unsigned Depth = Op.getConstantOperandVal(0);
29493   SDLoc dl(Op);
29494   EVT PtrVT = getPointerTy(DAG.getDataLayout());
29495
29496   if (Depth > 0) {
29497     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
29498     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29499     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
29500     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
29501                        DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
29502                        MachinePointerInfo());
29503   }
29504
29505   // Just load the return address.
29506   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
29507   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
29508                      MachinePointerInfo());
29509 }
29510
29511 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
29512                                                  SelectionDAG &DAG) const {
29513   DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
29514   return getReturnAddressFrameIndex(DAG);
29515 }
29516
29517 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
29518   MachineFunction &MF = DAG.getMachineFunction();
29519   MachineFrameInfo &MFI = MF.getFrameInfo();
29520   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
29521   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29522   EVT VT = Op.getValueType();
29523
29524   MFI.setFrameAddressIsTaken(true);
29525
29526   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
29527     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
29528     // is not possible to crawl up the stack without looking at the unwind codes
29529     // simultaneously.
29530     int FrameAddrIndex = FuncInfo->getFAIndex();
29531     if (!FrameAddrIndex) {
29532       // Set up a frame object for the return address.
29533       unsigned SlotSize = RegInfo->getSlotSize();
29534       FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
29535           SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
29536       FuncInfo->setFAIndex(FrameAddrIndex);
29537     }
29538     return DAG.getFrameIndex(FrameAddrIndex, VT);
29539   }
29540
29541   unsigned FrameReg =
29542       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
29543   SDLoc dl(Op);  // FIXME probably not meaningful
29544   unsigned Depth = Op.getConstantOperandVal(0);
29545   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
29546           (FrameReg == X86::EBP && VT == MVT::i32)) &&
29547          "Invalid Frame Register!");
29548   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
29549   while (Depth--)
29550     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
29551                             MachinePointerInfo());
29552   return FrameAddr;
29553 }
29554
29555 // FIXME? Maybe this could be a TableGen attribute on some registers and
29556 // this table could be generated automatically from RegInfo.
29557 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
29558                                               const MachineFunction &MF) const {
29559   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
29560
29561   Register Reg = StringSwitch<unsigned>(RegName)
29562                        .Case("esp", X86::ESP)
29563                        .Case("rsp", X86::RSP)
29564                        .Case("ebp", X86::EBP)
29565                        .Case("rbp", X86::RBP)
29566                        .Default(0);
29567
29568   if (Reg == X86::EBP || Reg == X86::RBP) {
29569     if (!TFI.hasFP(MF))
29570       report_fatal_error("register " + StringRef(RegName) +
29571                          " is allocatable: function has no frame pointer");
29572 #ifndef NDEBUG
29573     else {
29574       const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29575       Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
29576       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
29577              "Invalid Frame Register!");
29578     }
29579 #endif
29580   }
29581
29582   if (Reg)
29583     return Reg;
29584
29585   report_fatal_error("Invalid register name global variable");
29586 }
29587
29588 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
29589                                                      SelectionDAG &DAG) const {
29590   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29591   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
29592 }
29593
29594 Register X86TargetLowering::getExceptionPointerRegister(
29595     const Constant *PersonalityFn) const {
29596   if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
29597     return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29598
29599   return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
29600 }
29601
29602 Register X86TargetLowering::getExceptionSelectorRegister(
29603     const Constant *PersonalityFn) const {
29604   // Funclet personalities don't use selectors (the runtime does the selection).
29605   if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
29606     return X86::NoRegister;
29607   return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29608 }
29609
29610 bool X86TargetLowering::needsFixedCatchObjects() const {
29611   return Subtarget.isTargetWin64();
29612 }
29613
29614 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
29615   SDValue Chain     = Op.getOperand(0);
29616   SDValue Offset    = Op.getOperand(1);
29617   SDValue Handler   = Op.getOperand(2);
29618   SDLoc dl      (Op);
29619
29620   EVT PtrVT = getPointerTy(DAG.getDataLayout());
29621   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29622   Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
29623   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
29624           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
29625          "Invalid Frame Register!");
29626   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
29627   Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
29628
29629   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
29630                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
29631                                                        dl));
29632   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
29633   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
29634   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
29635
29636   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
29637                      DAG.getRegister(StoreAddrReg, PtrVT));
29638 }
29639
29640 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
29641                                                SelectionDAG &DAG) const {
29642   SDLoc DL(Op);
29643   // If the subtarget is not 64bit, we may need the global base reg
29644   // after isel expand pseudo, i.e., after CGBR pass ran.
29645   // Therefore, ask for the GlobalBaseReg now, so that the pass
29646   // inserts the code for us in case we need it.
29647   // Otherwise, we will end up in a situation where we will
29648   // reference a virtual register that is not defined!
29649   if (!Subtarget.is64Bit()) {
29650     const X86InstrInfo *TII = Subtarget.getInstrInfo();
29651     (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
29652   }
29653   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
29654                      DAG.getVTList(MVT::i32, MVT::Other),
29655                      Op.getOperand(0), Op.getOperand(1));
29656 }
29657
29658 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
29659                                                 SelectionDAG &DAG) const {
29660   SDLoc DL(Op);
29661   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
29662                      Op.getOperand(0), Op.getOperand(1));
29663 }
29664
29665 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
29666                                                        SelectionDAG &DAG) const {
29667   SDLoc DL(Op);
29668   return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
29669                      Op.getOperand(0));
29670 }
29671
29672 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
29673   return Op.getOperand(0);
29674 }
29675
29676 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
29677                                                 SelectionDAG &DAG) const {
29678   SDValue Root = Op.getOperand(0);
29679   SDValue Trmp = Op.getOperand(1); // trampoline
29680   SDValue FPtr = Op.getOperand(2); // nested function
29681   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
29682   SDLoc dl (Op);
29683
29684   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
29685   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
29686
29687   if (Subtarget.is64Bit()) {
29688     SDValue OutChains[6];
29689
29690     // Large code-model.
29691     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
29692     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
29693
29694     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
29695     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
29696
29697     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
29698
29699     // Load the pointer to the nested function into R11.
29700     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
29701     SDValue Addr = Trmp;
29702     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29703                                 Addr, MachinePointerInfo(TrmpAddr));
29704
29705     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29706                        DAG.getConstant(2, dl, MVT::i64));
29707     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
29708                                 MachinePointerInfo(TrmpAddr, 2), Align(2));
29709
29710     // Load the 'nest' parameter value into R10.
29711     // R10 is specified in X86CallingConv.td
29712     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
29713     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29714                        DAG.getConstant(10, dl, MVT::i64));
29715     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29716                                 Addr, MachinePointerInfo(TrmpAddr, 10));
29717
29718     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29719                        DAG.getConstant(12, dl, MVT::i64));
29720     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
29721                                 MachinePointerInfo(TrmpAddr, 12), Align(2));
29722
29723     // Jump to the nested function.
29724     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
29725     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29726                        DAG.getConstant(20, dl, MVT::i64));
29727     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29728                                 Addr, MachinePointerInfo(TrmpAddr, 20));
29729
29730     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
29731     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29732                        DAG.getConstant(22, dl, MVT::i64));
29733     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
29734                                 Addr, MachinePointerInfo(TrmpAddr, 22));
29735
29736     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29737   } else {
29738     const Function *Func =
29739       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
29740     CallingConv::ID CC = Func->getCallingConv();
29741     unsigned NestReg;
29742
29743     switch (CC) {
29744     default:
29745       llvm_unreachable("Unsupported calling convention");
29746     case CallingConv::C:
29747     case CallingConv::X86_StdCall: {
29748       // Pass 'nest' parameter in ECX.
29749       // Must be kept in sync with X86CallingConv.td
29750       NestReg = X86::ECX;
29751
29752       // Check that ECX wasn't needed by an 'inreg' parameter.
29753       FunctionType *FTy = Func->getFunctionType();
29754       const AttributeList &Attrs = Func->getAttributes();
29755
29756       if (!Attrs.isEmpty() && !Func->isVarArg()) {
29757         unsigned InRegCount = 0;
29758         unsigned Idx = 0;
29759
29760         for (FunctionType::param_iterator I = FTy->param_begin(),
29761              E = FTy->param_end(); I != E; ++I, ++Idx)
29762           if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
29763             const DataLayout &DL = DAG.getDataLayout();
29764             // FIXME: should only count parameters that are lowered to integers.
29765             InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
29766           }
29767
29768         if (InRegCount > 2) {
29769           report_fatal_error("Nest register in use - reduce number of inreg"
29770                              " parameters!");
29771         }
29772       }
29773       break;
29774     }
29775     case CallingConv::X86_FastCall:
29776     case CallingConv::X86_ThisCall:
29777     case CallingConv::Fast:
29778     case CallingConv::Tail:
29779     case CallingConv::SwiftTail:
29780       // Pass 'nest' parameter in EAX.
29781       // Must be kept in sync with X86CallingConv.td
29782       NestReg = X86::EAX;
29783       break;
29784     }
29785
29786     SDValue OutChains[4];
29787     SDValue Addr, Disp;
29788
29789     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29790                        DAG.getConstant(10, dl, MVT::i32));
29791     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
29792
29793     // This is storing the opcode for MOV32ri.
29794     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
29795     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
29796     OutChains[0] =
29797         DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
29798                      Trmp, MachinePointerInfo(TrmpAddr));
29799
29800     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29801                        DAG.getConstant(1, dl, MVT::i32));
29802     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
29803                                 MachinePointerInfo(TrmpAddr, 1), Align(1));
29804
29805     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
29806     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29807                        DAG.getConstant(5, dl, MVT::i32));
29808     OutChains[2] =
29809         DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
29810                      MachinePointerInfo(TrmpAddr, 5), Align(1));
29811
29812     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29813                        DAG.getConstant(6, dl, MVT::i32));
29814     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
29815                                 MachinePointerInfo(TrmpAddr, 6), Align(1));
29816
29817     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29818   }
29819 }
29820
29821 SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
29822                                              SelectionDAG &DAG) const {
29823   /*
29824    The rounding mode is in bits 11:10 of FPSR, and has the following
29825    settings:
29826      00 Round to nearest
29827      01 Round to -inf
29828      10 Round to +inf
29829      11 Round to 0
29830
29831   GET_ROUNDING, on the other hand, expects the following:
29832     -1 Undefined
29833      0 Round to 0
29834      1 Round to nearest
29835      2 Round to +inf
29836      3 Round to -inf
29837
29838   To perform the conversion, we use a packed lookup table of the four 2-bit
29839   values that we can index by FPSP[11:10]
29840     0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
29841
29842     (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
29843   */
29844
29845   MachineFunction &MF = DAG.getMachineFunction();
29846   MVT VT = Op.getSimpleValueType();
29847   SDLoc DL(Op);
29848
29849   // Save FP Control Word to stack slot
29850   int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
29851   SDValue StackSlot =
29852       DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
29853
29854   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
29855
29856   SDValue Chain = Op.getOperand(0);
29857   SDValue Ops[] = {Chain, StackSlot};
29858   Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
29859                                   DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
29860                                   Align(2), MachineMemOperand::MOStore);
29861
29862   // Load FP Control Word from stack slot
29863   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
29864   Chain = CWD.getValue(1);
29865
29866   // Mask and turn the control bits into a shift for the lookup table.
29867   SDValue Shift =
29868     DAG.getNode(ISD::SRL, DL, MVT::i16,
29869                 DAG.getNode(ISD::AND, DL, MVT::i16,
29870                             CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
29871                 DAG.getConstant(9, DL, MVT::i8));
29872   Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
29873
29874   SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
29875   SDValue RetVal =
29876     DAG.getNode(ISD::AND, DL, MVT::i32,
29877                 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
29878                 DAG.getConstant(3, DL, MVT::i32));
29879
29880   RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
29881
29882   return DAG.getMergeValues({RetVal, Chain}, DL);
29883 }
29884
29885 SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
29886                                              SelectionDAG &DAG) const {
29887   MachineFunction &MF = DAG.getMachineFunction();
29888   SDLoc DL(Op);
29889   SDValue Chain = Op.getNode()->getOperand(0);
29890
29891   // FP control word may be set only from data in memory. So we need to allocate
29892   // stack space to save/load FP control word.
29893   int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
29894   SDValue StackSlot =
29895       DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
29896   MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
29897   MachineMemOperand *MMO =
29898       MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
29899
29900   // Store FP control word into memory.
29901   SDValue Ops[] = {Chain, StackSlot};
29902   Chain = DAG.getMemIntrinsicNode(
29903       X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
29904
29905   // Load FP Control Word from stack slot and clear RM field (bits 11:10).
29906   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
29907   Chain = CWD.getValue(1);
29908   CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
29909                     DAG.getConstant(0xf3ff, DL, MVT::i16));
29910
29911   // Calculate new rounding mode.
29912   SDValue NewRM = Op.getNode()->getOperand(1);
29913   SDValue RMBits;
29914   if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
29915     uint64_t RM = CVal->getZExtValue();
29916     int FieldVal;
29917     switch (static_cast<RoundingMode>(RM)) {
29918     case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
29919     case RoundingMode::TowardNegative:    FieldVal = X86::rmDownward; break;
29920     case RoundingMode::TowardPositive:    FieldVal = X86::rmUpward; break;
29921     case RoundingMode::TowardZero:        FieldVal = X86::rmTowardZero; break;
29922     default:
29923       llvm_unreachable("rounding mode is not supported by X86 hardware");
29924     }
29925     RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
29926   } else {
29927     // Need to convert argument into bits of control word:
29928     //    0 Round to 0       -> 11
29929     //    1 Round to nearest -> 00
29930     //    2 Round to +inf    -> 10
29931     //    3 Round to -inf    -> 01
29932     // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
29933     // To make the conversion, put all these values into a value 0xc9 and shift
29934     // it left depending on the rounding mode:
29935     //    (0xc9 << 4) & 0xc00 = X86::rmTowardZero
29936     //    (0xc9 << 6) & 0xc00 = X86::rmToNearest
29937     //    ...
29938     // (0xc9 << (2 * NewRM + 4)) & 0xc00
29939     SDValue ShiftValue =
29940         DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
29941                     DAG.getNode(ISD::ADD, DL, MVT::i32,
29942                                 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
29943                                             DAG.getConstant(1, DL, MVT::i8)),
29944                                 DAG.getConstant(4, DL, MVT::i32)));
29945     SDValue Shifted =
29946         DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
29947                     ShiftValue);
29948     RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
29949                          DAG.getConstant(0xc00, DL, MVT::i16));
29950   }
29951
29952   // Update rounding mode bits and store the new FP Control Word into stack.
29953   CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
29954   Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
29955
29956   // Load FP control word from the slot.
29957   SDValue OpsLD[] = {Chain, StackSlot};
29958   MachineMemOperand *MMOL =
29959       MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
29960   Chain = DAG.getMemIntrinsicNode(
29961       X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
29962
29963   // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
29964   // same way but in bits 14:13.
29965   if (Subtarget.hasSSE1()) {
29966     // Store MXCSR into memory.
29967     Chain = DAG.getNode(
29968         ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29969         DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
29970         StackSlot);
29971
29972     // Load MXCSR from stack slot and clear RM field (bits 14:13).
29973     SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
29974     Chain = CWD.getValue(1);
29975     CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
29976                       DAG.getConstant(0xffff9fff, DL, MVT::i32));
29977
29978     // Shift X87 RM bits from 11:10 to 14:13.
29979     RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
29980     RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
29981                          DAG.getConstant(3, DL, MVT::i8));
29982
29983     // Update rounding mode bits and store the new FP Control Word into stack.
29984     CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
29985     Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
29986
29987     // Load MXCSR from the slot.
29988     Chain = DAG.getNode(
29989         ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29990         DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
29991         StackSlot);
29992   }
29993
29994   return Chain;
29995 }
29996
29997 /// Lower a vector CTLZ using native supported vector CTLZ instruction.
29998 //
29999 // i8/i16 vector implemented using dword LZCNT vector instruction
30000 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
30001 // split the vector, perform operation on it's Lo a Hi part and
30002 // concatenate the results.
30003 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
30004                                          const X86Subtarget &Subtarget) {
30005   assert(Op.getOpcode() == ISD::CTLZ);
30006   SDLoc dl(Op);
30007   MVT VT = Op.getSimpleValueType();
30008   MVT EltVT = VT.getVectorElementType();
30009   unsigned NumElems = VT.getVectorNumElements();
30010
30011   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
30012           "Unsupported element type");
30013
30014   // Split vector, it's Lo and Hi parts will be handled in next iteration.
30015   if (NumElems > 16 ||
30016       (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
30017     return splitVectorIntUnary(Op, DAG);
30018
30019   MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
30020   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
30021           "Unsupported value type for operation");
30022
30023   // Use native supported vector instruction vplzcntd.
30024   Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
30025   SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
30026   SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
30027   SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
30028
30029   return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
30030 }
30031
30032 // Lower CTLZ using a PSHUFB lookup table implementation.
30033 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
30034                                        const X86Subtarget &Subtarget,
30035                                        SelectionDAG &DAG) {
30036   MVT VT = Op.getSimpleValueType();
30037   int NumElts = VT.getVectorNumElements();
30038   int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
30039   MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
30040
30041   // Per-nibble leading zero PSHUFB lookup table.
30042   const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
30043                        /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
30044                        /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
30045                        /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
30046
30047   SmallVector<SDValue, 64> LUTVec;
30048   for (int i = 0; i < NumBytes; ++i)
30049     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
30050   SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
30051
30052   // Begin by bitcasting the input to byte vector, then split those bytes
30053   // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
30054   // If the hi input nibble is zero then we add both results together, otherwise
30055   // we just take the hi result (by masking the lo result to zero before the
30056   // add).
30057   SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
30058   SDValue Zero = DAG.getConstant(0, DL, CurrVT);
30059
30060   SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
30061   SDValue Lo = Op0;
30062   SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
30063   SDValue HiZ;
30064   if (CurrVT.is512BitVector()) {
30065     MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
30066     HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
30067     HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
30068   } else {
30069     HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
30070   }
30071
30072   Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
30073   Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
30074   Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
30075   SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
30076
30077   // Merge result back from vXi8 back to VT, working on the lo/hi halves
30078   // of the current vector width in the same way we did for the nibbles.
30079   // If the upper half of the input element is zero then add the halves'
30080   // leading zero counts together, otherwise just use the upper half's.
30081   // Double the width of the result until we are at target width.
30082   while (CurrVT != VT) {
30083     int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
30084     int CurrNumElts = CurrVT.getVectorNumElements();
30085     MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
30086     MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
30087     SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
30088
30089     // Check if the upper half of the input element is zero.
30090     if (CurrVT.is512BitVector()) {
30091       MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
30092       HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
30093                          DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
30094       HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
30095     } else {
30096       HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
30097                          DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
30098     }
30099     HiZ = DAG.getBitcast(NextVT, HiZ);
30100
30101     // Move the upper/lower halves to the lower bits as we'll be extending to
30102     // NextVT. Mask the lower result to zero if HiZ is true and add the results
30103     // together.
30104     SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
30105     SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
30106     SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
30107     R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
30108     Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
30109     CurrVT = NextVT;
30110   }
30111
30112   return Res;
30113 }
30114
30115 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
30116                                const X86Subtarget &Subtarget,
30117                                SelectionDAG &DAG) {
30118   MVT VT = Op.getSimpleValueType();
30119
30120   if (Subtarget.hasCDI() &&
30121       // vXi8 vectors need to be promoted to 512-bits for vXi32.
30122       (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
30123     return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
30124
30125   // Decompose 256-bit ops into smaller 128-bit ops.
30126   if (VT.is256BitVector() && !Subtarget.hasInt256())
30127     return splitVectorIntUnary(Op, DAG);
30128
30129   // Decompose 512-bit ops into smaller 256-bit ops.
30130   if (VT.is512BitVector() && !Subtarget.hasBWI())
30131     return splitVectorIntUnary(Op, DAG);
30132
30133   assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
30134   return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
30135 }
30136
30137 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
30138                          SelectionDAG &DAG) {
30139   MVT VT = Op.getSimpleValueType();
30140   MVT OpVT = VT;
30141   unsigned NumBits = VT.getSizeInBits();
30142   SDLoc dl(Op);
30143   unsigned Opc = Op.getOpcode();
30144
30145   if (VT.isVector())
30146     return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
30147
30148   Op = Op.getOperand(0);
30149   if (VT == MVT::i8) {
30150     // Zero extend to i32 since there is not an i8 bsr.
30151     OpVT = MVT::i32;
30152     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
30153   }
30154
30155   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
30156   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
30157   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
30158
30159   if (Opc == ISD::CTLZ) {
30160     // If src is zero (i.e. bsr sets ZF), returns NumBits.
30161     SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
30162                      DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
30163                      Op.getValue(1)};
30164     Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
30165   }
30166
30167   // Finally xor with NumBits-1.
30168   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
30169                    DAG.getConstant(NumBits - 1, dl, OpVT));
30170
30171   if (VT == MVT::i8)
30172     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
30173   return Op;
30174 }
30175
30176 static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
30177                          SelectionDAG &DAG) {
30178   MVT VT = Op.getSimpleValueType();
30179   unsigned NumBits = VT.getScalarSizeInBits();
30180   SDValue N0 = Op.getOperand(0);
30181   SDLoc dl(Op);
30182
30183   assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
30184          "Only scalar CTTZ requires custom lowering");
30185
30186   // Issue a bsf (scan bits forward) which also sets EFLAGS.
30187   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
30188   Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
30189
30190   // If src is known never zero we can skip the CMOV.
30191   if (DAG.isKnownNeverZero(N0))
30192     return Op;
30193
30194   // If src is zero (i.e. bsf sets ZF), returns NumBits.
30195   SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
30196                    DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
30197                    Op.getValue(1)};
30198   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
30199 }
30200
30201 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
30202                            const X86Subtarget &Subtarget) {
30203   MVT VT = Op.getSimpleValueType();
30204   if (VT == MVT::i16 || VT == MVT::i32)
30205     return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
30206
30207   if (VT == MVT::v32i16 || VT == MVT::v64i8)
30208     return splitVectorIntBinary(Op, DAG);
30209
30210   assert(Op.getSimpleValueType().is256BitVector() &&
30211          Op.getSimpleValueType().isInteger() &&
30212          "Only handle AVX 256-bit vector integer operation");
30213   return splitVectorIntBinary(Op, DAG);
30214 }
30215
30216 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
30217                                   const X86Subtarget &Subtarget) {
30218   MVT VT = Op.getSimpleValueType();
30219   SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
30220   unsigned Opcode = Op.getOpcode();
30221   SDLoc DL(Op);
30222
30223   if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
30224       (VT.is256BitVector() && !Subtarget.hasInt256())) {
30225     assert(Op.getSimpleValueType().isInteger() &&
30226            "Only handle AVX vector integer operation");
30227     return splitVectorIntBinary(Op, DAG);
30228   }
30229
30230   // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
30231   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30232   EVT SetCCResultType =
30233       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30234
30235   unsigned BitWidth = VT.getScalarSizeInBits();
30236   if (Opcode == ISD::USUBSAT) {
30237     if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
30238       // Handle a special-case with a bit-hack instead of cmp+select:
30239       // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
30240       // If the target can use VPTERNLOG, DAGToDAG will match this as
30241       // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
30242       // "broadcast" constant load.
30243       ConstantSDNode *C = isConstOrConstSplat(Y, true);
30244       if (C && C->getAPIntValue().isSignMask()) {
30245         SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
30246         SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
30247         SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
30248         SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
30249         return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
30250       }
30251     }
30252     if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
30253       // usubsat X, Y --> (X >u Y) ? X - Y : 0
30254       SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
30255       SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
30256       // TODO: Move this to DAGCombiner?
30257       if (SetCCResultType == VT &&
30258           DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
30259         return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
30260       return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
30261     }
30262   }
30263
30264   if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
30265       (!VT.isVector() || VT == MVT::v2i64)) {
30266     APInt MinVal = APInt::getSignedMinValue(BitWidth);
30267     APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
30268     SDValue Zero = DAG.getConstant(0, DL, VT);
30269     SDValue Result =
30270         DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
30271                     DAG.getVTList(VT, SetCCResultType), X, Y);
30272     SDValue SumDiff = Result.getValue(0);
30273     SDValue Overflow = Result.getValue(1);
30274     SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
30275     SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
30276     SDValue SumNeg =
30277         DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
30278     Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
30279     return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
30280   }
30281
30282   // Use default expansion.
30283   return SDValue();
30284 }
30285
30286 static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
30287                         SelectionDAG &DAG) {
30288   MVT VT = Op.getSimpleValueType();
30289   if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
30290     // Since X86 does not have CMOV for 8-bit integer, we don't convert
30291     // 8-bit integer abs to NEG and CMOV.
30292     SDLoc DL(Op);
30293     SDValue N0 = Op.getOperand(0);
30294     SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
30295                               DAG.getConstant(0, DL, VT), N0);
30296     SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
30297                      SDValue(Neg.getNode(), 1)};
30298     return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
30299   }
30300
30301   // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
30302   if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
30303     SDLoc DL(Op);
30304     SDValue Src = Op.getOperand(0);
30305     SDValue Sub =
30306         DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
30307     return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
30308   }
30309
30310   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
30311     assert(VT.isInteger() &&
30312            "Only handle AVX 256-bit vector integer operation");
30313     return splitVectorIntUnary(Op, DAG);
30314   }
30315
30316   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30317     return splitVectorIntUnary(Op, DAG);
30318
30319   // Default to expand.
30320   return SDValue();
30321 }
30322
30323 static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
30324                         SelectionDAG &DAG) {
30325   MVT VT = Op.getSimpleValueType();
30326
30327   // For AVX1 cases, split to use legal ops.
30328   if (VT.is256BitVector() && !Subtarget.hasInt256())
30329     return splitVectorIntBinary(Op, DAG);
30330
30331   if (VT == MVT::v32i16 || VT == MVT::v64i8)
30332     return splitVectorIntBinary(Op, DAG);
30333
30334   // Default to expand.
30335   return SDValue();
30336 }
30337
30338 static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
30339                            SelectionDAG &DAG) {
30340   MVT VT = Op.getSimpleValueType();
30341
30342   // For AVX1 cases, split to use legal ops.
30343   if (VT.is256BitVector() && !Subtarget.hasInt256())
30344     return splitVectorIntBinary(Op, DAG);
30345
30346   if (VT == MVT::v32i16 || VT == MVT::v64i8)
30347     return splitVectorIntBinary(Op, DAG);
30348
30349   // Default to expand.
30350   return SDValue();
30351 }
30352
30353 static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
30354                                       SelectionDAG &DAG) {
30355   assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) &&
30356          "Expected FMAXIMUM or FMINIMUM opcode");
30357   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30358   EVT VT = Op.getValueType();
30359   SDValue X = Op.getOperand(0);
30360   SDValue Y = Op.getOperand(1);
30361   SDLoc DL(Op);
30362   uint64_t SizeInBits = VT.getScalarSizeInBits();
30363   APInt PreferredZero = APInt::getZero(SizeInBits);
30364   APInt OppositeZero = PreferredZero;
30365   EVT IVT = VT.changeTypeToInteger();
30366   X86ISD::NodeType MinMaxOp;
30367   if (Op.getOpcode() == ISD::FMAXIMUM) {
30368     MinMaxOp = X86ISD::FMAX;
30369     OppositeZero.setSignBit();
30370   } else {
30371     PreferredZero.setSignBit();
30372     MinMaxOp = X86ISD::FMIN;
30373   }
30374   EVT SetCCType =
30375       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30376
30377   // The tables below show the expected result of Max in cases of NaN and
30378   // signed zeros.
30379   //
30380   //                 Y                       Y
30381   //             Num   xNaN              +0     -0
30382   //          ---------------         ---------------
30383   //     Num  |  Max |   Y  |     +0  |  +0  |  +0  |
30384   // X        ---------------  X      ---------------
30385   //    xNaN  |   X  |  X/Y |     -0  |  +0  |  -0  |
30386   //          ---------------         ---------------
30387   //
30388   // It is achieved by means of FMAX/FMIN with preliminary checks and operand
30389   // reordering.
30390   //
30391   // We check if any of operands is NaN and return NaN. Then we check if any of
30392   // operands is zero or negative zero (for fmaximum and fminimum respectively)
30393   // to ensure the correct zero is returned.
30394   auto MatchesZero = [](SDValue Op, APInt Zero) {
30395     Op = peekThroughBitcasts(Op);
30396     if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
30397       return CstOp->getValueAPF().bitcastToAPInt() == Zero;
30398     if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
30399       return CstOp->getAPIntValue() == Zero;
30400     if (Op->getOpcode() == ISD::BUILD_VECTOR ||
30401         Op->getOpcode() == ISD::SPLAT_VECTOR) {
30402       for (const SDValue &OpVal : Op->op_values()) {
30403         if (OpVal.isUndef())
30404           continue;
30405         auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
30406         if (!CstOp)
30407           return false;
30408         if (!CstOp->getValueAPF().isZero())
30409           continue;
30410         if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
30411           return false;
30412       }
30413       return true;
30414     }
30415     return false;
30416   };
30417
30418   bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
30419   bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
30420   bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
30421                           Op->getFlags().hasNoSignedZeros() ||
30422                           DAG.isKnownNeverZeroFloat(X) ||
30423                           DAG.isKnownNeverZeroFloat(Y);
30424   SDValue NewX, NewY;
30425   if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
30426       MatchesZero(X, OppositeZero)) {
30427     // Operands are already in right order or order does not matter.
30428     NewX = X;
30429     NewY = Y;
30430   } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
30431     NewX = Y;
30432     NewY = X;
30433   } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
30434              (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
30435     if (IsXNeverNaN)
30436       std::swap(X, Y);
30437     // VFPCLASSS consumes a vector type. So provide a minimal one corresponded
30438     // xmm register.
30439     MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
30440     SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
30441     // Bits of classes:
30442     // Bits  Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4]  Imm8[5]  Imm8[6] Imm8[7]
30443     // Class    QNAN PosZero NegZero  PosINF  NegINF Denormal Negative    SNAN
30444     SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
30445                                         DL, MVT::i32);
30446     SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
30447     SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
30448                               DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
30449                               DAG.getIntPtrConstant(0, DL));
30450     SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
30451     NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
30452     NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
30453     return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
30454   } else {
30455     SDValue IsXSigned;
30456     if (Subtarget.is64Bit() || VT != MVT::f64) {
30457       SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
30458       SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
30459       IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
30460     } else {
30461       assert(VT == MVT::f64);
30462       SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
30463                                 DAG.getConstantFP(0, DL, MVT::v2f64), X,
30464                                 DAG.getIntPtrConstant(0, DL));
30465       SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
30466       SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
30467                                DAG.getIntPtrConstant(1, DL));
30468       Hi = DAG.getBitcast(MVT::i32, Hi);
30469       SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
30470       EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
30471                                              *DAG.getContext(), MVT::i32);
30472       IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
30473     }
30474     if (MinMaxOp == X86ISD::FMAX) {
30475       NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
30476       NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
30477     } else {
30478       NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
30479       NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
30480     }
30481   }
30482
30483   bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
30484                    Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
30485
30486   // If we did no ordering operands for singed zero handling and we need
30487   // to process NaN and we know that the second operand is not NaN then put
30488   // it in first operand and we will not need to post handle NaN after max/min.
30489   if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
30490     std::swap(NewX, NewY);
30491
30492   SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
30493
30494   if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
30495     return MinMax;
30496
30497   SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
30498   return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
30499 }
30500
30501 static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
30502                         SelectionDAG &DAG) {
30503   MVT VT = Op.getSimpleValueType();
30504
30505   // For AVX1 cases, split to use legal ops.
30506   if (VT.is256BitVector() && !Subtarget.hasInt256())
30507     return splitVectorIntBinary(Op, DAG);
30508
30509   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
30510     return splitVectorIntBinary(Op, DAG);
30511
30512   SDLoc dl(Op);
30513   bool IsSigned = Op.getOpcode() == ISD::ABDS;
30514   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30515
30516   // TODO: Move to TargetLowering expandABD() once we have ABD promotion.
30517   if (VT.isScalarInteger()) {
30518     unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
30519     MVT WideVT = MVT::getIntegerVT(WideBits);
30520     if (TLI.isTypeLegal(WideVT)) {
30521       // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
30522       // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
30523       unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30524       SDValue LHS = DAG.getFreeze(Op.getOperand(0));
30525       SDValue RHS = DAG.getFreeze(Op.getOperand(1));
30526       LHS = DAG.getNode(ExtOpc, dl, WideVT, LHS);
30527       RHS = DAG.getNode(ExtOpc, dl, WideVT, RHS);
30528       SDValue Diff = DAG.getNode(ISD::SUB, dl, WideVT, LHS, RHS);
30529       SDValue AbsDiff = DAG.getNode(ISD::ABS, dl, WideVT, Diff);
30530       return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
30531     }
30532   }
30533
30534   // TODO: Move to TargetLowering expandABD().
30535   if (!Subtarget.hasSSE41() &&
30536       ((IsSigned && VT == MVT::v16i8) || VT == MVT::v4i32)) {
30537     SDValue LHS = DAG.getFreeze(Op.getOperand(0));
30538     SDValue RHS = DAG.getFreeze(Op.getOperand(1));
30539     ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
30540     SDValue Cmp = DAG.getSetCC(dl, VT, LHS, RHS, CC);
30541     SDValue Diff0 = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
30542     SDValue Diff1 = DAG.getNode(ISD::SUB, dl, VT, RHS, LHS);
30543     return getBitSelect(dl, VT, Diff0, Diff1, Cmp, DAG);
30544   }
30545
30546   // Default to expand.
30547   return SDValue();
30548 }
30549
30550 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
30551                         SelectionDAG &DAG) {
30552   SDLoc dl(Op);
30553   MVT VT = Op.getSimpleValueType();
30554
30555   // Decompose 256-bit ops into 128-bit ops.
30556   if (VT.is256BitVector() && !Subtarget.hasInt256())
30557     return splitVectorIntBinary(Op, DAG);
30558
30559   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30560     return splitVectorIntBinary(Op, DAG);
30561
30562   SDValue A = Op.getOperand(0);
30563   SDValue B = Op.getOperand(1);
30564
30565   // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
30566   // vector pairs, multiply and truncate.
30567   if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
30568     unsigned NumElts = VT.getVectorNumElements();
30569
30570     if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30571         (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30572       MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
30573       return DAG.getNode(
30574           ISD::TRUNCATE, dl, VT,
30575           DAG.getNode(ISD::MUL, dl, ExVT,
30576                       DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
30577                       DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
30578     }
30579
30580     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30581
30582     // Extract the lo/hi parts to any extend to i16.
30583     // We're going to mask off the low byte of each result element of the
30584     // pmullw, so it doesn't matter what's in the high byte of each 16-bit
30585     // element.
30586     SDValue Undef = DAG.getUNDEF(VT);
30587     SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
30588     SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
30589
30590     SDValue BLo, BHi;
30591     if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30592       // If the RHS is a constant, manually unpackl/unpackh.
30593       SmallVector<SDValue, 16> LoOps, HiOps;
30594       for (unsigned i = 0; i != NumElts; i += 16) {
30595         for (unsigned j = 0; j != 8; ++j) {
30596           LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
30597                                                MVT::i16));
30598           HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
30599                                                MVT::i16));
30600         }
30601       }
30602
30603       BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30604       BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30605     } else {
30606       BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
30607       BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
30608     }
30609
30610     // Multiply, mask the lower 8bits of the lo/hi results and pack.
30611     SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
30612     SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
30613     return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30614   }
30615
30616   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
30617   if (VT == MVT::v4i32) {
30618     assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
30619            "Should not custom lower when pmulld is available!");
30620
30621     // Extract the odd parts.
30622     static const int UnpackMask[] = { 1, -1, 3, -1 };
30623     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
30624     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
30625
30626     // Multiply the even parts.
30627     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30628                                 DAG.getBitcast(MVT::v2i64, A),
30629                                 DAG.getBitcast(MVT::v2i64, B));
30630     // Now multiply odd parts.
30631     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30632                                DAG.getBitcast(MVT::v2i64, Aodds),
30633                                DAG.getBitcast(MVT::v2i64, Bodds));
30634
30635     Evens = DAG.getBitcast(VT, Evens);
30636     Odds = DAG.getBitcast(VT, Odds);
30637
30638     // Merge the two vectors back together with a shuffle. This expands into 2
30639     // shuffles.
30640     static const int ShufMask[] = { 0, 4, 2, 6 };
30641     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
30642   }
30643
30644   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
30645          "Only know how to lower V2I64/V4I64/V8I64 multiply");
30646   assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
30647
30648   //  Ahi = psrlqi(a, 32);
30649   //  Bhi = psrlqi(b, 32);
30650   //
30651   //  AloBlo = pmuludq(a, b);
30652   //  AloBhi = pmuludq(a, Bhi);
30653   //  AhiBlo = pmuludq(Ahi, b);
30654   //
30655   //  Hi = psllqi(AloBhi + AhiBlo, 32);
30656   //  return AloBlo + Hi;
30657   KnownBits AKnown = DAG.computeKnownBits(A);
30658   KnownBits BKnown = DAG.computeKnownBits(B);
30659
30660   APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
30661   bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
30662   bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
30663
30664   APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
30665   bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
30666   bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
30667
30668   SDValue Zero = DAG.getConstant(0, dl, VT);
30669
30670   // Only multiply lo/hi halves that aren't known to be zero.
30671   SDValue AloBlo = Zero;
30672   if (!ALoIsZero && !BLoIsZero)
30673     AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
30674
30675   SDValue AloBhi = Zero;
30676   if (!ALoIsZero && !BHiIsZero) {
30677     SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
30678     AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
30679   }
30680
30681   SDValue AhiBlo = Zero;
30682   if (!AHiIsZero && !BLoIsZero) {
30683     SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
30684     AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
30685   }
30686
30687   SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
30688   Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
30689
30690   return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
30691 }
30692
30693 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
30694                                      MVT VT, bool IsSigned,
30695                                      const X86Subtarget &Subtarget,
30696                                      SelectionDAG &DAG,
30697                                      SDValue *Low = nullptr) {
30698   unsigned NumElts = VT.getVectorNumElements();
30699
30700   // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
30701   // to a vXi16 type. Do the multiplies, shift the results and pack the half
30702   // lane results back together.
30703
30704   // We'll take different approaches for signed and unsigned.
30705   // For unsigned we'll use punpcklbw/punpckhbw to put zero extend the bytes
30706   // and use pmullw to calculate the full 16-bit product.
30707   // For signed we'll use punpcklbw/punpckbw to extend the bytes to words and
30708   // shift them left into the upper byte of each word. This allows us to use
30709   // pmulhw to calculate the full 16-bit product. This trick means we don't
30710   // need to sign extend the bytes to use pmullw.
30711
30712   MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30713   SDValue Zero = DAG.getConstant(0, dl, VT);
30714
30715   SDValue ALo, AHi;
30716   if (IsSigned) {
30717     ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
30718     AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
30719   } else {
30720     ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
30721     AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
30722   }
30723
30724   SDValue BLo, BHi;
30725   if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30726     // If the RHS is a constant, manually unpackl/unpackh and extend.
30727     SmallVector<SDValue, 16> LoOps, HiOps;
30728     for (unsigned i = 0; i != NumElts; i += 16) {
30729       for (unsigned j = 0; j != 8; ++j) {
30730         SDValue LoOp = B.getOperand(i + j);
30731         SDValue HiOp = B.getOperand(i + j + 8);
30732
30733         if (IsSigned) {
30734           LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
30735           HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
30736           LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
30737                              DAG.getConstant(8, dl, MVT::i16));
30738           HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
30739                              DAG.getConstant(8, dl, MVT::i16));
30740         } else {
30741           LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
30742           HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
30743         }
30744
30745         LoOps.push_back(LoOp);
30746         HiOps.push_back(HiOp);
30747       }
30748     }
30749
30750     BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30751     BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30752   } else if (IsSigned) {
30753     BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
30754     BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
30755   } else {
30756     BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
30757     BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
30758   }
30759
30760   // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
30761   // pack back to vXi8.
30762   unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
30763   SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
30764   SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
30765
30766   if (Low)
30767     *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30768
30769   return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
30770 }
30771
30772 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
30773                          SelectionDAG &DAG) {
30774   SDLoc dl(Op);
30775   MVT VT = Op.getSimpleValueType();
30776   bool IsSigned = Op->getOpcode() == ISD::MULHS;
30777   unsigned NumElts = VT.getVectorNumElements();
30778   SDValue A = Op.getOperand(0);
30779   SDValue B = Op.getOperand(1);
30780
30781   // Decompose 256-bit ops into 128-bit ops.
30782   if (VT.is256BitVector() && !Subtarget.hasInt256())
30783     return splitVectorIntBinary(Op, DAG);
30784
30785   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30786     return splitVectorIntBinary(Op, DAG);
30787
30788   if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
30789     assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
30790            (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
30791            (VT == MVT::v16i32 && Subtarget.hasAVX512()));
30792
30793     // PMULxD operations multiply each even value (starting at 0) of LHS with
30794     // the related value of RHS and produce a widen result.
30795     // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30796     // => <2 x i64> <ae|cg>
30797     //
30798     // In other word, to have all the results, we need to perform two PMULxD:
30799     // 1. one with the even values.
30800     // 2. one with the odd values.
30801     // To achieve #2, with need to place the odd values at an even position.
30802     //
30803     // Place the odd value at an even position (basically, shift all values 1
30804     // step to the left):
30805     const int Mask[] = {1, -1,  3, -1,  5, -1,  7, -1,
30806                         9, -1, 11, -1, 13, -1, 15, -1};
30807     // <a|b|c|d> => <b|undef|d|undef>
30808     SDValue Odd0 =
30809         DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
30810     // <e|f|g|h> => <f|undef|h|undef>
30811     SDValue Odd1 =
30812         DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
30813
30814     // Emit two multiplies, one for the lower 2 ints and one for the higher 2
30815     // ints.
30816     MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
30817     unsigned Opcode =
30818         (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
30819     // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30820     // => <2 x i64> <ae|cg>
30821     SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30822                                                   DAG.getBitcast(MulVT, A),
30823                                                   DAG.getBitcast(MulVT, B)));
30824     // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
30825     // => <2 x i64> <bf|dh>
30826     SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30827                                                   DAG.getBitcast(MulVT, Odd0),
30828                                                   DAG.getBitcast(MulVT, Odd1)));
30829
30830     // Shuffle it back into the right order.
30831     SmallVector<int, 16> ShufMask(NumElts);
30832     for (int i = 0; i != (int)NumElts; ++i)
30833       ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
30834
30835     SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
30836
30837     // If we have a signed multiply but no PMULDQ fix up the result of an
30838     // unsigned multiply.
30839     if (IsSigned && !Subtarget.hasSSE41()) {
30840       SDValue Zero = DAG.getConstant(0, dl, VT);
30841       SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
30842                                DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
30843       SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
30844                                DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
30845
30846       SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
30847       Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
30848     }
30849
30850     return Res;
30851   }
30852
30853   // Only i8 vectors should need custom lowering after this.
30854   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30855          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30856          "Unsupported vector type");
30857
30858   // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
30859   // logical shift down the upper half and pack back to i8.
30860
30861   // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
30862   // and then ashr/lshr the upper bits down to the lower bits before multiply.
30863
30864   if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30865       (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30866     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30867     unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30868     SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30869     SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30870     SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30871     Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30872     return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30873   }
30874
30875   return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
30876 }
30877
30878 // Custom lowering for SMULO/UMULO.
30879 static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
30880                          SelectionDAG &DAG) {
30881   MVT VT = Op.getSimpleValueType();
30882
30883   // Scalars defer to LowerXALUO.
30884   if (!VT.isVector())
30885     return LowerXALUO(Op, DAG);
30886
30887   SDLoc dl(Op);
30888   bool IsSigned = Op->getOpcode() == ISD::SMULO;
30889   SDValue A = Op.getOperand(0);
30890   SDValue B = Op.getOperand(1);
30891   EVT OvfVT = Op->getValueType(1);
30892
30893   if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
30894       (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
30895     // Extract the LHS Lo/Hi vectors
30896     SDValue LHSLo, LHSHi;
30897     std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
30898
30899     // Extract the RHS Lo/Hi vectors
30900     SDValue RHSLo, RHSHi;
30901     std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
30902
30903     EVT LoOvfVT, HiOvfVT;
30904     std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
30905     SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
30906     SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
30907
30908     // Issue the split operations.
30909     SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
30910     SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
30911
30912     // Join the separate data results and the overflow results.
30913     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30914     SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
30915                               Hi.getValue(1));
30916
30917     return DAG.getMergeValues({Res, Ovf}, dl);
30918   }
30919
30920   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30921   EVT SetccVT =
30922       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30923
30924   if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30925       (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30926     unsigned NumElts = VT.getVectorNumElements();
30927     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30928     unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30929     SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30930     SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30931     SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30932
30933     SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30934
30935     SDValue Ovf;
30936     if (IsSigned) {
30937       SDValue High, LowSign;
30938       if (OvfVT.getVectorElementType() == MVT::i1 &&
30939           (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30940         // Rather the truncating try to do the compare on vXi16 or vXi32.
30941         // Shift the high down filling with sign bits.
30942         High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
30943         // Fill all 16 bits with the sign bit from the low.
30944         LowSign =
30945             getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
30946         LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
30947                                              15, DAG);
30948         SetccVT = OvfVT;
30949         if (!Subtarget.hasBWI()) {
30950           // We can't do a vXi16 compare so sign extend to v16i32.
30951           High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30952           LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30953         }
30954       } else {
30955         // Otherwise do the compare at vXi8.
30956         High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30957         High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30958         LowSign =
30959             DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30960       }
30961
30962       Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30963     } else {
30964       SDValue High =
30965           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30966       if (OvfVT.getVectorElementType() == MVT::i1 &&
30967           (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30968         // Rather the truncating try to do the compare on vXi16 or vXi32.
30969         SetccVT = OvfVT;
30970         if (!Subtarget.hasBWI()) {
30971           // We can't do a vXi16 compare so sign extend to v16i32.
30972           High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30973         }
30974       } else {
30975         // Otherwise do the compare at vXi8.
30976         High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30977       }
30978
30979       Ovf =
30980           DAG.getSetCC(dl, SetccVT, High,
30981                        DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30982     }
30983
30984     Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30985
30986     return DAG.getMergeValues({Low, Ovf}, dl);
30987   }
30988
30989   SDValue Low;
30990   SDValue High =
30991       LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30992
30993   SDValue Ovf;
30994   if (IsSigned) {
30995     // SMULO overflows if the high bits don't match the sign of the low.
30996     SDValue LowSign =
30997         DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30998     Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30999   } else {
31000     // UMULO overflows if the high bits are non-zero.
31001     Ovf =
31002         DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
31003   }
31004
31005   Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
31006
31007   return DAG.getMergeValues({Low, Ovf}, dl);
31008 }
31009
31010 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
31011   assert(Subtarget.isTargetWin64() && "Unexpected target");
31012   EVT VT = Op.getValueType();
31013   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
31014          "Unexpected return type for lowering");
31015
31016   if (isa<ConstantSDNode>(Op->getOperand(1))) {
31017     SmallVector<SDValue> Result;
31018     if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
31019       return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
31020   }
31021
31022   RTLIB::Libcall LC;
31023   bool isSigned;
31024   switch (Op->getOpcode()) {
31025   default: llvm_unreachable("Unexpected request for libcall!");
31026   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
31027   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
31028   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
31029   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
31030   }
31031
31032   SDLoc dl(Op);
31033   SDValue InChain = DAG.getEntryNode();
31034
31035   TargetLowering::ArgListTy Args;
31036   TargetLowering::ArgListEntry Entry;
31037   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
31038     EVT ArgVT = Op->getOperand(i).getValueType();
31039     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
31040            "Unexpected argument type for lowering");
31041     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
31042     int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31043     MachinePointerInfo MPI =
31044         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31045     Entry.Node = StackPtr;
31046     InChain =
31047         DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
31048     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
31049     Entry.Ty = PointerType::get(ArgTy,0);
31050     Entry.IsSExt = false;
31051     Entry.IsZExt = false;
31052     Args.push_back(Entry);
31053   }
31054
31055   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
31056                                          getPointerTy(DAG.getDataLayout()));
31057
31058   TargetLowering::CallLoweringInfo CLI(DAG);
31059   CLI.setDebugLoc(dl)
31060       .setChain(InChain)
31061       .setLibCallee(
31062           getLibcallCallingConv(LC),
31063           static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
31064           std::move(Args))
31065       .setInRegister()
31066       .setSExtResult(isSigned)
31067       .setZExtResult(!isSigned);
31068
31069   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
31070   return DAG.getBitcast(VT, CallInfo.first);
31071 }
31072
31073 SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
31074                                                    SelectionDAG &DAG,
31075                                                    SDValue &Chain) const {
31076   assert(Subtarget.isTargetWin64() && "Unexpected target");
31077   EVT VT = Op.getValueType();
31078   bool IsStrict = Op->isStrictFPOpcode();
31079
31080   SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
31081   EVT ArgVT = Arg.getValueType();
31082
31083   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
31084          "Unexpected return type for lowering");
31085
31086   RTLIB::Libcall LC;
31087   if (Op->getOpcode() == ISD::FP_TO_SINT ||
31088       Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
31089     LC = RTLIB::getFPTOSINT(ArgVT, VT);
31090   else
31091     LC = RTLIB::getFPTOUINT(ArgVT, VT);
31092   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
31093
31094   SDLoc dl(Op);
31095   MakeLibCallOptions CallOptions;
31096   Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
31097
31098   SDValue Result;
31099   // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
31100   // expected VT (i128).
31101   std::tie(Result, Chain) =
31102       makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
31103   Result = DAG.getBitcast(VT, Result);
31104   return Result;
31105 }
31106
31107 SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
31108                                                    SelectionDAG &DAG) const {
31109   assert(Subtarget.isTargetWin64() && "Unexpected target");
31110   EVT VT = Op.getValueType();
31111   bool IsStrict = Op->isStrictFPOpcode();
31112
31113   SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
31114   EVT ArgVT = Arg.getValueType();
31115
31116   assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
31117          "Unexpected argument type for lowering");
31118
31119   RTLIB::Libcall LC;
31120   if (Op->getOpcode() == ISD::SINT_TO_FP ||
31121       Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
31122     LC = RTLIB::getSINTTOFP(ArgVT, VT);
31123   else
31124     LC = RTLIB::getUINTTOFP(ArgVT, VT);
31125   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
31126
31127   SDLoc dl(Op);
31128   MakeLibCallOptions CallOptions;
31129   SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
31130
31131   // Pass the i128 argument as an indirect argument on the stack.
31132   SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
31133   int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31134   MachinePointerInfo MPI =
31135       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31136   Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
31137
31138   SDValue Result;
31139   std::tie(Result, Chain) =
31140       makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
31141   return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
31142 }
31143
31144 // Return true if the required (according to Opcode) shift-imm form is natively
31145 // supported by the Subtarget
31146 static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
31147                                         unsigned Opcode) {
31148   if (!VT.isSimple())
31149     return false;
31150
31151   if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
31152     return false;
31153
31154   if (VT.getScalarSizeInBits() < 16)
31155     return false;
31156
31157   if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
31158       (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
31159     return true;
31160
31161   bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
31162                 (VT.is256BitVector() && Subtarget.hasInt256());
31163
31164   bool AShift = LShift && (Subtarget.hasAVX512() ||
31165                            (VT != MVT::v2i64 && VT != MVT::v4i64));
31166   return (Opcode == ISD::SRA) ? AShift : LShift;
31167 }
31168
31169 // The shift amount is a variable, but it is the same for all vector lanes.
31170 // These instructions are defined together with shift-immediate.
31171 static
31172 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
31173                                       unsigned Opcode) {
31174   return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
31175 }
31176
31177 // Return true if the required (according to Opcode) variable-shift form is
31178 // natively supported by the Subtarget
31179 static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
31180                                     unsigned Opcode) {
31181   if (!VT.isSimple())
31182     return false;
31183
31184   if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
31185     return false;
31186
31187   if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
31188     return false;
31189
31190   // vXi16 supported only on AVX-512, BWI
31191   if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
31192     return false;
31193
31194   if (Subtarget.hasAVX512() &&
31195       (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
31196     return true;
31197
31198   bool LShift = VT.is128BitVector() || VT.is256BitVector();
31199   bool AShift = LShift &&  VT != MVT::v2i64 && VT != MVT::v4i64;
31200   return (Opcode == ISD::SRA) ? AShift : LShift;
31201 }
31202
31203 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
31204                                            const X86Subtarget &Subtarget) {
31205   MVT VT = Op.getSimpleValueType();
31206   SDLoc dl(Op);
31207   SDValue R = Op.getOperand(0);
31208   SDValue Amt = Op.getOperand(1);
31209   unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
31210
31211   auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
31212     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
31213     MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
31214     SDValue Ex = DAG.getBitcast(ExVT, R);
31215
31216     // ashr(R, 63) === cmp_slt(R, 0)
31217     if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
31218       assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
31219              "Unsupported PCMPGT op");
31220       return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
31221     }
31222
31223     if (ShiftAmt >= 32) {
31224       // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
31225       SDValue Upper =
31226           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
31227       SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
31228                                                  ShiftAmt - 32, DAG);
31229       if (VT == MVT::v2i64)
31230         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
31231       if (VT == MVT::v4i64)
31232         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
31233                                   {9, 1, 11, 3, 13, 5, 15, 7});
31234     } else {
31235       // SRA upper i32, SRL whole i64 and select lower i32.
31236       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
31237                                                  ShiftAmt, DAG);
31238       SDValue Lower =
31239           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
31240       Lower = DAG.getBitcast(ExVT, Lower);
31241       if (VT == MVT::v2i64)
31242         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
31243       if (VT == MVT::v4i64)
31244         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
31245                                   {8, 1, 10, 3, 12, 5, 14, 7});
31246     }
31247     return DAG.getBitcast(VT, Ex);
31248   };
31249
31250   // Optimize shl/srl/sra with constant shift amount.
31251   APInt APIntShiftAmt;
31252   if (!X86::isConstantSplat(Amt, APIntShiftAmt))
31253     return SDValue();
31254
31255   // If the shift amount is out of range, return undef.
31256   if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
31257     return DAG.getUNDEF(VT);
31258
31259   uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
31260
31261   if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
31262     // Hardware support for vector shifts is sparse which makes us scalarize the
31263     // vector operations in many cases. Also, on sandybridge ADD is faster than
31264     // shl: (shl V, 1) -> (add (freeze V), (freeze V))
31265     if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
31266       // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
31267       // must be 0). (add undef, undef) however can be any value. To make this
31268       // safe, we must freeze R to ensure that register allocation uses the same
31269       // register for an undefined value. This ensures that the result will
31270       // still be even and preserves the original semantics.
31271       R = DAG.getFreeze(R);
31272       return DAG.getNode(ISD::ADD, dl, VT, R, R);
31273     }
31274
31275     return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
31276   }
31277
31278   // i64 SRA needs to be performed as partial shifts.
31279   if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
31280        (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
31281       Op.getOpcode() == ISD::SRA)
31282     return ArithmeticShiftRight64(ShiftAmt);
31283
31284   if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
31285       (Subtarget.hasBWI() && VT == MVT::v64i8)) {
31286     unsigned NumElts = VT.getVectorNumElements();
31287     MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31288
31289     // Simple i8 add case
31290     if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
31291       // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
31292       // must be 0). (add undef, undef) however can be any value. To make this
31293       // safe, we must freeze R to ensure that register allocation uses the same
31294       // register for an undefined value. This ensures that the result will
31295       // still be even and preserves the original semantics.
31296       R = DAG.getFreeze(R);
31297       return DAG.getNode(ISD::ADD, dl, VT, R, R);
31298     }
31299
31300     // ashr(R, 7)  === cmp_slt(R, 0)
31301     if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
31302       SDValue Zeros = DAG.getConstant(0, dl, VT);
31303       if (VT.is512BitVector()) {
31304         assert(VT == MVT::v64i8 && "Unexpected element type!");
31305         SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
31306         return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
31307       }
31308       return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
31309     }
31310
31311     // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
31312     if (VT == MVT::v16i8 && Subtarget.hasXOP())
31313       return SDValue();
31314
31315     if (Op.getOpcode() == ISD::SHL) {
31316       // Make a large shift.
31317       SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
31318                                                ShiftAmt, DAG);
31319       SHL = DAG.getBitcast(VT, SHL);
31320       // Zero out the rightmost bits.
31321       APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
31322       return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
31323     }
31324     if (Op.getOpcode() == ISD::SRL) {
31325       // Make a large shift.
31326       SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
31327                                                ShiftAmt, DAG);
31328       SRL = DAG.getBitcast(VT, SRL);
31329       // Zero out the leftmost bits.
31330       APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
31331       return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
31332     }
31333     if (Op.getOpcode() == ISD::SRA) {
31334       // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
31335       SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
31336
31337       SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
31338       Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
31339       Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
31340       return Res;
31341     }
31342     llvm_unreachable("Unknown shift opcode.");
31343   }
31344
31345   return SDValue();
31346 }
31347
31348 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
31349                                           const X86Subtarget &Subtarget) {
31350   MVT VT = Op.getSimpleValueType();
31351   SDLoc dl(Op);
31352   SDValue R = Op.getOperand(0);
31353   SDValue Amt = Op.getOperand(1);
31354   unsigned Opcode = Op.getOpcode();
31355   unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
31356
31357   int BaseShAmtIdx = -1;
31358   if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
31359     if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
31360       return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
31361                                  Subtarget, DAG);
31362
31363     // vXi8 shifts - shift as v8i16 + mask result.
31364     if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
31365          (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
31366          VT == MVT::v64i8) &&
31367         !Subtarget.hasXOP()) {
31368       unsigned NumElts = VT.getVectorNumElements();
31369       MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31370       if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
31371         unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
31372         unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
31373
31374         // Create the mask using vXi16 shifts. For shift-rights we need to move
31375         // the upper byte down before splatting the vXi8 mask.
31376         SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
31377         BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
31378                                       BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
31379         if (Opcode != ISD::SHL)
31380           BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
31381                                                8, DAG);
31382         BitMask = DAG.getBitcast(VT, BitMask);
31383         BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
31384                                        SmallVector<int, 64>(NumElts, 0));
31385
31386         SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
31387                                           DAG.getBitcast(ExtVT, R), BaseShAmt,
31388                                           BaseShAmtIdx, Subtarget, DAG);
31389         Res = DAG.getBitcast(VT, Res);
31390         Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
31391
31392         if (Opcode == ISD::SRA) {
31393           // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
31394           // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
31395           SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
31396           SignMask =
31397               getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
31398                                   BaseShAmtIdx, Subtarget, DAG);
31399           SignMask = DAG.getBitcast(VT, SignMask);
31400           Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
31401           Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
31402         }
31403         return Res;
31404       }
31405     }
31406   }
31407
31408   return SDValue();
31409 }
31410
31411 // Convert a shift/rotate left amount to a multiplication scale factor.
31412 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
31413                                        const X86Subtarget &Subtarget,
31414                                        SelectionDAG &DAG) {
31415   MVT VT = Amt.getSimpleValueType();
31416   if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
31417         (Subtarget.hasInt256() && VT == MVT::v16i16) ||
31418         (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
31419         (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
31420         (Subtarget.hasInt256() && VT == MVT::v32i8) ||
31421         (Subtarget.hasBWI() && VT == MVT::v64i8)))
31422     return SDValue();
31423
31424   MVT SVT = VT.getVectorElementType();
31425   unsigned SVTBits = SVT.getSizeInBits();
31426   unsigned NumElems = VT.getVectorNumElements();
31427
31428   APInt UndefElts;
31429   SmallVector<APInt> EltBits;
31430   if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
31431     APInt One(SVTBits, 1);
31432     SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
31433     for (unsigned I = 0; I != NumElems; ++I) {
31434       if (UndefElts[I] || EltBits[I].uge(SVTBits))
31435         continue;
31436       uint64_t ShAmt = EltBits[I].getZExtValue();
31437       Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
31438     }
31439     return DAG.getBuildVector(VT, dl, Elts);
31440   }
31441
31442   // If the target doesn't support variable shifts, use either FP conversion
31443   // or integer multiplication to avoid shifting each element individually.
31444   if (VT == MVT::v4i32) {
31445     Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
31446     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
31447                       DAG.getConstant(0x3f800000U, dl, VT));
31448     Amt = DAG.getBitcast(MVT::v4f32, Amt);
31449     return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
31450   }
31451
31452   // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
31453   if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
31454     SDValue Z = DAG.getConstant(0, dl, VT);
31455     SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
31456     SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
31457     Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
31458     Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
31459     if (Subtarget.hasSSE41())
31460       return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31461     return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
31462   }
31463
31464   return SDValue();
31465 }
31466
31467 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
31468                           SelectionDAG &DAG) {
31469   MVT VT = Op.getSimpleValueType();
31470   SDLoc dl(Op);
31471   SDValue R = Op.getOperand(0);
31472   SDValue Amt = Op.getOperand(1);
31473   unsigned EltSizeInBits = VT.getScalarSizeInBits();
31474   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31475
31476   unsigned Opc = Op.getOpcode();
31477   unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
31478   unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
31479
31480   assert(VT.isVector() && "Custom lowering only for vector shifts!");
31481   assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
31482
31483   if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
31484     return V;
31485
31486   if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
31487     return V;
31488
31489   if (supportedVectorVarShift(VT, Subtarget, Opc))
31490     return Op;
31491
31492   // i64 vector arithmetic shift can be emulated with the transform:
31493   // M = lshr(SIGN_MASK, Amt)
31494   // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
31495   if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
31496        (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
31497       Opc == ISD::SRA) {
31498     SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
31499     SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
31500     R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
31501     R = DAG.getNode(ISD::XOR, dl, VT, R, M);
31502     R = DAG.getNode(ISD::SUB, dl, VT, R, M);
31503     return R;
31504   }
31505
31506   // XOP has 128-bit variable logical/arithmetic shifts.
31507   // +ve/-ve Amt = shift left/right.
31508   if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
31509                              VT == MVT::v8i16 || VT == MVT::v16i8)) {
31510     if (Opc == ISD::SRL || Opc == ISD::SRA) {
31511       SDValue Zero = DAG.getConstant(0, dl, VT);
31512       Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
31513     }
31514     if (Opc == ISD::SHL || Opc == ISD::SRL)
31515       return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
31516     if (Opc == ISD::SRA)
31517       return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
31518   }
31519
31520   // 2i64 vector logical shifts can efficiently avoid scalarization - do the
31521   // shifts per-lane and then shuffle the partial results back together.
31522   if (VT == MVT::v2i64 && Opc != ISD::SRA) {
31523     // Splat the shift amounts so the scalar shifts above will catch it.
31524     SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
31525     SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
31526     SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
31527     SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
31528     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
31529   }
31530
31531   // If possible, lower this shift as a sequence of two shifts by
31532   // constant plus a BLENDing shuffle instead of scalarizing it.
31533   // Example:
31534   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
31535   //
31536   // Could be rewritten as:
31537   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
31538   //
31539   // The advantage is that the two shifts from the example would be
31540   // lowered as X86ISD::VSRLI nodes in parallel before blending.
31541   if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
31542                       (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31543     SDValue Amt1, Amt2;
31544     unsigned NumElts = VT.getVectorNumElements();
31545     SmallVector<int, 8> ShuffleMask;
31546     for (unsigned i = 0; i != NumElts; ++i) {
31547       SDValue A = Amt->getOperand(i);
31548       if (A.isUndef()) {
31549         ShuffleMask.push_back(SM_SentinelUndef);
31550         continue;
31551       }
31552       if (!Amt1 || Amt1 == A) {
31553         ShuffleMask.push_back(i);
31554         Amt1 = A;
31555         continue;
31556       }
31557       if (!Amt2 || Amt2 == A) {
31558         ShuffleMask.push_back(i + NumElts);
31559         Amt2 = A;
31560         continue;
31561       }
31562       break;
31563     }
31564
31565     // Only perform this blend if we can perform it without loading a mask.
31566     if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
31567         (VT != MVT::v16i16 ||
31568          is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
31569         (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
31570          canWidenShuffleElements(ShuffleMask))) {
31571       auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
31572       auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
31573       if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
31574           Cst2->getAPIntValue().ult(EltSizeInBits)) {
31575         SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31576                                                     Cst1->getZExtValue(), DAG);
31577         SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31578                                                     Cst2->getZExtValue(), DAG);
31579         return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
31580       }
31581     }
31582   }
31583
31584   // If possible, lower this packed shift into a vector multiply instead of
31585   // expanding it into a sequence of scalar shifts.
31586   // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
31587   if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
31588                                                 Subtarget.canExtendTo512BW())))
31589     if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
31590       return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
31591
31592   // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
31593   // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
31594   if (Opc == ISD::SRL && ConstantAmt &&
31595       (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31596     SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31597     SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31598     if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31599       SDValue Zero = DAG.getConstant(0, dl, VT);
31600       SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
31601       SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
31602       return DAG.getSelect(dl, VT, ZAmt, R, Res);
31603     }
31604   }
31605
31606   // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
31607   // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
31608   // TODO: Special case handling for shift by 0/1, really we can afford either
31609   // of these cases in pre-SSE41/XOP/AVX512 but not both.
31610   if (Opc == ISD::SRA && ConstantAmt &&
31611       (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
31612       ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
31613         !Subtarget.hasAVX512()) ||
31614        DAG.isKnownNeverZero(Amt))) {
31615     SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31616     SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31617     if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31618       SDValue Amt0 =
31619           DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
31620       SDValue Amt1 =
31621           DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
31622       SDValue Sra1 =
31623           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
31624       SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
31625       Res = DAG.getSelect(dl, VT, Amt0, R, Res);
31626       return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
31627     }
31628   }
31629
31630   // v4i32 Non Uniform Shifts.
31631   // If the shift amount is constant we can shift each lane using the SSE2
31632   // immediate shifts, else we need to zero-extend each lane to the lower i64
31633   // and shift using the SSE2 variable shifts.
31634   // The separate results can then be blended together.
31635   if (VT == MVT::v4i32) {
31636     SDValue Amt0, Amt1, Amt2, Amt3;
31637     if (ConstantAmt) {
31638       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
31639       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
31640       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
31641       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
31642     } else {
31643       // The SSE2 shifts use the lower i64 as the same shift amount for
31644       // all lanes and the upper i64 is ignored. On AVX we're better off
31645       // just zero-extending, but for SSE just duplicating the top 16-bits is
31646       // cheaper and has the same effect for out of range values.
31647       if (Subtarget.hasAVX()) {
31648         SDValue Z = DAG.getConstant(0, dl, VT);
31649         Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
31650         Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
31651         Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
31652         Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
31653       } else {
31654         SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
31655         SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
31656                                              {4, 5, 6, 7, -1, -1, -1, -1});
31657         SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
31658         SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
31659         Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
31660         Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
31661         Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
31662         Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
31663       }
31664     }
31665
31666     unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
31667     SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
31668     SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
31669     SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
31670     SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
31671
31672     // Merge the shifted lane results optimally with/without PBLENDW.
31673     // TODO - ideally shuffle combining would handle this.
31674     if (Subtarget.hasSSE41()) {
31675       SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
31676       SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
31677       return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
31678     }
31679     SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
31680     SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
31681     return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
31682   }
31683
31684   // It's worth extending once and using the vXi16/vXi32 shifts for smaller
31685   // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
31686   // make the existing SSE solution better.
31687   // NOTE: We honor prefered vector width before promoting to 512-bits.
31688   if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
31689       (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
31690       (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
31691       (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
31692       (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
31693     assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
31694            "Unexpected vector type");
31695     MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
31696     MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
31697     unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
31698     R = DAG.getNode(ExtOpc, dl, ExtVT, R);
31699     Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
31700     return DAG.getNode(ISD::TRUNCATE, dl, VT,
31701                        DAG.getNode(Opc, dl, ExtVT, R, Amt));
31702   }
31703
31704   // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
31705   // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
31706   if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
31707       (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
31708        (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
31709       !Subtarget.hasXOP()) {
31710     int NumElts = VT.getVectorNumElements();
31711     SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
31712
31713     // Extend constant shift amount to vXi16 (it doesn't matter if the type
31714     // isn't legal).
31715     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
31716     Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
31717     Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
31718     Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
31719     assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
31720            "Constant build vector expected");
31721
31722     if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
31723       bool IsSigned = Opc == ISD::SRA;
31724       R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
31725       R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
31726       R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
31727       return DAG.getZExtOrTrunc(R, dl, VT);
31728     }
31729
31730     SmallVector<SDValue, 16> LoAmt, HiAmt;
31731     for (int i = 0; i != NumElts; i += 16) {
31732       for (int j = 0; j != 8; ++j) {
31733         LoAmt.push_back(Amt.getOperand(i + j));
31734         HiAmt.push_back(Amt.getOperand(i + j + 8));
31735       }
31736     }
31737
31738     MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
31739     SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
31740     SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
31741
31742     SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
31743     SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
31744     LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
31745     HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
31746     LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
31747     HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
31748     LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
31749     HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
31750     return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
31751   }
31752
31753   if (VT == MVT::v16i8 ||
31754       (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
31755       (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31756     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
31757
31758     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31759       if (VT.is512BitVector()) {
31760         // On AVX512BW targets we make use of the fact that VSELECT lowers
31761         // to a masked blend which selects bytes based just on the sign bit
31762         // extracted to a mask.
31763         MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
31764         V0 = DAG.getBitcast(VT, V0);
31765         V1 = DAG.getBitcast(VT, V1);
31766         Sel = DAG.getBitcast(VT, Sel);
31767         Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31768                            ISD::SETGT);
31769         return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31770       } else if (Subtarget.hasSSE41()) {
31771         // On SSE41 targets we can use PBLENDVB which selects bytes based just
31772         // on the sign bit.
31773         V0 = DAG.getBitcast(VT, V0);
31774         V1 = DAG.getBitcast(VT, V1);
31775         Sel = DAG.getBitcast(VT, Sel);
31776         return DAG.getBitcast(SelVT,
31777                               DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31778       }
31779       // On pre-SSE41 targets we test for the sign bit by comparing to
31780       // zero - a negative value will set all bits of the lanes to true
31781       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31782       SDValue Z = DAG.getConstant(0, dl, SelVT);
31783       SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31784       return DAG.getSelect(dl, SelVT, C, V0, V1);
31785     };
31786
31787     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31788     // We can safely do this using i16 shifts as we're only interested in
31789     // the 3 lower bits of each byte.
31790     Amt = DAG.getBitcast(ExtVT, Amt);
31791     Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31792     Amt = DAG.getBitcast(VT, Amt);
31793
31794     if (Opc == ISD::SHL || Opc == ISD::SRL) {
31795       // r = VSELECT(r, shift(r, 4), a);
31796       SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31797       R = SignBitSelect(VT, Amt, M, R);
31798
31799       // a += a
31800       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31801
31802       // r = VSELECT(r, shift(r, 2), a);
31803       M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31804       R = SignBitSelect(VT, Amt, M, R);
31805
31806       // a += a
31807       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31808
31809       // return VSELECT(r, shift(r, 1), a);
31810       M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31811       R = SignBitSelect(VT, Amt, M, R);
31812       return R;
31813     }
31814
31815     if (Opc == ISD::SRA) {
31816       // For SRA we need to unpack each byte to the higher byte of a i16 vector
31817       // so we can correctly sign extend. We don't care what happens to the
31818       // lower byte.
31819       SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31820       SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31821       SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31822       SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31823       ALo = DAG.getBitcast(ExtVT, ALo);
31824       AHi = DAG.getBitcast(ExtVT, AHi);
31825       RLo = DAG.getBitcast(ExtVT, RLo);
31826       RHi = DAG.getBitcast(ExtVT, RHi);
31827
31828       // r = VSELECT(r, shift(r, 4), a);
31829       SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31830       SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31831       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31832       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31833
31834       // a += a
31835       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31836       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31837
31838       // r = VSELECT(r, shift(r, 2), a);
31839       MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31840       MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31841       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31842       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31843
31844       // a += a
31845       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31846       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31847
31848       // r = VSELECT(r, shift(r, 1), a);
31849       MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31850       MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31851       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31852       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31853
31854       // Logical shift the result back to the lower byte, leaving a zero upper
31855       // byte meaning that we can safely pack with PACKUSWB.
31856       RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31857       RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31858       return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31859     }
31860   }
31861
31862   if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31863     MVT ExtVT = MVT::v8i32;
31864     SDValue Z = DAG.getConstant(0, dl, VT);
31865     SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31866     SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31867     SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31868     SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31869     ALo = DAG.getBitcast(ExtVT, ALo);
31870     AHi = DAG.getBitcast(ExtVT, AHi);
31871     RLo = DAG.getBitcast(ExtVT, RLo);
31872     RHi = DAG.getBitcast(ExtVT, RHi);
31873     SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31874     SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31875     Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31876     Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31877     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31878   }
31879
31880   if (VT == MVT::v8i16) {
31881     // If we have a constant shift amount, the non-SSE41 path is best as
31882     // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
31883     bool UseSSE41 = Subtarget.hasSSE41() &&
31884                     !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31885
31886     auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31887       // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31888       // the sign bit.
31889       if (UseSSE41) {
31890         MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
31891         V0 = DAG.getBitcast(ExtVT, V0);
31892         V1 = DAG.getBitcast(ExtVT, V1);
31893         Sel = DAG.getBitcast(ExtVT, Sel);
31894         return DAG.getBitcast(
31895             VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31896       }
31897       // On pre-SSE41 targets we splat the sign bit - a negative value will
31898       // set all bits of the lanes to true and VSELECT uses that in
31899       // its OR(AND(V0,C),AND(V1,~C)) lowering.
31900       SDValue C =
31901           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31902       return DAG.getSelect(dl, VT, C, V0, V1);
31903     };
31904
31905     // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31906     if (UseSSE41) {
31907       // On SSE41 targets we need to replicate the shift mask in both
31908       // bytes for PBLENDVB.
31909       Amt = DAG.getNode(
31910           ISD::OR, dl, VT,
31911           getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31912           getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31913     } else {
31914       Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31915     }
31916
31917     // r = VSELECT(r, shift(r, 8), a);
31918     SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31919     R = SignBitSelect(Amt, M, R);
31920
31921     // a += a
31922     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31923
31924     // r = VSELECT(r, shift(r, 4), a);
31925     M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31926     R = SignBitSelect(Amt, M, R);
31927
31928     // a += a
31929     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31930
31931     // r = VSELECT(r, shift(r, 2), a);
31932     M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31933     R = SignBitSelect(Amt, M, R);
31934
31935     // a += a
31936     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31937
31938     // return VSELECT(r, shift(r, 1), a);
31939     M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31940     R = SignBitSelect(Amt, M, R);
31941     return R;
31942   }
31943
31944   // Decompose 256-bit shifts into 128-bit shifts.
31945   if (VT.is256BitVector())
31946     return splitVectorIntBinary(Op, DAG);
31947
31948   if (VT == MVT::v32i16 || VT == MVT::v64i8)
31949     return splitVectorIntBinary(Op, DAG);
31950
31951   return SDValue();
31952 }
31953
/// Custom lowering for ISD::FSHL / ISD::FSHR funnel shift nodes on X86.
///
/// Operands 0 and 1 of \p Op are the two values being funnelled together and
/// operand 2 is the shift amount. Vector types are handled first (VBMI2
/// double-shift nodes, splitting of wide vectors, unpack/shift/pack and
/// extend/shift/truncate sequences); scalar i8/i16/i32/i64 are handled last.
///
/// \returns a replacement value, \p Op itself when the node is kept as-is
/// (scalar i32/i64), or an empty SDValue() on the paths where this function
/// builds no replacement (commented below as falling back to the
/// default/generic expansion).
31954 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31955                                 SelectionDAG &DAG) {
31956   MVT VT = Op.getSimpleValueType();
31957   assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31958          "Unexpected funnel shift opcode!");
31959
31960   SDLoc DL(Op);
31961   SDValue Op0 = Op.getOperand(0);
31962   SDValue Op1 = Op.getOperand(1);
31963   SDValue Amt = Op.getOperand(2);
31964   unsigned EltSizeInBits = VT.getScalarSizeInBits();
31965   bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31966
31967   if (VT.isVector()) {
          // Detect a shift amount that is the same constant in every element.
31968     APInt APIntShiftAmt;
31969     bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31970
          // VBMI2 path (elements wider than 8 bits): emit VSHLD/VSHRD for a
          // constant splat amount, otherwise the variable VSHLDV/VSHRDV forms.
31971     if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
            // For FSHR the two data operands are exchanged before building the
            // VSHRD/VSHRDV node.
31972       if (IsFSHR)
31973         std::swap(Op0, Op1);
31974
31975       if (IsCstSplat) {
              // Reduce the splat amount modulo the element width before
              // encoding it as an i8 immediate.
31976         uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31977         SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31978         return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31979                              {Op0, Op1, Imm}, DAG, Subtarget);
31980       }
31981       return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31982                            {Op0, Op1, Amt}, DAG, Subtarget);
31983     }
31984     assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31985             VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31986             VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31987            "Unexpected funnel shift type!");
31988
31989     // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31990     // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
          // Constant splat amounts are not lowered here; no replacement is
          // built for them (presumably left to the default expansion).
31991     if (IsCstSplat)
31992       return SDValue();
31993
          // AmtMod = Amt & (bw-1), i.e. the amount masked to the element width.
31994     SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31995     SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31996     bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31997
31998     // Constant vXi16 funnel shifts can be efficiently handled by default.
31999     if (IsCst && EltSizeInBits == 16)
32000       return SDValue();
32001
          // ExtVT has elements twice as wide as VT's and half as many of them,
          // so it can hold one interleaved (Op1, Op0) pair per element.
32002     unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
32003     unsigned NumElts = VT.getVectorNumElements();
32004     MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
32005     MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
32006
32007     // Split 256-bit integers on XOP/pre-AVX2 targets.
32008     // Split 512-bit integers on non 512-bit BWI targets.
32009     if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
32010                                  !Subtarget.hasAVX2())) ||
32011         (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
32012          EltSizeInBits < 32)) {
32013       // Pre-mask the amount modulo using the wider vector.
32014       Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
32015       return splitVectorOp(Op, DAG);
32016     }
32017
32018     // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
32019     if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
32020       int ScalarAmtIdx = -1;
32021       if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
32022         // Uniform vXi16 funnel shifts can be efficiently handled by default.
32023         if (EltSizeInBits == 16)
32024           return SDValue();
32025
              // Interleave Op1/Op0 into double-width elements, shift both
              // halves by the single splatted amount, then pack back to VT.
              // NOTE(review): the final getPack argument (!IsFSHR) presumably
              // selects the upper half of each wide element for FSHL (the
              // ">> bw" in the formula above) - confirm against getPack.
32026         SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
32027         SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
32028         Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
32029                                  ScalarAmtIdx, Subtarget, DAG);
32030         Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
32031                                  ScalarAmtIdx, Subtarget, DAG);
32032         return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
32033       }
32034     }
32035
          // WideVT keeps the element count of VT but uses elements of twice the
          // width, capped at 16 bits with BWI and at 32 bits otherwise.
32036     MVT WideSVT = MVT::getIntegerVT(
32037         std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
32038     MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
32039
32040     // If per-element shifts are legal, fallback to generic expansion.
32041     if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
32042       return SDValue();
32043
32044     // Attempt to fold as:
32045     // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
32046     // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
32047     if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
32048         supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
32049       Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
32050       Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
32051       AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
32052       Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
32053                                        EltSizeInBits, DAG);
32054       SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
32055       Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
            // FSHL leaves the result in the upper bw bits of each wide element,
            // so shift it back down before truncating.
32056       if (!IsFSHR)
32057         Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
32058                                          EltSizeInBits, DAG);
32059       return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
32060     }
32061
32062     // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
32063     if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
32064         supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
            // Interleaving the masked amount with zero zero-extends each
            // amount element to the ExtVT element width.
32065       SDValue Z = DAG.getConstant(0, DL, VT);
32066       SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
32067       SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
32068       SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
32069       SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
32070       SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
32071       SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
32072       return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
32073     }
32074
32075     // Fallback to generic expansion.
32076     return SDValue();
32077   }
        // Scalar funnel shifts from here on.
32078   assert(
32079       (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
32080       "Unexpected funnel shift type!");
32081
32082   // Expand slow SHLD/SHRD cases if we are not optimizing for size.
32083   bool OptForSize = DAG.shouldOptForSize();
32084   bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
32085
        // i8 (always) and i16 (when expanding slow SHLD/SHRD) with a
        // non-constant amount are widened to i32 and done with plain shifts:
32086   // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
32087   // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
32088   if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
32089       !isa<ConstantSDNode>(Amt)) {
          // Mask and HiShift are created in the shift amount's own type.
32090     SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
32091     SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
32092     Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
32093     Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
32094     Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
32095     SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
32096     Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
32097     if (IsFSHR) {
32098       Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
32099     } else {
32100       Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
32101       Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
32102     }
32103     return DAG.getZExtOrTrunc(Res, DL, VT);
32104   }
32105
        // Remaining i8 cases (constant amount) and every remaining case when
        // expanding slow SHLD/SHRD get no custom lowering here.
32106   if (VT == MVT::i8 || ExpandFunnel)
32107     return SDValue();
32108
32109   // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
32110   if (VT == MVT::i16) {
32111     Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
32112                       DAG.getConstant(15, DL, Amt.getValueType()));
32113     unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
32114     return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
32115   }
32116
        // i32/i64: return the node unchanged.
32117   return Op;
32118 }
32119
32120 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
32121                            SelectionDAG &DAG) {
32122   MVT VT = Op.getSimpleValueType();
32123   assert(VT.isVector() && "Custom lowering only for vector rotates!");
32124
32125   SDLoc DL(Op);
32126   SDValue R = Op.getOperand(0);
32127   SDValue Amt = Op.getOperand(1);
32128   unsigned Opcode = Op.getOpcode();
32129   unsigned EltSizeInBits = VT.getScalarSizeInBits();
32130   int NumElts = VT.getVectorNumElements();
32131   bool IsROTL = Opcode == ISD::ROTL;
32132
32133   // Check for constant splat rotation amount.
32134   APInt CstSplatValue;
32135   bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
32136
32137   // Check for splat rotate by zero.
32138   if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
32139     return R;
32140
32141   // AVX512 implicitly uses modulo rotation amounts.
32142   if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
32143     // Attempt to rotate by immediate.
32144     if (IsCstSplat) {
32145       unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
32146       uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
32147       return DAG.getNode(RotOpc, DL, VT, R,
32148                          DAG.getTargetConstant(RotAmt, DL, MVT::i8));
32149     }
32150
32151     // Else, fall-back on VPROLV/VPRORV.
32152     return Op;
32153   }
32154
32155   // AVX512 VBMI2 vXi16 - lower to funnel shifts.
32156   if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
32157     unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
32158     return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
32159   }
32160
32161   SDValue Z = DAG.getConstant(0, DL, VT);
32162
32163   if (!IsROTL) {
32164     // If the ISD::ROTR amount is constant, we're always better converting to
32165     // ISD::ROTL.
32166     if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
32167       return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
32168
32169     // XOP targets always prefers ISD::ROTL.
32170     if (Subtarget.hasXOP())
32171       return DAG.getNode(ISD::ROTL, DL, VT, R,
32172                          DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
32173   }
32174
32175   // Split 256-bit integers on XOP/pre-AVX2 targets.
32176   if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
32177     return splitVectorIntBinary(Op, DAG);
32178
32179   // XOP has 128-bit vector variable + immediate rotates.
32180   // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
32181   // XOP implicitly uses modulo rotation amounts.
32182   if (Subtarget.hasXOP()) {
32183     assert(IsROTL && "Only ROTL expected");
32184     assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
32185
32186     // Attempt to rotate by immediate.
32187     if (IsCstSplat) {
32188       uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
32189       return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
32190                          DAG.getTargetConstant(RotAmt, DL, MVT::i8));
32191     }
32192
32193     // Use general rotate by variable (per-element).
32194     return Op;
32195   }
32196
32197   // Rotate by an uniform constant - expand back to shifts.
32198   if (IsCstSplat)
32199     return SDValue();
32200
32201   // Split 512-bit integers on non 512-bit BWI targets.
32202   if (VT.is512BitVector() && !Subtarget.useBWIRegs())
32203     return splitVectorIntBinary(Op, DAG);
32204
32205   assert(
32206       (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
32207        ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
32208         Subtarget.hasAVX2()) ||
32209        ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
32210       "Only vXi32/vXi16/vXi8 vector rotates supported");
32211
32212   MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
32213   MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
32214
32215   SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
32216   SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32217
32218   // Attempt to fold as unpack(x,x) << zext(splat(y)):
32219   // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
32220   // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
32221   if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
32222     int BaseRotAmtIdx = -1;
32223     if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
32224       if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
32225         unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
32226         return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
32227       }
32228       unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
32229       SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
32230       SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
32231       Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
32232                                BaseRotAmtIdx, Subtarget, DAG);
32233       Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
32234                                BaseRotAmtIdx, Subtarget, DAG);
32235       return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
32236     }
32237   }
32238
32239   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
32240   unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
32241
32242   // Attempt to fold as unpack(x,x) << zext(y):
32243   // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
32244   // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
32245   // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
32246   if (!(ConstantAmt && EltSizeInBits != 8) &&
32247       !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
32248       (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
32249     SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
32250     SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
32251     SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
32252     SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
32253     SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
32254     SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
32255     return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
32256   }
32257
32258   // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
32259   // the amount bit.
32260   // TODO: We're doing nothing here that we couldn't do for funnel shifts.
32261   if (EltSizeInBits == 8) {
32262     MVT WideVT =
32263         MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
32264
32265     // Attempt to fold as:
32266     // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
32267     // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
32268     if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
32269         supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
32270       // If we're rotating by constant, just use default promotion.
32271       if (ConstantAmt)
32272         return SDValue();
32273       // See if we can perform this by widening to vXi16 or vXi32.
32274       R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
32275       R = DAG.getNode(
32276           ISD::OR, DL, WideVT, R,
32277           getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
32278       Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
32279       R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
32280       if (IsROTL)
32281         R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
32282       return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
32283     }
32284
32285     // We don't need ModuloAmt here as we just peek at individual bits.
32286     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
32287       if (Subtarget.hasSSE41()) {
32288         // On SSE41 targets we can use PBLENDVB which selects bytes based just
32289         // on the sign bit.
32290         V0 = DAG.getBitcast(VT, V0);
32291         V1 = DAG.getBitcast(VT, V1);
32292         Sel = DAG.getBitcast(VT, Sel);
32293         return DAG.getBitcast(SelVT,
32294                               DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
32295       }
32296       // On pre-SSE41 targets we test for the sign bit by comparing to
32297       // zero - a negative value will set all bits of the lanes to true
32298       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
32299       SDValue Z = DAG.getConstant(0, DL, SelVT);
32300       SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
32301       return DAG.getSelect(DL, SelVT, C, V0, V1);
32302     };
32303
32304     // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
32305     if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
32306       Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
32307       IsROTL = true;
32308     }
32309
32310     unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
32311     unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
32312
32313     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
32314     // We can safely do this using i16 shifts as we're only interested in
32315     // the 3 lower bits of each byte.
32316     Amt = DAG.getBitcast(ExtVT, Amt);
32317     Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
32318     Amt = DAG.getBitcast(VT, Amt);
32319
32320     // r = VSELECT(r, rot(r, 4), a);
32321     SDValue M;
32322     M = DAG.getNode(
32323         ISD::OR, DL, VT,
32324         DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
32325         DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
32326     R = SignBitSelect(VT, Amt, M, R);
32327
32328     // a += a
32329     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
32330
32331     // r = VSELECT(r, rot(r, 2), a);
32332     M = DAG.getNode(
32333         ISD::OR, DL, VT,
32334         DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
32335         DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
32336     R = SignBitSelect(VT, Amt, M, R);
32337
32338     // a += a
32339     Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
32340
32341     // return VSELECT(r, rot(r, 1), a);
32342     M = DAG.getNode(
32343         ISD::OR, DL, VT,
32344         DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
32345         DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
32346     return SignBitSelect(VT, Amt, M, R);
32347   }
32348
32349   bool IsSplatAmt = DAG.isSplatValue(Amt);
32350   bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
32351                         supportedVectorVarShift(VT, Subtarget, ISD::SRL);
32352
32353   // Fallback for splats + all supported variable shifts.
32354   // Fallback for non-constants AVX2 vXi16 as well.
32355   if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
32356     Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32357     SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
32358     AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
32359     SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
32360     SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
32361     return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
32362   }
32363
32364   // Everything below assumes ISD::ROTL.
32365   if (!IsROTL) {
32366     Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
32367     IsROTL = true;
32368   }
32369
32370   // ISD::ROT* uses modulo rotate amounts.
32371   Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32372
32373   assert(IsROTL && "Only ROTL supported");
32374
32375   // As with shifts, attempt to convert the rotation amount to a multiplication
32376   // factor, fallback to general expansion.
32377   SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
32378   if (!Scale)
32379     return SDValue();
32380
32381   // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
32382   if (EltSizeInBits == 16) {
32383     SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
32384     SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
32385     return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32386   }
32387
32388   // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
32389   // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
32390   // that can then be OR'd with the lower 32-bits.
32391   assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
32392   static const int OddMask[] = {1, -1, 3, -1};
32393   SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
32394   SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
32395
32396   SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32397                               DAG.getBitcast(MVT::v2i64, R),
32398                               DAG.getBitcast(MVT::v2i64, Scale));
32399   SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32400                               DAG.getBitcast(MVT::v2i64, R13),
32401                               DAG.getBitcast(MVT::v2i64, Scale13));
32402   Res02 = DAG.getBitcast(VT, Res02);
32403   Res13 = DAG.getBitcast(VT, Res13);
32404
32405   return DAG.getNode(ISD::OR, DL, VT,
32406                      DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
32407                      DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
32408 }
32409
32410 /// Returns true if the operand type is exactly twice the native width, and
32411 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
32412 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
32413 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
32414 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
32415   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
32416
32417   if (OpWidth == 64)
32418     return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
32419   if (OpWidth == 128)
32420     return Subtarget.canUseCMPXCHG16B();
32421
32422   return false;
32423 }
32424
32425 TargetLoweringBase::AtomicExpansionKind
32426 X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
32427   Type *MemType = SI->getValueOperand()->getType();
32428
32429   bool NoImplicitFloatOps =
32430       SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32431   if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32432       !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32433       (Subtarget.hasSSE1() || Subtarget.hasX87()))
32434     return AtomicExpansionKind::None;
32435
32436   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
32437                                  : AtomicExpansionKind::None;
32438 }
32439
32440 // Note: this turns large loads into lock cmpxchg8b/16b.
32441 // TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
32442 TargetLowering::AtomicExpansionKind
32443 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
32444   Type *MemType = LI->getType();
32445
32446   // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
32447   // can use movq to do the load. If we have X87 we can load into an 80-bit
32448   // X87 register and store it to a stack temporary.
32449   bool NoImplicitFloatOps =
32450       LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32451   if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32452       !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32453       (Subtarget.hasSSE1() || Subtarget.hasX87()))
32454     return AtomicExpansionKind::None;
32455
32456   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32457                                  : AtomicExpansionKind::None;
32458 }
32459
/// Classification produced by FindSingleBitChange for a candidate bit mask.
enum BitTestKind : unsigned {
  /// No recognised single-bit pattern.
  UndefBit = 0,
  /// A constant integer that is a power of 2.
  ConstantBit = 1,
  /// A constant integer whose bitwise complement is a power of 2.
  NotConstantBit = 2,
  /// A left shift of the constant 1 (1 << X).
  ShiftBit = 3,
  /// The bitwise NOT of a left shift of the constant 1 (~(1 << X)).
  NotShiftBit = 4
};
32467
32468 static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
32469   using namespace llvm::PatternMatch;
32470   BitTestKind BTK = UndefBit;
32471   auto *C = dyn_cast<ConstantInt>(V);
32472   if (C) {
32473     // Check if V is a power of 2 or NOT power of 2.
32474     if (isPowerOf2_64(C->getZExtValue()))
32475       BTK = ConstantBit;
32476     else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
32477       BTK = NotConstantBit;
32478     return {V, BTK};
32479   }
32480
32481   // Check if V is some power of 2 pattern known to be non-zero
32482   auto *I = dyn_cast<Instruction>(V);
32483   if (I) {
32484     bool Not = false;
32485     // Check if we have a NOT
32486     Value *PeekI;
32487     if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
32488         match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
32489       Not = true;
32490       I = dyn_cast<Instruction>(PeekI);
32491
32492       // If I is constant, it will fold and we can evaluate later. If its an
32493       // argument or something of that nature, we can't analyze.
32494       if (I == nullptr)
32495         return {nullptr, UndefBit};
32496     }
32497     // We can only use 1 << X without more sophisticated analysis. C << X where
32498     // C is a power of 2 but not 1 can result in zero which cannot be translated
32499     // to bittest. Likewise any C >> X (either arith or logical) can be zero.
32500     if (I->getOpcode() == Instruction::Shl) {
32501       // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
32502       // -X` and some other provable power of 2 patterns that we can use CTZ on
32503       // may be profitable.
32504       // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
32505       // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
32506       // be provably a non-zero power of 2.
32507       // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
32508       // transformable to bittest.
32509       auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
32510       if (!ShiftVal)
32511         return {nullptr, UndefBit};
32512       if (ShiftVal->equalsInt(1))
32513         BTK = Not ? NotShiftBit : ShiftBit;
32514
32515       if (BTK == UndefBit)
32516         return {nullptr, UndefBit};
32517
32518       Value *BitV = I->getOperand(1);
32519
32520       Value *AndOp;
32521       const APInt *AndC;
32522       if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
32523         // Read past a shiftmask instruction to find count
32524         if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
32525           BitV = AndOp;
32526       }
32527       return {BitV, BTK};
32528     }
32529   }
32530   return {nullptr, UndefBit};
32531 }
32532
32533 TargetLowering::AtomicExpansionKind
32534 X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
32535   using namespace llvm::PatternMatch;
32536   // If the atomicrmw's result isn't actually used, we can just add a "lock"
32537   // prefix to a normal instruction for these operations.
32538   if (AI->use_empty())
32539     return AtomicExpansionKind::None;
32540
32541   if (AI->getOperation() == AtomicRMWInst::Xor) {
32542     // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
32543     // preferable to both `cmpxchg` and `btc`.
32544     if (match(AI->getOperand(1), m_SignMask()))
32545       return AtomicExpansionKind::None;
32546   }
32547
32548   // If the atomicrmw's result is used by a single bit AND, we may use
32549   // bts/btr/btc instruction for these operations.
32550   // Note: InstCombinePass can cause a de-optimization here. It replaces the
32551   // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
32552   // (depending on CC). This pattern can only use bts/btr/btc but we don't
32553   // detect it.
32554   Instruction *I = AI->user_back();
32555   auto BitChange = FindSingleBitChange(AI->getValOperand());
32556   if (BitChange.second == UndefBit || !AI->hasOneUse() ||
32557       I->getOpcode() != Instruction::And ||
32558       AI->getType()->getPrimitiveSizeInBits() == 8 ||
32559       AI->getParent() != I->getParent())
32560     return AtomicExpansionKind::CmpXChg;
32561
32562   unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
32563
32564   // This is a redundant AND, it should get cleaned up elsewhere.
32565   if (AI == I->getOperand(OtherIdx))
32566     return AtomicExpansionKind::CmpXChg;
32567
32568   // The following instruction must be a AND single bit.
32569   if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
32570     auto *C1 = cast<ConstantInt>(AI->getValOperand());
32571     auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
32572     if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
32573       return AtomicExpansionKind::CmpXChg;
32574     }
32575     if (AI->getOperation() == AtomicRMWInst::And) {
32576       return ~C1->getValue() == C2->getValue()
32577                  ? AtomicExpansionKind::BitTestIntrinsic
32578                  : AtomicExpansionKind::CmpXChg;
32579     }
32580     return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
32581                     : AtomicExpansionKind::CmpXChg;
32582   }
32583
32584   assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
32585
32586   auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
32587   if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
32588     return AtomicExpansionKind::CmpXChg;
32589
32590   assert(BitChange.first != nullptr && BitTested.first != nullptr);
32591
32592   // If shift amounts are not the same we can't use BitTestIntrinsic.
32593   if (BitChange.first != BitTested.first)
32594     return AtomicExpansionKind::CmpXChg;
32595
32596   // If atomic AND need to be masking all be one bit and testing the one bit
32597   // unset in the mask.
32598   if (AI->getOperation() == AtomicRMWInst::And)
32599     return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
32600                ? AtomicExpansionKind::BitTestIntrinsic
32601                : AtomicExpansionKind::CmpXChg;
32602
32603   // If atomic XOR/OR need to be setting and testing the same bit.
32604   return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
32605              ? AtomicExpansionKind::BitTestIntrinsic
32606              : AtomicExpansionKind::CmpXChg;
32607 }
32608
32609 void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
32610   IRBuilder<> Builder(AI);
32611   Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32612   Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
32613   Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
32614   switch (AI->getOperation()) {
32615   default:
32616     llvm_unreachable("Unknown atomic operation");
32617   case AtomicRMWInst::Or:
32618     IID_C = Intrinsic::x86_atomic_bts;
32619     IID_I = Intrinsic::x86_atomic_bts_rm;
32620     break;
32621   case AtomicRMWInst::Xor:
32622     IID_C = Intrinsic::x86_atomic_btc;
32623     IID_I = Intrinsic::x86_atomic_btc_rm;
32624     break;
32625   case AtomicRMWInst::And:
32626     IID_C = Intrinsic::x86_atomic_btr;
32627     IID_I = Intrinsic::x86_atomic_btr_rm;
32628     break;
32629   }
32630   Instruction *I = AI->user_back();
32631   LLVMContext &Ctx = AI->getContext();
32632   Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32633                                           Type::getInt8PtrTy(Ctx));
32634   Function *BitTest = nullptr;
32635   Value *Result = nullptr;
32636   auto BitTested = FindSingleBitChange(AI->getValOperand());
32637   assert(BitTested.first != nullptr);
32638
32639   if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
32640     auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
32641
32642     BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
32643
32644     unsigned Imm = llvm::countr_zero(C->getZExtValue());
32645     Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
32646   } else {
32647     BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
32648
32649     assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
32650
32651     Value *SI = BitTested.first;
32652     assert(SI != nullptr);
32653
32654     // BT{S|R|C} on memory operand don't modulo bit position so we need to
32655     // mask it.
32656     unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
32657     Value *BitPos =
32658         Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
32659     // Todo(1): In many cases it may be provable that SI is less than
32660     // ShiftBits in which case this mask is unnecessary
32661     // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
32662     // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
32663     // favor of just a raw BT{S|R|C}.
32664
32665     Result = Builder.CreateCall(BitTest, {Addr, BitPos});
32666     Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
32667
32668     // If the result is only used for zero/non-zero status then we don't need to
32669     // shift value back. Otherwise do so.
32670     for (auto It = I->user_begin(); It != I->user_end(); ++It) {
32671       if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
32672         if (ICmp->isEquality()) {
32673           auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
32674           auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
32675           if (C0 || C1) {
32676             assert(C0 == nullptr || C1 == nullptr);
32677             if ((C0 ? C0 : C1)->isZero())
32678               continue;
32679           }
32680         }
32681       }
32682       Result = Builder.CreateShl(Result, BitPos);
32683       break;
32684     }
32685   }
32686
32687   I->replaceAllUsesWith(Result);
32688   I->eraseFromParent();
32689   AI->eraseFromParent();
32690 }
32691
32692 static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
32693   using namespace llvm::PatternMatch;
32694   if (!AI->hasOneUse())
32695     return false;
32696
32697   Value *Op = AI->getOperand(1);
32698   ICmpInst::Predicate Pred;
32699   Instruction *I = AI->user_back();
32700   AtomicRMWInst::BinOp Opc = AI->getOperation();
32701   if (Opc == AtomicRMWInst::Add) {
32702     if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32703       return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32704     if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32705       if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32706         return Pred == CmpInst::ICMP_SLT;
32707       if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32708         return Pred == CmpInst::ICMP_SGT;
32709     }
32710     return false;
32711   }
32712   if (Opc == AtomicRMWInst::Sub) {
32713     if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32714       return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32715     if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32716       if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32717         return Pred == CmpInst::ICMP_SLT;
32718       if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32719         return Pred == CmpInst::ICMP_SGT;
32720     }
32721     return false;
32722   }
32723   if ((Opc == AtomicRMWInst::Or &&
32724        match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32725       (Opc == AtomicRMWInst::And &&
32726        match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32727     if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32728       return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32729              Pred == CmpInst::ICMP_SLT;
32730     if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32731       return Pred == CmpInst::ICMP_SGT;
32732     return false;
32733   }
32734   if (Opc == AtomicRMWInst::Xor) {
32735     if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32736       return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32737     if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32738       if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32739         return Pred == CmpInst::ICMP_SLT;
32740       if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32741         return Pred == CmpInst::ICMP_SGT;
32742     }
32743     return false;
32744   }
32745
32746   return false;
32747 }
32748
32749 void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32750     AtomicRMWInst *AI) const {
32751   IRBuilder<> Builder(AI);
32752   Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32753   Instruction *TempI = nullptr;
32754   LLVMContext &Ctx = AI->getContext();
32755   ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32756   if (!ICI) {
32757     TempI = AI->user_back();
32758     assert(TempI->hasOneUse() && "Must have one use");
32759     ICI = cast<ICmpInst>(TempI->user_back());
32760   }
32761   X86::CondCode CC = X86::COND_INVALID;
32762   ICmpInst::Predicate Pred = ICI->getPredicate();
32763   switch (Pred) {
32764   default:
32765     llvm_unreachable("Not supported Pred");
32766   case CmpInst::ICMP_EQ:
32767     CC = X86::COND_E;
32768     break;
32769   case CmpInst::ICMP_NE:
32770     CC = X86::COND_NE;
32771     break;
32772   case CmpInst::ICMP_SLT:
32773     CC = X86::COND_S;
32774     break;
32775   case CmpInst::ICMP_SGT:
32776     CC = X86::COND_NS;
32777     break;
32778   }
32779   Intrinsic::ID IID = Intrinsic::not_intrinsic;
32780   switch (AI->getOperation()) {
32781   default:
32782     llvm_unreachable("Unknown atomic operation");
32783   case AtomicRMWInst::Add:
32784     IID = Intrinsic::x86_atomic_add_cc;
32785     break;
32786   case AtomicRMWInst::Sub:
32787     IID = Intrinsic::x86_atomic_sub_cc;
32788     break;
32789   case AtomicRMWInst::Or:
32790     IID = Intrinsic::x86_atomic_or_cc;
32791     break;
32792   case AtomicRMWInst::And:
32793     IID = Intrinsic::x86_atomic_and_cc;
32794     break;
32795   case AtomicRMWInst::Xor:
32796     IID = Intrinsic::x86_atomic_xor_cc;
32797     break;
32798   }
32799   Function *CmpArith =
32800       Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
32801   Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32802                                           Type::getInt8PtrTy(Ctx));
32803   Value *Call = Builder.CreateCall(
32804       CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32805   Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32806   ICI->replaceAllUsesWith(Result);
32807   ICI->eraseFromParent();
32808   if (TempI)
32809     TempI->eraseFromParent();
32810   AI->eraseFromParent();
32811 }
32812
32813 TargetLowering::AtomicExpansionKind
32814 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32815   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32816   Type *MemType = AI->getType();
32817
32818   // If the operand is too big, we must see if cmpxchg8/16b is available
32819   // and default to library calls otherwise.
32820   if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32821     return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32822                                    : AtomicExpansionKind::None;
32823   }
32824
32825   AtomicRMWInst::BinOp Op = AI->getOperation();
32826   switch (Op) {
32827   case AtomicRMWInst::Xchg:
32828     return AtomicExpansionKind::None;
32829   case AtomicRMWInst::Add:
32830   case AtomicRMWInst::Sub:
32831     if (shouldExpandCmpArithRMWInIR(AI))
32832       return AtomicExpansionKind::CmpArithIntrinsic;
32833     // It's better to use xadd, xsub or xchg for these in other cases.
32834     return AtomicExpansionKind::None;
32835   case AtomicRMWInst::Or:
32836   case AtomicRMWInst::And:
32837   case AtomicRMWInst::Xor:
32838     if (shouldExpandCmpArithRMWInIR(AI))
32839       return AtomicExpansionKind::CmpArithIntrinsic;
32840     return shouldExpandLogicAtomicRMWInIR(AI);
32841   case AtomicRMWInst::Nand:
32842   case AtomicRMWInst::Max:
32843   case AtomicRMWInst::Min:
32844   case AtomicRMWInst::UMax:
32845   case AtomicRMWInst::UMin:
32846   case AtomicRMWInst::FAdd:
32847   case AtomicRMWInst::FSub:
32848   case AtomicRMWInst::FMax:
32849   case AtomicRMWInst::FMin:
32850   case AtomicRMWInst::UIncWrap:
32851   case AtomicRMWInst::UDecWrap:
32852   default:
32853     // These always require a non-trivial set of data operations on x86. We must
32854     // use a cmpxchg loop.
32855     return AtomicExpansionKind::CmpXChg;
32856   }
32857 }
32858
32859 LoadInst *
32860 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
        // Try to replace AI with an mfence followed by an atomic load from the same
        // address. On success AI is erased and the new load is returned; a nullptr
        // result means AI was left untouched. AI is expected to be an idempotent
        // atomicrmw; this function does not verify that itself.
32861   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32862   Type *MemType = AI->getType();
32863   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32864   // there is no benefit in turning such RMWs into loads, and it is actually
32865   // harmful as it introduces a mfence.
32866   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32867     return nullptr;
32868
32869   // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32870   // lowering available in lowerAtomicArith.
32871   // TODO: push more cases through this path.
32872   if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32873     if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32874         AI->use_empty())
32875       return nullptr;
32876
        // The builder inserts before AI. Nothing is emitted until the bail-out
        // checks below have passed.
32877   IRBuilder<> Builder(AI);
32878   Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32879   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
32880   auto SSID = AI->getSyncScopeID();
32881   // We must restrict the ordering to avoid generating loads with Release or
32882   // AcquireRelease orderings.
32883   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32884
32885   // Before the load we need a fence. Here is an example lifted from
32886   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32887   // is required:
32888   // Thread 0:
32889   //   x.store(1, relaxed);
32890   //   r1 = y.fetch_add(0, release);
32891   // Thread 1:
32892   //   y.fetch_add(42, acquire);
32893   //   r2 = x.load(relaxed);
32894   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32895   // lowered to just a load without a fence. A mfence flushes the store buffer,
32896   // making the optimization clearly correct.
32897   // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32898   // otherwise, we might be able to be more aggressive on relaxed idempotent
32899   // rmw. In practice, they do not look useful, so we don't try to be
32900   // especially clever.
32901   if (SSID == SyncScope::SingleThread)
32902     // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
32903     // the IR level, so we must wrap it in an intrinsic.
32904     return nullptr;
32905
32906   if (!Subtarget.hasMFence())
32907     // FIXME: it might make sense to use a locked operation here but on a
32908     // different cache-line to prevent cache-line bouncing. In practice it
32909     // is probably a small win, and x86 processors without mfence are rare
32910     // enough that we do not bother.
32911     return nullptr;
32912
32913   Function *MFence =
32914       llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
32915   Builder.CreateCall(MFence, {});
32916
32917   // Finally we can emit the atomic load.
        // It reuses AI's type, pointer operand and alignment, and is made atomic
        // with the restricted ordering and AI's sync scope computed above.
32918   LoadInst *Loaded = Builder.CreateAlignedLoad(
32919       AI->getType(), AI->getPointerOperand(), AI->getAlign());
32920   Loaded->setAtomic(Order, SSID);
32921   AI->replaceAllUsesWith(Loaded);
32922   AI->eraseFromParent();
32923   return Loaded;
32924 }
32925
32926 bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
        // Returns false unless SI.isUnordered() holds; in that case the answer is
        // the value of ExperimentalUnorderedISEL (declared outside this function).
32927   if (!SI.isUnordered())
32928     return false;
32929   return ExperimentalUnorderedISEL;
32930 }
32931 bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
        // Returns false unless LI.isUnordered() holds; in that case the answer is
        // the value of ExperimentalUnorderedISEL (declared outside this function).
32932   if (!LI.isUnordered())
32933     return false;
32934   return ExperimentalUnorderedISEL;
32935 }
32936
32937
32938 /// Emit a locked operation on a stack location which does not change any
32939 /// memory location, but does involve a lock prefix.  Location is chosen to be
32940 /// a) very likely accessed only by a single thread to minimize cache traffic,
32941 /// and b) definitely dereferenceable.  Returns the new Chain result.
32942 static SDValue emitLockedStackOp(SelectionDAG &DAG,
32943                                  const X86Subtarget &Subtarget, SDValue Chain,
32944                                  const SDLoc &DL) {
32945   // Implementation notes:
32946   // 1) LOCK prefix creates a full read/write reordering barrier for memory
32947   // operations issued by the current processor.  As such, the location
32948   // referenced is not relevant for the ordering properties of the instruction.
32949   // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32950   // 8.2.3.9  Loads and Stores Are Not Reordered with Locked Instructions
32951   // 2) Using an immediate operand appears to be the best encoding choice
32952   // here since it doesn't require an extra register.
32953   // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32954   // is small enough it might just be measurement noise.)
32955   // 4) When choosing offsets, there are several contributing factors:
32956   //   a) If there's no redzone, we default to TOS.  (We could allocate a cache
32957   //      line aligned stack object to improve this case.)
32958   //   b) To minimize our chances of introducing a false dependence, we prefer
32959   //      to offset the stack usage from TOS slightly.
32960   //   c) To minimize concerns about cross thread stack usage - in particular,
32961   //      the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32962   //      captures state in the TOS frame and accesses it from many threads -
32963   //      we want to use an offset such that the offset is in a distinct cache
32964   //      line from the TOS frame.
32965   //
32966   // For a general discussion of the tradeoffs and benchmark results, see:
32967   // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32968
32969   auto &MF = DAG.getMachineFunction();
32970   auto &TFL = *Subtarget.getFrameLowering();
        // Displacement from the stack pointer: -64 when a 128-byte red zone is
        // available, otherwise 0 (top of stack).
        // NOTE(review): -64 is stored in an unsigned and relies on wrap-around
        // before being emitted as the i32 displacement below - confirm intended.
32971   const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32972
        // 64-bit mode: address the slot through RSP.
32973   if (Subtarget.is64Bit()) {
32974     SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32975     SDValue Ops[] = {
32976       DAG.getRegister(X86::RSP, MVT::i64),                  // Base
32977       DAG.getTargetConstant(1, DL, MVT::i8),                // Scale
32978       DAG.getRegister(0, MVT::i64),                         // Index
32979       DAG.getTargetConstant(SPOffset, DL, MVT::i32),        // Disp
32980       DAG.getRegister(0, MVT::i16),                         // Segment.
32981       Zero,                                                 // Imm (OR with 0)
32982       Chain};
32983     SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32984                                      MVT::Other, Ops);
          // Value 1 of the machine node is its MVT::Other (chain) result.
32985     return SDValue(Res, 1);
32986   }
32987
        // 32-bit mode: the same operation, addressed through ESP.
32988   SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32989   SDValue Ops[] = {
32990     DAG.getRegister(X86::ESP, MVT::i32),            // Base
32991     DAG.getTargetConstant(1, DL, MVT::i8),          // Scale
32992     DAG.getRegister(0, MVT::i32),                   // Index
32993     DAG.getTargetConstant(SPOffset, DL, MVT::i32),  // Disp
32994     DAG.getRegister(0, MVT::i16),                   // Segment.
32995     Zero,                                           // Imm (OR with 0)
32996     Chain
32997   };
32998   SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32999                                    MVT::Other, Ops);
        // Value 1 of the machine node is its MVT::Other (chain) result.
33000   return SDValue(Res, 1);
33001 }
33002
33003 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
33004                                  SelectionDAG &DAG) {
        // Lower an atomic fence node. Operand 0 is the chain; operands 1 and 2 are
        // constants holding the AtomicOrdering and the SyncScope::ID respectively.
        // The result is a chain-only (MVT::Other) value.
33005   SDLoc dl(Op);
33006   AtomicOrdering FenceOrdering =
33007       static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
33008   SyncScope::ID FenceSSID =
33009       static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
33010
33011   // The only fence that needs an instruction is a sequentially-consistent
33012   // cross-thread fence.
33013   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
33014       FenceSSID == SyncScope::System) {
33015     if (Subtarget.hasMFence())
33016       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
33017
          // No MFENCE available: fall back to a locked operation on the stack.
33018     SDValue Chain = Op.getOperand(0);
33019     return emitLockedStackOp(DAG, Subtarget, Chain, dl);
33020   }
33021
33022   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
33023   return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
33024 }
33025
33026 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
33027                              SelectionDAG &DAG) {
        // Lower an atomic compare-and-swap node to X86ISD::LCMPXCHG_DAG. Operand 2
        // is copied into the accumulator register matching the value type before
        // the operation, and the result value is read back from that same register
        // afterwards. The success flag is derived from EFLAGS (COND_E).
        // NOTE(review): operand 2 is presumably the expected/compare value and
        // operand 3 the new value - confirm against the node definition.
33028   MVT T = Op.getSimpleValueType();
33029   SDLoc DL(Op);
33030   unsigned Reg = 0;
33031   unsigned size = 0;
        // Pick the accumulator register and the operand width in bytes.
33032   switch(T.SimpleTy) {
33033   default: llvm_unreachable("Invalid value type!");
33034   case MVT::i8:  Reg = X86::AL;  size = 1; break;
33035   case MVT::i16: Reg = X86::AX;  size = 2; break;
33036   case MVT::i32: Reg = X86::EAX; size = 4; break;
33037   case MVT::i64:
33038     assert(Subtarget.is64Bit() && "Node not type legal!");
33039     Reg = X86::RAX; size = 8;
33040     break;
33041   }
        // cpIn produces a chain (value 0) and glue (value 1); both feed the
        // LCMPXCHG_DAG node so the register copy stays attached to it.
33042   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
33043                                   Op.getOperand(2), SDValue());
33044   SDValue Ops[] = { cpIn.getValue(0),
33045                     Op.getOperand(1),
33046                     Op.getOperand(3),
33047                     DAG.getTargetConstant(size, DL, MVT::i8),
33048                     cpIn.getValue(1) };
33049   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
33050   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
33051   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
33052                                            Ops, T, MMO);
33053
        // Read the accumulator register, then EFLAGS, chaining and gluing each
        // copy to the previous node.
33054   SDValue cpOut =
33055     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
33056   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
33057                                       MVT::i32, cpOut.getValue(2));
33058   SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
33059
        // Results: value read from the register, success flag, output chain.
33060   return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
33061                      cpOut, Success, EFLAGS.getValue(1));
33062 }
33063
33064 // Create MOVMSKB, taking into account whether we need to split for AVX1.
      // v64i8 inputs are split in half recursively and combined into an i64 result;
      // v32i8 inputs without Int256 are split into two MOVMSK nodes combined into
      // an i32 result. Everything else becomes a single i32 X86ISD::MOVMSK node.
33065 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
33066                            const X86Subtarget &Subtarget) {
33067   MVT InVT = V.getSimpleValueType();
33068
33069   if (InVT == MVT::v64i8) {
33070     SDValue Lo, Hi;
33071     std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
33072     Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
33073     Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
          // Lo must be zero-extended because its upper 32 bits are OR'ed into the
          // result. Hi can be any-extended: its upper 32 bits are shifted out.
33074     Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
33075     Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
33076     Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
33077                      DAG.getConstant(32, DL, MVT::i8));
33078     return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
33079   }
33080   if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
33081     SDValue Lo, Hi;
33082     std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
33083     Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
33084     Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
          // Place the mask of the high half above the 16 bits of the low half.
33085     Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
33086                      DAG.getConstant(16, DL, MVT::i8));
33087     return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
33088   }
33089
33090   return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
33091 }
33092
33093 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
33094                             SelectionDAG &DAG) {
        // Custom lowering for a bitcast node. Handles, in order:
        //   1) i64 -> v64i1 (split into two v32i1 halves),
        //   2) v16i1/v32i1 -> scalar integer (via MOVMSK),
        //   3) i64 -> f64 and 64-bit vector -> x86mmx (via a 128-bit vector).
        // Returns an empty SDValue for the remaining combinations that reach the
        // final section, which tells the caller the node must be expanded.
33095   SDValue Src = Op.getOperand(0);
33096   MVT SrcVT = Src.getSimpleValueType();
33097   MVT DstVT = Op.getSimpleValueType();
33098
33099   // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
33100   // half to v32i1 and concatenating the result.
33101   if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
33102     assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
33103     assert(Subtarget.hasBWI() && "Expected BWI target");
33104     SDLoc dl(Op);
33105     SDValue Lo, Hi;
33106     std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
33107     Lo = DAG.getBitcast(MVT::v32i1, Lo);
33108     Hi = DAG.getBitcast(MVT::v32i1, Hi);
33109     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
33110   }
33111
33112   // Use MOVMSK for vector to scalar conversion to prevent scalarization.
33113   if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
33114     assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
          // Sign-extend each i1 to a full byte, collect the mask with
          // getPMOVMSKB, then zero-extend or truncate it to DstVT.
33115     MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
33116     SDLoc DL(Op);
33117     SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
33118     V = getPMOVMSKB(DL, V, DAG, Subtarget);
33119     return DAG.getZExtOrTrunc(V, DL, DstVT);
33120   }
33121
33122   assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
33123           SrcVT == MVT::i64) && "Unexpected VT!");
33124
33125   assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33126   if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
33127       !(DstVT == MVT::x86mmx && SrcVT.isVector()))
33128     // This conversion needs to be expanded.
33129     return SDValue();
33130
33131   SDLoc dl(Op);
33132   if (SrcVT.isVector()) {
33133     // Widen the vector in input in the case of MVT::v2i32.
33134     // Example: from MVT::v2i32 to MVT::v4i32.
          // The added upper half is undef.
33135     MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
33136                                  SrcVT.getVectorNumElements() * 2);
33137     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
33138                       DAG.getUNDEF(SrcVT));
33139   } else {
33140     assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
33141            "Unexpected source type in LowerBITCAST");
33142     Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
33143   }
33144
        // Reinterpret the 128-bit value as two 64-bit lanes: v2f64 when an f64 is
        // extracted below, v2i64 when it feeds MOVDQ2Q.
33145   MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
33146   Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
33147
33148   if (DstVT == MVT::x86mmx)
33149     return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
33150
33151   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
33152                      DAG.getIntPtrConstant(0, dl));
33153 }
33154
33155 /// Compute the horizontal sum of bytes in V for the elements of VT.
33156 ///
33157 /// Requires V to be a byte vector and VT to be an integer vector type with
33158 /// wider elements than V's type. The width of the elements of VT determines
33159 /// how many bytes of V are summed horizontally to produce each element of the
33160 /// result.
      ///
      /// V and VT must have the same total size in bits (asserted below). The
      /// Subtarget parameter is not used by this function.
33161 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
33162                                       const X86Subtarget &Subtarget,
33163                                       SelectionDAG &DAG) {
33164   SDLoc DL(V);
33165   MVT ByteVecVT = V.getSimpleValueType();
33166   MVT EltVT = VT.getVectorElementType();
33167   assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
33168          "Expected value to have byte element type.");
33169   assert(EltVT != MVT::i8 &&
33170          "Horizontal byte sum only makes sense for wider elements!");
33171   unsigned VecSize = VT.getSizeInBits();
33172   assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
33173
33174   // The PSADBW instruction horizontally adds all bytes and leaves the result
33175   // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
33176   if (EltVT == MVT::i64) {
33177     SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
33178     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
33179     V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
33180     return DAG.getBitcast(VT, V);
33181   }
33182
33183   if (EltVT == MVT::i32) {
33184     // We unpack the low half and high half into i32s interleaved with zeros so
33185     // that we can use PSADBW to horizontally sum them. The most useful part of
33186     // this is that it lines up the results of two PSADBW instructions to be
33187     // two v2i64 vectors which concatenated are the 4 population counts. We can
33188     // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
33189     SDValue Zeros = DAG.getConstant(0, DL, VT);
33190     SDValue V32 = DAG.getBitcast(VT, V);
33191     SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
33192     SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
33193
33194     // Do the horizontal sums into two v2i64s.
          // Zeros is rebuilt with the byte vector type expected by PSADBW.
33195     Zeros = DAG.getConstant(0, DL, ByteVecVT);
33196     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
33197     Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
33198                       DAG.getBitcast(ByteVecVT, Low), Zeros);
33199     High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
33200                        DAG.getBitcast(ByteVecVT, High), Zeros);
33201
33202     // Merge them together.
33203     MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
33204     V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
33205                     DAG.getBitcast(ShortVecVT, Low),
33206                     DAG.getBitcast(ShortVecVT, High));
33207
33208     return DAG.getBitcast(VT, V);
33209   }
33210
33211   // The only element type left is i16.
33212   assert(EltVT == MVT::i16 && "Unknown how to handle type");
33213
33214   // To obtain pop count for each i16 element starting from the pop count for
33215   // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
33216   // right by 8. It is important to shift as i16s as i8 vector shift isn't
33217   // directly supported.
33218   SDValue ShifterV = DAG.getConstant(8, DL, VT);
33219   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
33220   V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
33221                   DAG.getBitcast(ByteVecVT, V));
33222   return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
33223 }
33224
33225 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
33226                                         const X86Subtarget &Subtarget,
33227                                         SelectionDAG &DAG) {
        // Compute the per-byte population count of the vXi8 value Op using two
        // PSHUFB lookups into a 16-entry table held in a vector constant. Note
        // that Op here is the value to count, not a CTPOP node. The Subtarget
        // parameter is not used by this function.
33228   MVT VT = Op.getSimpleValueType();
33229   MVT EltVT = VT.getVectorElementType();
33230   int NumElts = VT.getVectorNumElements();
        // EltVT is only referenced by the assert below.
33231   (void)EltVT;
33232   assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
33233
33234   // Implement a lookup table in register by using an algorithm based on:
33235   // http://wm.ite.pl/articles/sse-popcount.html
33236   //
33237   // The general idea is that every lower byte nibble in the input vector is an
33238   // index into an in-register pre-computed pop count table. We then split up
33239   // the input vector in two new ones: (1) a vector with only the shifted-right
33240   // higher nibbles for each byte and (2) a vector with the lower nibbles (and
33241   // masked out higher ones) for each byte. PSHUFB is used separately with both
33242   // to index the in-register table. Next, both are added and the result is a
33243   // i8 vector where each element contains the pop count for input byte.
33244   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
33245                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
33246                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
33247                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
33248
        // Repeat the 16-entry table across all NumElts bytes of the vector.
33249   SmallVector<SDValue, 64> LUTVec;
33250   for (int i = 0; i < NumElts; ++i)
33251     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
33252   SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
33253   SDValue M0F = DAG.getConstant(0x0F, DL, VT);
33254
33255   // High nibbles
33256   SDValue FourV = DAG.getConstant(4, DL, VT);
33257   SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
33258
33259   // Low nibbles
33260   SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
33261
33262   // The input vector is used as the shuffle mask that index elements into the
33263   // LUT. After counting low and high nibbles, add the vector to obtain the
33264   // final pop count per i8 element.
33265   SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
33266   SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
33267   return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
33268 }
33269
33270 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
33271 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
      //
      // Lowers a 128/256/512-bit vector CTPOP node. Returns an empty SDValue when
      // no custom lowering applies (vXi8 without SSSE3), leaving the node to
      // LegalizeDAG.
33272 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
33273                                 SelectionDAG &DAG) {
33274   MVT VT = Op.getSimpleValueType();
33275   assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
33276          "Unknown CTPOP type to handle");
33277   SDLoc DL(Op.getNode());
33278   SDValue Op0 = Op.getOperand(0);
33279
33280   // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
33281   if (Subtarget.hasVPOPCNTDQ()) {
33282     unsigned NumElems = VT.getVectorNumElements();
33283     assert((VT.getVectorElementType() == MVT::i8 ||
33284             VT.getVectorElementType() == MVT::i16) && "Unexpected type");
          // Only extend when the vXi32 type has fewer than 16 elements, or has
          // exactly 16 (512 bits) and the subtarget reports canExtendTo512DQ().
33285     if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
33286       MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
33287       Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
33288       Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
33289       return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
33290     }
33291   }
33292
33293   // Decompose 256-bit ops into smaller 128-bit ops.
33294   if (VT.is256BitVector() && !Subtarget.hasInt256())
33295     return splitVectorIntUnary(Op, DAG);
33296
33297   // Decompose 512-bit ops into smaller 256-bit ops.
33298   if (VT.is512BitVector() && !Subtarget.hasBWI())
33299     return splitVectorIntUnary(Op, DAG);
33300
33301   // For element types greater than i8, do vXi8 pop counts and a bytesum.
33302   if (VT.getScalarType() != MVT::i8) {
33303     MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
33304     SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
33305     SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
33306     return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
33307   }
33308
33309   // We can't use the fast LUT approach, so fall back on LegalizeDAG.
33310   if (!Subtarget.hasSSSE3())
33311     return SDValue();
33312
        // vXi8 with SSSE3: PSHUFB-based lookup on the CTPOP operand itself.
33313   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
33314 }
33315
33316 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
33317                           SelectionDAG &DAG) {
        // Entry point for custom CTPOP lowering; only vector types are expected
        // here and all work is delegated to LowerVectorCTPOP.
33318   assert(Op.getSimpleValueType().isVector() &&
33319          "We only do custom lowering for vector population count.");
33320   return LowerVectorCTPOP(Op, Subtarget, DAG);
33321 }
33322
33323 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
        // Lower a BITREVERSE node using X86ISD::VPPERM. Scalars are moved into a
        // 128-bit vector first, 256-bit vectors are split, and 128-bit vectors are
        // handled with a single VPPERM whose mask both reverses the bits of each
        // byte and reverses the byte order within each element.
33324   MVT VT = Op.getSimpleValueType();
33325   SDValue In = Op.getOperand(0);
33326   SDLoc DL(Op);
33327
33328   // For scalars, it's still beneficial to transfer to/from the SIMD unit to
33329   // perform the BITREVERSE.
33330   if (!VT.isVector()) {
33331     MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
33332     SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
33333     Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
33334     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
33335                        DAG.getIntPtrConstant(0, DL));
33336   }
33337
33338   int NumElts = VT.getVectorNumElements();
33339   int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
33340
33341   // Decompose 256-bit ops into smaller 128-bit ops.
33342   if (VT.is256BitVector())
33343     return splitVectorIntUnary(Op, DAG);
33344
33345   assert(VT.is128BitVector() &&
33346          "Only 128-bit vector bitreverse lowering supported.");
33347
33348   // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
33349   // perform the BSWAP in the shuffle.
33350   // It's best to shuffle using the second operand as this will implicitly allow
33351   // memory folding for multiple vectors.
33352   SmallVector<SDValue, 16> MaskElts;
33353   for (int i = 0; i != NumElts; ++i) {
          // Walk the bytes of element i from last to first so the byte order
          // within the element is swapped. The +16 bias selects bytes from the
          // second source operand.
33354     for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
33355       int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
33356       int PermuteByte = SourceByte | (2 << 5);
33357       MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
33358     }
33359   }
33360
33361   SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
33362   SDValue Res = DAG.getBitcast(MVT::v16i8, In);
        // The first source is undef: every mask entry refers to the second source.
33363   Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
33364                     Res, Mask);
33365   return DAG.getBitcast(VT, Res);
33366 }
33367
33368 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
33369                                SelectionDAG &DAG) {
        // Lower a BITREVERSE node. With XOP (and a non-512-bit type) this defers
        // to LowerBITREVERSE_XOP. Otherwise only byte vectors are supported: wide
        // vectors are split when the subtarget lacks the needed features, GFNI
        // targets use GF2P8AFFINEQB, and everything else uses two PSHUFB nibble
        // lookups.
33370   MVT VT = Op.getSimpleValueType();
33371
33372   if (Subtarget.hasXOP() && !VT.is512BitVector())
33373     return LowerBITREVERSE_XOP(Op, DAG);
33374
33375   assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
33376
33377   SDValue In = Op.getOperand(0);
33378   SDLoc DL(Op);
33379
33380   assert(VT.getScalarType() == MVT::i8 &&
33381          "Only byte vector BITREVERSE supported");
33382
33383   // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
33384   if (VT == MVT::v64i8 && !Subtarget.hasBWI())
33385     return splitVectorIntUnary(Op, DAG);
33386
33387   // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
33388   if (VT == MVT::v32i8 && !Subtarget.hasInt256())
33389     return splitVectorIntUnary(Op, DAG);
33390
33391   unsigned NumElts = VT.getVectorNumElements();
33392
33393   // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
33394   if (Subtarget.hasGFNI()) {
          // The same 64-bit matrix constant is splatted into every i64 lane and
          // then reinterpreted as bytes; the immediate operand is 0.
33395     MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
33396     SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
33397     Matrix = DAG.getBitcast(VT, Matrix);
33398     return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
33399                        DAG.getTargetConstant(0, DL, MVT::i8));
33400   }
33401
33402   // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
33403   // two nibbles and a PSHUFB lookup to find the bitreverse of each
33404   // 0-15 value (moved to the other nibble).
33405   SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
33406   SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
33407   SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
33408
        // LoLUT maps a low nibble to its bit-reversed value placed in the high
        // nibble; HiLUT maps a high nibble to its bit-reversed value placed in the
        // low nibble.
33409   const int LoLUT[16] = {
33410       /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
33411       /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
33412       /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
33413       /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
33414   const int HiLUT[16] = {
33415       /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
33416       /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
33417       /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
33418       /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
33419
        // Repeat both 16-entry tables across all NumElts bytes of the vector.
33420   SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
33421   for (unsigned i = 0; i < NumElts; ++i) {
33422     LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
33423     HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
33424   }
33425
33426   SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
33427   SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
33428   Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
33429   Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
33430   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
33431 }
33432
33433 static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
33434                            SelectionDAG &DAG) {
        // Lower a scalar parity node by folding the input down to 8 bits with XORs
        // and reading the inverted parity flag with SETNP. Returns an empty
        // SDValue (use the default expansion) when POPCNT is available and the
        // input is not already known to fit in 8 bits.
33435   SDLoc DL(Op);
33436   SDValue X = Op.getOperand(0);
33437   MVT VT = Op.getSimpleValueType();
33438
33439   // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
33440   if (VT == MVT::i8 ||
33441       DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
33442     X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
33443     SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
33444                                 DAG.getConstant(0, DL, MVT::i8));
33445     // Copy the inverse of the parity flag into a register with setcc.
33446     SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33447     // Extend to the original type.
33448     return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33449   }
33450
33451   // If we have POPCNT, use the default expansion.
33452   if (Subtarget.hasPOPCNT())
33453     return SDValue();
33454
33455   if (VT == MVT::i64) {
33456     // Xor the high and low 32-bit halves together, leaving a 32-bit value.
33457     SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
33458                              DAG.getNode(ISD::SRL, DL, MVT::i64, X,
33459                                          DAG.getConstant(32, DL, MVT::i8)));
33460     SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
33461     X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
33462   }
33463
        // At this point X is i32 unless VT is i16.
33464   if (VT != MVT::i16) {
33465     // Xor the high and low 16-bits together using a 32-bit operation.
33466     SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
33467                                DAG.getConstant(16, DL, MVT::i8));
33468     X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
33469   } else {
33470     // If the input is 16-bits, we need to extend to use an i32 shift below.
33471     X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
33472   }
33473
33474   // Finally xor the low 2 bytes together and use a 8-bit flag setting xor.
33475   // This should allow an h-reg to be used to save a shift.
33476   SDValue Hi = DAG.getNode(
33477       ISD::TRUNCATE, DL, MVT::i8,
33478       DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
33479   SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
        // X86ISD::XOR yields the i8 result (value 0) and the flags (value 1);
        // only the flags are used.
33480   SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
33481   SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
33482
33483   // Copy the inverse of the parity flag into a register with setcc.
33484   SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33485   // Extend to the original type.
33486   return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33487 }
33488
33489 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
33490                                         const X86Subtarget &Subtarget) {
        // Rewrite an ISD::ATOMIC_LOAD_{ADD,SUB,OR,XOR,AND} node as the matching
        // X86ISD::L{ADD,SUB,OR,XOR,AND} memory-intrinsic node. The new node reuses
        // N's first three operands and memory operand and produces (i32, Other).
        // Any other opcode is unreachable. The Subtarget parameter is not used by
        // this function.
33491   unsigned NewOpc = 0;
33492   switch (N->getOpcode()) {
33493   case ISD::ATOMIC_LOAD_ADD:
33494     NewOpc = X86ISD::LADD;
33495     break;
33496   case ISD::ATOMIC_LOAD_SUB:
33497     NewOpc = X86ISD::LSUB;
33498     break;
33499   case ISD::ATOMIC_LOAD_OR:
33500     NewOpc = X86ISD::LOR;
33501     break;
33502   case ISD::ATOMIC_LOAD_XOR:
33503     NewOpc = X86ISD::LXOR;
33504     break;
33505   case ISD::ATOMIC_LOAD_AND:
33506     NewOpc = X86ISD::LAND;
33507     break;
33508   default:
33509     llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
33510   }
33511
33512   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
33513
        // The memory VT is the value type of N's result 0.
33514   return DAG.getMemIntrinsicNode(
33515       NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
33516       {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
33517       /*MemVT=*/N->getSimpleValueType(0), MMO);
33518 }
33519
33520 /// Lower atomic_load_ops into LOCK-prefixed operations.
33521 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
33522                                 const X86Subtarget &Subtarget) {
33523   AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
33524   SDValue Chain = N->getOperand(0);
33525   SDValue LHS = N->getOperand(1);
33526   SDValue RHS = N->getOperand(2);
33527   unsigned Opc = N->getOpcode();
33528   MVT VT = N->getSimpleValueType(0);
33529   SDLoc DL(N);
33530
33531   // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
33532   // can only be lowered when the result is unused.  They should have already
33533   // been transformed into a cmpxchg loop in AtomicExpand.
33534   if (N->hasAnyUseOfValue(0)) {
33535     // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
33536     // select LXADD if LOCK_SUB can't be selected.
33537     // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
33538     // can use LXADD as opposed to cmpxchg.
33539     if (Opc == ISD::ATOMIC_LOAD_SUB ||
33540         (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS))) {
33541       RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
33542       return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS,
33543                            AN->getMemOperand());
33544     }
33545     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
33546            "Used AtomicRMW ops other than Add should have been expanded!");
33547     return N;
33548   }
33549
33550   // Specialized lowering for the canonical form of an idempotent atomicrmw.
33551   // The core idea here is that since the memory location isn't actually
33552   // changing, all we need is a lowering for the *ordering* impacts of the
33553   // atomicrmw.  As such, we can choose a different operation and memory
33554   // location to minimize impact on other code.
33555   // The above holds unless the node is marked volatile in which
33556   // case it needs to be preserved according to the langref.
33557   if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
33558     // On X86, the only ordering which actually requires an instruction is
33559     // seq_cst which isn't SingleThread, everything just needs to be preserved
33560     // during codegen and then dropped. Note that we expect (but don't assume),
33561     // that orderings other than seq_cst and acq_rel have been canonicalized to
33562     // a store or load.
33563     if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
33564         AN->getSyncScopeID() == SyncScope::System) {
33565       // Prefer a locked operation against a stack location to minimize cache
33566       // traffic.  This assumes that stack locations are very likely to be
33567       // accessed only by the owning thread.
33568       SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
33569       assert(!N->hasAnyUseOfValue(0));
33570       // NOTE: The getUNDEF is needed to give something for the unused result 0.
33571       return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33572                          DAG.getUNDEF(VT), NewChain);
33573     }
33574     // MEMBARRIER is a compiler barrier; it codegens to a no-op.
33575     SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
33576     assert(!N->hasAnyUseOfValue(0));
33577     // NOTE: The getUNDEF is needed to give something for the unused result 0.
33578     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33579                        DAG.getUNDEF(VT), NewChain);
33580   }
33581
33582   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
33583   // RAUW the chain, but don't worry about the result, as it's unused.
33584   assert(!N->hasAnyUseOfValue(0));
33585   // NOTE: The getUNDEF is needed to give something for the unused result 0.
33586   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33587                      DAG.getUNDEF(VT), LockOp.getValue(1));
33588 }
33589
33590 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
33591                                  const X86Subtarget &Subtarget) {
33592   auto *Node = cast<AtomicSDNode>(Op.getNode());
33593   SDLoc dl(Node);
33594   EVT VT = Node->getMemoryVT();
33595
33596   bool IsSeqCst =
33597       Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
33598   bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
33599
33600   // If this store is not sequentially consistent and the type is legal
33601   // we can just keep it.
33602   if (!IsSeqCst && IsTypeLegal)
33603     return Op;
33604
33605   if (VT == MVT::i64 && !IsTypeLegal) {
33606     // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33607     // is enabled.
33608     bool NoImplicitFloatOps =
33609         DAG.getMachineFunction().getFunction().hasFnAttribute(
33610             Attribute::NoImplicitFloat);
33611     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
33612       SDValue Chain;
33613       if (Subtarget.hasSSE1()) {
33614         SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
33615                                        Node->getOperand(2));
33616         MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33617         SclToVec = DAG.getBitcast(StVT, SclToVec);
33618         SDVTList Tys = DAG.getVTList(MVT::Other);
33619         SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33620         Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33621                                         MVT::i64, Node->getMemOperand());
33622       } else if (Subtarget.hasX87()) {
33623         // First load this into an 80-bit X87 register using a stack temporary.
33624         // This will put the whole integer into the significand.
33625         SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33626         int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33627         MachinePointerInfo MPI =
33628             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33629         Chain =
33630             DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
33631                          MPI, MaybeAlign(), MachineMemOperand::MOStore);
33632         SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33633         SDValue LdOps[] = {Chain, StackPtr};
33634         SDValue Value = DAG.getMemIntrinsicNode(
33635             X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33636             /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33637         Chain = Value.getValue(1);
33638
33639         // Now use an FIST to do the atomic store.
33640         SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33641         Chain =
33642             DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33643                                     StoreOps, MVT::i64, Node->getMemOperand());
33644       }
33645
33646       if (Chain) {
33647         // If this is a sequentially consistent store, also emit an appropriate
33648         // barrier.
33649         if (IsSeqCst)
33650           Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33651
33652         return Chain;
33653       }
33654     }
33655   }
33656
33657   // Convert seq_cst store -> xchg
33658   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33659   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33660   SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
33661                                Node->getMemoryVT(),
33662                                Node->getOperand(0),
33663                                Node->getOperand(1), Node->getOperand(2),
33664                                Node->getMemOperand());
33665   return Swap.getValue(1);
33666 }
33667
33668 static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
33669   SDNode *N = Op.getNode();
33670   MVT VT = N->getSimpleValueType(0);
33671   unsigned Opc = Op.getOpcode();
33672
33673   // Let legalize expand this if it isn't a legal type yet.
33674   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33675     return SDValue();
33676
33677   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33678   SDLoc DL(N);
33679
33680   // Set the carry flag.
33681   SDValue Carry = Op.getOperand(2);
33682   EVT CarryVT = Carry.getValueType();
33683   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33684                       Carry, DAG.getAllOnesConstant(DL, CarryVT));
33685
33686   bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33687   SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33688                             Op.getOperand(0), Op.getOperand(1),
33689                             Carry.getValue(1));
33690
33691   bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33692   SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33693                            Sum.getValue(1), DL, DAG);
33694   if (N->getValueType(1) == MVT::i1)
33695     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33696
33697   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33698 }
33699
33700 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33701                             SelectionDAG &DAG) {
33702   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33703
33704   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33705   // which returns the values as { float, float } (in XMM0) or
33706   // { double, double } (which is returned in XMM0, XMM1).
33707   SDLoc dl(Op);
33708   SDValue Arg = Op.getOperand(0);
33709   EVT ArgVT = Arg.getValueType();
33710   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33711
33712   TargetLowering::ArgListTy Args;
33713   TargetLowering::ArgListEntry Entry;
33714
33715   Entry.Node = Arg;
33716   Entry.Ty = ArgTy;
33717   Entry.IsSExt = false;
33718   Entry.IsZExt = false;
33719   Args.push_back(Entry);
33720
33721   bool isF64 = ArgVT == MVT::f64;
33722   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33723   // the small struct {f32, f32} is returned in (eax, edx). For f64,
33724   // the results are returned via SRet in memory.
33725   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33726   RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33727   const char *LibcallName = TLI.getLibcallName(LC);
33728   SDValue Callee =
33729       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33730
33731   Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33732                       : (Type *)FixedVectorType::get(ArgTy, 4);
33733
33734   TargetLowering::CallLoweringInfo CLI(DAG);
33735   CLI.setDebugLoc(dl)
33736       .setChain(DAG.getEntryNode())
33737       .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33738
33739   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33740
33741   if (isF64)
33742     // Returned in xmm0 and xmm1.
33743     return CallResult.first;
33744
33745   // Returned in bits 0:31 and 32:64 xmm0.
33746   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33747                                CallResult.first, DAG.getIntPtrConstant(0, dl));
33748   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33749                                CallResult.first, DAG.getIntPtrConstant(1, dl));
33750   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33751   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33752 }
33753
33754 /// Widen a vector input to a vector of NVT.  The
33755 /// input vector must have the same element type as NVT.
33756 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33757                             bool FillWithZeroes = false) {
33758   // Check if InOp already has the right width.
33759   MVT InVT = InOp.getSimpleValueType();
33760   if (InVT == NVT)
33761     return InOp;
33762
33763   if (InOp.isUndef())
33764     return DAG.getUNDEF(NVT);
33765
33766   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
33767          "input and widen element type must match");
33768
33769   unsigned InNumElts = InVT.getVectorNumElements();
33770   unsigned WidenNumElts = NVT.getVectorNumElements();
33771   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33772          "Unexpected request for vector widening");
33773
33774   SDLoc dl(InOp);
33775   if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
33776       InOp.getNumOperands() == 2) {
33777     SDValue N1 = InOp.getOperand(1);
33778     if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33779         N1.isUndef()) {
33780       InOp = InOp.getOperand(0);
33781       InVT = InOp.getSimpleValueType();
33782       InNumElts = InVT.getVectorNumElements();
33783     }
33784   }
33785   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33786       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33787     SmallVector<SDValue, 16> Ops;
33788     for (unsigned i = 0; i < InNumElts; ++i)
33789       Ops.push_back(InOp.getOperand(i));
33790
33791     EVT EltVT = InOp.getOperand(0).getValueType();
33792
33793     SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
33794       DAG.getUNDEF(EltVT);
33795     for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
33796       Ops.push_back(FillVal);
33797     return DAG.getBuildVector(NVT, dl, Ops);
33798   }
33799   SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
33800     DAG.getUNDEF(NVT);
33801   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
33802                      InOp, DAG.getIntPtrConstant(0, dl));
33803 }
33804
33805 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33806                              SelectionDAG &DAG) {
33807   assert(Subtarget.hasAVX512() &&
33808          "MGATHER/MSCATTER are supported on AVX-512 arch only");
33809
33810   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33811   SDValue Src = N->getValue();
33812   MVT VT = Src.getSimpleValueType();
33813   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33814   SDLoc dl(Op);
33815
33816   SDValue Scale = N->getScale();
33817   SDValue Index = N->getIndex();
33818   SDValue Mask = N->getMask();
33819   SDValue Chain = N->getChain();
33820   SDValue BasePtr = N->getBasePtr();
33821
33822   if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33823     assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33824     // If the index is v2i64 and we have VLX we can use xmm for data and index.
33825     if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33826       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33827       EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33828       Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33829       SDVTList VTs = DAG.getVTList(MVT::Other);
33830       SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33831       return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33832                                      N->getMemoryVT(), N->getMemOperand());
33833     }
33834     return SDValue();
33835   }
33836
33837   MVT IndexVT = Index.getSimpleValueType();
33838
33839   // If the index is v2i32, we're being called by type legalization and we
33840   // should just let the default handling take care of it.
33841   if (IndexVT == MVT::v2i32)
33842     return SDValue();
33843
33844   // If we don't have VLX and neither the passthru or index is 512-bits, we
33845   // need to widen until one is.
33846   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33847       !Index.getSimpleValueType().is512BitVector()) {
33848     // Determine how much we need to widen by to get a 512-bit type.
33849     unsigned Factor = std::min(512/VT.getSizeInBits(),
33850                                512/IndexVT.getSizeInBits());
33851     unsigned NumElts = VT.getVectorNumElements() * Factor;
33852
33853     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33854     IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33855     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33856
33857     Src = ExtendToType(Src, VT, DAG);
33858     Index = ExtendToType(Index, IndexVT, DAG);
33859     Mask = ExtendToType(Mask, MaskVT, DAG, true);
33860   }
33861
33862   SDVTList VTs = DAG.getVTList(MVT::Other);
33863   SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33864   return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33865                                  N->getMemoryVT(), N->getMemOperand());
33866 }
33867
33868 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33869                           SelectionDAG &DAG) {
33870
33871   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33872   MVT VT = Op.getSimpleValueType();
33873   MVT ScalarVT = VT.getScalarType();
33874   SDValue Mask = N->getMask();
33875   MVT MaskVT = Mask.getSimpleValueType();
33876   SDValue PassThru = N->getPassThru();
33877   SDLoc dl(Op);
33878
33879   // Handle AVX masked loads which don't support passthru other than 0.
33880   if (MaskVT.getVectorElementType() != MVT::i1) {
33881     // We also allow undef in the isel pattern.
33882     if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33883       return Op;
33884
33885     SDValue NewLoad = DAG.getMaskedLoad(
33886         VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33887         getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33888         N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33889         N->isExpandingLoad());
33890     // Emit a blend.
33891     SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33892     return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33893   }
33894
33895   assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33896          "Expanding masked load is supported on AVX-512 target only!");
33897
33898   assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33899          "Expanding masked load is supported for 32 and 64-bit types only!");
33900
33901   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33902          "Cannot lower masked load op.");
33903
33904   assert((ScalarVT.getSizeInBits() >= 32 ||
33905           (Subtarget.hasBWI() &&
33906               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33907          "Unsupported masked load op.");
33908
33909   // This operation is legal for targets with VLX, but without
33910   // VLX the vector should be widened to 512 bit
33911   unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33912   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33913   PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33914
33915   // Mask element has to be i1.
33916   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33917          "Unexpected mask type");
33918
33919   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33920
33921   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33922   SDValue NewLoad = DAG.getMaskedLoad(
33923       WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33924       PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33925       N->getExtensionType(), N->isExpandingLoad());
33926
33927   SDValue Extract =
33928       DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33929                   DAG.getIntPtrConstant(0, dl));
33930   SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33931   return DAG.getMergeValues(RetOps, dl);
33932 }
33933
33934 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33935                            SelectionDAG &DAG) {
33936   MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33937   SDValue DataToStore = N->getValue();
33938   MVT VT = DataToStore.getSimpleValueType();
33939   MVT ScalarVT = VT.getScalarType();
33940   SDValue Mask = N->getMask();
33941   SDLoc dl(Op);
33942
33943   assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33944          "Expanding masked load is supported on AVX-512 target only!");
33945
33946   assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33947          "Expanding masked load is supported for 32 and 64-bit types only!");
33948
33949   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33950          "Cannot lower masked store op.");
33951
33952   assert((ScalarVT.getSizeInBits() >= 32 ||
33953           (Subtarget.hasBWI() &&
33954               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33955           "Unsupported masked store op.");
33956
33957   // This operation is legal for targets with VLX, but without
33958   // VLX the vector should be widened to 512 bit
33959   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33960   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33961
33962   // Mask element has to be i1.
33963   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33964          "Unexpected mask type");
33965
33966   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33967
33968   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33969   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33970   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33971                             N->getOffset(), Mask, N->getMemoryVT(),
33972                             N->getMemOperand(), N->getAddressingMode(),
33973                             N->isTruncatingStore(), N->isCompressingStore());
33974 }
33975
33976 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33977                             SelectionDAG &DAG) {
33978   assert(Subtarget.hasAVX2() &&
33979          "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33980
33981   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33982   SDLoc dl(Op);
33983   MVT VT = Op.getSimpleValueType();
33984   SDValue Index = N->getIndex();
33985   SDValue Mask = N->getMask();
33986   SDValue PassThru = N->getPassThru();
33987   MVT IndexVT = Index.getSimpleValueType();
33988
33989   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33990
33991   // If the index is v2i32, we're being called by type legalization.
33992   if (IndexVT == MVT::v2i32)
33993     return SDValue();
33994
33995   // If we don't have VLX and neither the passthru or index is 512-bits, we
33996   // need to widen until one is.
33997   MVT OrigVT = VT;
33998   if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33999       !IndexVT.is512BitVector()) {
34000     // Determine how much we need to widen by to get a 512-bit type.
34001     unsigned Factor = std::min(512/VT.getSizeInBits(),
34002                                512/IndexVT.getSizeInBits());
34003
34004     unsigned NumElts = VT.getVectorNumElements() * Factor;
34005
34006     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
34007     IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
34008     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
34009
34010     PassThru = ExtendToType(PassThru, VT, DAG);
34011     Index = ExtendToType(Index, IndexVT, DAG);
34012     Mask = ExtendToType(Mask, MaskVT, DAG, true);
34013   }
34014
34015   // Break dependency on the data register.
34016   if (PassThru.isUndef())
34017     PassThru = getZeroVector(VT, Subtarget, DAG, dl);
34018
34019   SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
34020                     N->getScale() };
34021   SDValue NewGather = DAG.getMemIntrinsicNode(
34022       X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
34023       N->getMemOperand());
34024   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
34025                                 NewGather, DAG.getIntPtrConstant(0, dl));
34026   return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
34027 }
34028
34029 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
34030   SDLoc dl(Op);
34031   SDValue Src = Op.getOperand(0);
34032   MVT DstVT = Op.getSimpleValueType();
34033
34034   AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
34035   unsigned SrcAS = N->getSrcAddressSpace();
34036
34037   assert(SrcAS != N->getDestAddressSpace() &&
34038          "addrspacecast must be between different address spaces");
34039
34040   if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
34041     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
34042   } else if (DstVT == MVT::i64) {
34043     Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
34044   } else if (DstVT == MVT::i32) {
34045     Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
34046   } else {
34047     report_fatal_error("Bad address space in addrspacecast");
34048   }
34049   return Op;
34050 }
34051
34052 SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
34053                                               SelectionDAG &DAG) const {
34054   // TODO: Eventually, the lowering of these nodes should be informed by or
34055   // deferred to the GC strategy for the function in which they appear. For
34056   // now, however, they must be lowered to something. Since they are logically
34057   // no-ops in the case of a null GC strategy (or a GC strategy which does not
34058   // require special handling for these nodes), lower them as literal NOOPs for
34059   // the time being.
34060   SmallVector<SDValue, 2> Ops;
34061   Ops.push_back(Op.getOperand(0));
34062   if (Op->getGluedNode())
34063     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
34064
34065   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
34066   return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
34067 }
34068
34069 // Custom split CVTPS2PH with wide types.
34070 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
34071   SDLoc dl(Op);
34072   EVT VT = Op.getValueType();
34073   SDValue Lo, Hi;
34074   std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
34075   EVT LoVT, HiVT;
34076   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
34077   SDValue RC = Op.getOperand(1);
34078   Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
34079   Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
34080   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34081 }
34082
34083 static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
34084                              SelectionDAG &DAG) {
34085   unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
34086
34087   // We don't support non-data prefetch without PREFETCHI.
34088   // Just preserve the chain.
34089   if (!IsData && !Subtarget.hasPREFETCHI())
34090     return Op.getOperand(0);
34091
34092   return Op;
34093 }
34094
34095 static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
34096                                      unsigned OpNo) {
34097   const APInt Operand(32, OpNo);
34098   std::string OpNoStr = llvm::toString(Operand, 10, false);
34099   std::string Str(" $");
34100
34101   std::string OpNoStr1(Str + OpNoStr);             // e.g. " $1" (OpNo=1)
34102   std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
34103
34104   auto I = StringRef::npos;
34105   for (auto &AsmStr : AsmStrs) {
34106     // Match the OpNo string. We should match exactly to exclude match
34107     // sub-string, e.g. "$12" contain "$1"
34108     if (AsmStr.endswith(OpNoStr1))
34109       I = AsmStr.size() - OpNoStr1.size();
34110
34111     // Get the index of operand in AsmStr.
34112     if (I == StringRef::npos)
34113       I = AsmStr.find(OpNoStr1 + ",");
34114     if (I == StringRef::npos)
34115       I = AsmStr.find(OpNoStr2);
34116
34117     if (I == StringRef::npos)
34118       continue;
34119
34120     assert(I > 0 && "Unexpected inline asm string!");
34121     // Remove the operand string and label (if exsit).
34122     // For example:
34123     // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
34124     // ==>
34125     // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
34126     // ==>
34127     // "call dword ptr "
34128     auto TmpStr = AsmStr.substr(0, I);
34129     I = TmpStr.rfind(':');
34130     if (I != StringRef::npos)
34131       TmpStr = TmpStr.substr(I + 1);
34132     return TmpStr.take_while(llvm::isAlpha);
34133   }
34134
34135   return StringRef();
34136 }
34137
34138 bool X86TargetLowering::isInlineAsmTargetBranch(
34139     const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
34140   // In a __asm block, __asm inst foo where inst is CALL or JMP should be
34141   // changed from indirect TargetLowering::C_Memory to direct
34142   // TargetLowering::C_Address.
34143   // We don't need to special case LOOP* and Jcc, which cannot target a memory
34144   // location.
34145   StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
34146   return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
34147 }
34148
34149 /// Provide custom lowering hooks for some operations.
34150 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
34151   switch (Op.getOpcode()) {
34152   default: llvm_unreachable("Should not custom lower this!");
34153   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
34154   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
34155     return LowerCMP_SWAP(Op, Subtarget, DAG);
34156   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
34157   case ISD::ATOMIC_LOAD_ADD:
34158   case ISD::ATOMIC_LOAD_SUB:
34159   case ISD::ATOMIC_LOAD_OR:
34160   case ISD::ATOMIC_LOAD_XOR:
34161   case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
34162   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG, Subtarget);
34163   case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
34164   case ISD::PARITY:             return LowerPARITY(Op, Subtarget, DAG);
34165   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
34166   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
34167   case ISD::VECTOR_SHUFFLE:     return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
34168   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
34169   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
34170   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
34171   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
34172   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
34173   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
34174   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
34175   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
34176   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
34177   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
34178   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
34179   case ISD::SHL_PARTS:
34180   case ISD::SRA_PARTS:
34181   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
34182   case ISD::FSHL:
34183   case ISD::FSHR:               return LowerFunnelShift(Op, Subtarget, DAG);
34184   case ISD::STRICT_SINT_TO_FP:
34185   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
34186   case ISD::STRICT_UINT_TO_FP:
34187   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
34188   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
34189   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
34190   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
34191   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
34192   case ISD::ZERO_EXTEND_VECTOR_INREG:
34193   case ISD::SIGN_EXTEND_VECTOR_INREG:
34194     return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
34195   case ISD::FP_TO_SINT:
34196   case ISD::STRICT_FP_TO_SINT:
34197   case ISD::FP_TO_UINT:
34198   case ISD::STRICT_FP_TO_UINT:  return LowerFP_TO_INT(Op, DAG);
34199   case ISD::FP_TO_SINT_SAT:
34200   case ISD::FP_TO_UINT_SAT:     return LowerFP_TO_INT_SAT(Op, DAG);
34201   case ISD::FP_EXTEND:
34202   case ISD::STRICT_FP_EXTEND:   return LowerFP_EXTEND(Op, DAG);
34203   case ISD::FP_ROUND:
34204   case ISD::STRICT_FP_ROUND:    return LowerFP_ROUND(Op, DAG);
34205   case ISD::FP16_TO_FP:
34206   case ISD::STRICT_FP16_TO_FP:  return LowerFP16_TO_FP(Op, DAG);
34207   case ISD::FP_TO_FP16:
34208   case ISD::STRICT_FP_TO_FP16:  return LowerFP_TO_FP16(Op, DAG);
34209   case ISD::FP_TO_BF16:         return LowerFP_TO_BF16(Op, DAG);
34210   case ISD::LOAD:               return LowerLoad(Op, Subtarget, DAG);
34211   case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
34212   case ISD::FADD:
34213   case ISD::FSUB:               return lowerFaddFsub(Op, DAG);
34214   case ISD::FROUND:             return LowerFROUND(Op, DAG);
34215   case ISD::FABS:
34216   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
34217   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
34218   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
34219   case ISD::LRINT:
34220   case ISD::LLRINT:             return LowerLRINT_LLRINT(Op, DAG);
34221   case ISD::SETCC:
34222   case ISD::STRICT_FSETCC:
34223   case ISD::STRICT_FSETCCS:     return LowerSETCC(Op, DAG);
34224   case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
34225   case ISD::SELECT:             return LowerSELECT(Op, DAG);
34226   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
34227   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
34228   case ISD::VASTART:            return LowerVASTART(Op, DAG);
34229   case ISD::VAARG:              return LowerVAARG(Op, DAG);
34230   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
34231   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
34232   case ISD::INTRINSIC_VOID:
34233   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
34234   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
34235   case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
34236   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
34237   case ISD::FRAME_TO_ARGS_OFFSET:
34238                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
34239   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
34240   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
34241   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
34242   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
34243   case ISD::EH_SJLJ_SETUP_DISPATCH:
34244     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
34245   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
34246   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
34247   case ISD::GET_ROUNDING:       return LowerGET_ROUNDING(Op, DAG);
34248   case ISD::SET_ROUNDING:       return LowerSET_ROUNDING(Op, DAG);
34249   case ISD::CTLZ:
34250   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
34251   case ISD::CTTZ:
34252   case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, Subtarget, DAG);
34253   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
34254   case ISD::MULHS:
34255   case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
34256   case ISD::ROTL:
34257   case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
34258   case ISD::SRA:
34259   case ISD::SRL:
34260   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
34261   case ISD::SADDO:
34262   case ISD::UADDO:
34263   case ISD::SSUBO:
34264   case ISD::USUBO:              return LowerXALUO(Op, DAG);
34265   case ISD::SMULO:
34266   case ISD::UMULO:              return LowerMULO(Op, Subtarget, DAG);
34267   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
34268   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
34269   case ISD::SADDO_CARRY:
34270   case ISD::SSUBO_CARRY:
34271   case ISD::UADDO_CARRY:
34272   case ISD::USUBO_CARRY:        return LowerADDSUBO_CARRY(Op, DAG);
34273   case ISD::ADD:
34274   case ISD::SUB:                return lowerAddSub(Op, DAG, Subtarget);
34275   case ISD::UADDSAT:
34276   case ISD::SADDSAT:
34277   case ISD::USUBSAT:
34278   case ISD::SSUBSAT:            return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
34279   case ISD::SMAX:
34280   case ISD::SMIN:
34281   case ISD::UMAX:
34282   case ISD::UMIN:               return LowerMINMAX(Op, Subtarget, DAG);
34283   case ISD::FMINIMUM:
34284   case ISD::FMAXIMUM:
34285     return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
34286   case ISD::ABS:                return LowerABS(Op, Subtarget, DAG);
34287   case ISD::ABDS:
34288   case ISD::ABDU:               return LowerABD(Op, Subtarget, DAG);
34289   case ISD::AVGCEILU:           return LowerAVG(Op, Subtarget, DAG);
34290   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
34291   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
34292   case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
34293   case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
34294   case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
34295   case ISD::GC_TRANSITION_START:
34296   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION(Op, DAG);
34297   case ISD::ADDRSPACECAST:      return LowerADDRSPACECAST(Op, DAG);
34298   case X86ISD::CVTPS2PH:        return LowerCVTPS2PH(Op, DAG);
34299   case ISD::PREFETCH:           return LowerPREFETCH(Op, Subtarget, DAG);
34300   }
34301 }
34302
34303 /// Replace a node with an illegal result type with a new node built out of
34304 /// custom code.
34305 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
34306                                            SmallVectorImpl<SDValue>&Results,
34307                                            SelectionDAG &DAG) const {
34308   SDLoc dl(N);
34309   switch (N->getOpcode()) {
34310   default:
34311 #ifndef NDEBUG
34312     dbgs() << "ReplaceNodeResults: ";
34313     N->dump(&DAG);
34314 #endif
34315     llvm_unreachable("Do not know how to custom type legalize this operation!");
34316   case X86ISD::CVTPH2PS: {
34317     EVT VT = N->getValueType(0);
34318     SDValue Lo, Hi;
34319     std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34320     EVT LoVT, HiVT;
34321     std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
34322     Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
34323     Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
34324     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34325     Results.push_back(Res);
34326     return;
34327   }
34328   case X86ISD::STRICT_CVTPH2PS: {
34329     EVT VT = N->getValueType(0);
34330     SDValue Lo, Hi;
34331     std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
34332     EVT LoVT, HiVT;
34333     std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
34334     Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
34335                      {N->getOperand(0), Lo});
34336     Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
34337                      {N->getOperand(0), Hi});
34338     SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34339                                 Lo.getValue(1), Hi.getValue(1));
34340     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34341     Results.push_back(Res);
34342     Results.push_back(Chain);
34343     return;
34344   }
34345   case X86ISD::CVTPS2PH:
34346     Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
34347     return;
34348   case ISD::CTPOP: {
34349     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34350     // Use a v2i64 if possible.
34351     bool NoImplicitFloatOps =
34352         DAG.getMachineFunction().getFunction().hasFnAttribute(
34353             Attribute::NoImplicitFloat);
34354     if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
34355       SDValue Wide =
34356           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
34357       Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
34358       // Bit count should fit in 32-bits, extract it as that and then zero
34359       // extend to i64. Otherwise we end up extracting bits 63:32 separately.
34360       Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
34361       Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
34362                          DAG.getIntPtrConstant(0, dl));
34363       Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
34364       Results.push_back(Wide);
34365     }
34366     return;
34367   }
34368   case ISD::MUL: {
34369     EVT VT = N->getValueType(0);
34370     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34371            VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
34372     // Pre-promote these to vXi16 to avoid op legalization thinking all 16
34373     // elements are needed.
34374     MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
34375     SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
34376     SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
34377     SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
34378     Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34379     unsigned NumConcats = 16 / VT.getVectorNumElements();
34380     SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34381     ConcatOps[0] = Res;
34382     Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
34383     Results.push_back(Res);
34384     return;
34385   }
34386   case ISD::SMULO:
34387   case ISD::UMULO: {
34388     EVT VT = N->getValueType(0);
34389     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34390            VT == MVT::v2i32 && "Unexpected VT!");
34391     bool IsSigned = N->getOpcode() == ISD::SMULO;
34392     unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
34393     SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
34394     SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
34395     SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
34396     // Extract the high 32 bits from each result using PSHUFD.
34397     // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
34398     SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
34399     Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
34400     Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
34401                      DAG.getIntPtrConstant(0, dl));
34402
34403     // Truncate the low bits of the result. This will become PSHUFD.
34404     Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34405
34406     SDValue HiCmp;
34407     if (IsSigned) {
34408       // SMULO overflows if the high bits don't match the sign of the low.
34409       HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
34410     } else {
34411       // UMULO overflows if the high bits are non-zero.
34412       HiCmp = DAG.getConstant(0, dl, VT);
34413     }
34414     SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
34415
34416     // Widen the result by padding with undef.
34417     Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34418                       DAG.getUNDEF(VT));
34419     Results.push_back(Res);
34420     Results.push_back(Ovf);
34421     return;
34422   }
34423   case X86ISD::VPMADDWD: {
34424     // Legalize types for X86ISD::VPMADDWD by widening.
34425     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34426
34427     EVT VT = N->getValueType(0);
34428     EVT InVT = N->getOperand(0).getValueType();
34429     assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
34430            "Expected a VT that divides into 128 bits.");
34431     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34432            "Unexpected type action!");
34433     unsigned NumConcat = 128 / InVT.getSizeInBits();
34434
34435     EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
34436                                     InVT.getVectorElementType(),
34437                                     NumConcat * InVT.getVectorNumElements());
34438     EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
34439                                   VT.getVectorElementType(),
34440                                   NumConcat * VT.getVectorNumElements());
34441
34442     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
34443     Ops[0] = N->getOperand(0);
34444     SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34445     Ops[0] = N->getOperand(1);
34446     SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34447
34448     SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
34449     Results.push_back(Res);
34450     return;
34451   }
34452   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
34453   case X86ISD::FMINC:
34454   case X86ISD::FMIN:
34455   case X86ISD::FMAXC:
34456   case X86ISD::FMAX: {
34457     EVT VT = N->getValueType(0);
34458     assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
34459     SDValue UNDEF = DAG.getUNDEF(VT);
34460     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34461                               N->getOperand(0), UNDEF);
34462     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34463                               N->getOperand(1), UNDEF);
34464     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
34465     return;
34466   }
34467   case ISD::SDIV:
34468   case ISD::UDIV:
34469   case ISD::SREM:
34470   case ISD::UREM: {
34471     EVT VT = N->getValueType(0);
34472     if (VT.isVector()) {
34473       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34474              "Unexpected type action!");
34475       // If this RHS is a constant splat vector we can widen this and let
34476       // division/remainder by constant optimize it.
34477       // TODO: Can we do something for non-splat?
34478       APInt SplatVal;
34479       if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
34480         unsigned NumConcats = 128 / VT.getSizeInBits();
34481         SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
34482         Ops0[0] = N->getOperand(0);
34483         EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
34484         SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
34485         SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
34486         SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
34487         Results.push_back(Res);
34488       }
34489       return;
34490     }
34491
34492     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
34493     Results.push_back(V);
34494     return;
34495   }
34496   case ISD::TRUNCATE: {
34497     MVT VT = N->getSimpleValueType(0);
34498     if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
34499       return;
34500
34501     // The generic legalizer will try to widen the input type to the same
34502     // number of elements as the widened result type. But this isn't always
34503     // the best thing so do some custom legalization to avoid some cases.
34504     MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
34505     SDValue In = N->getOperand(0);
34506     EVT InVT = In.getValueType();
34507
34508     unsigned InBits = InVT.getSizeInBits();
34509     if (128 % InBits == 0) {
34510       // 128 bit and smaller inputs should avoid truncate altogether and
34511       // just use a build_vector that will become a shuffle.
34512       // TODO: Widen and use a shuffle directly?
34513       MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
34514       EVT EltVT = VT.getVectorElementType();
34515       unsigned WidenNumElts = WidenVT.getVectorNumElements();
34516       SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
34517       // Use the original element count so we don't do more scalar opts than
34518       // necessary.
34519       unsigned MinElts = VT.getVectorNumElements();
34520       for (unsigned i=0; i < MinElts; ++i) {
34521         SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
34522                                   DAG.getIntPtrConstant(i, dl));
34523         Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
34524       }
34525       Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
34526       return;
34527     }
34528     // With AVX512 there are some cases that can use a target specific
34529     // truncate node to go from 256/512 to less than 128 with zeros in the
34530     // upper elements of the 128 bit result.
34531     if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34532       // We can use VTRUNC directly for 256 bits with VLX or for any 512 bits.
34533       if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34534         Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34535         return;
34536       }
34537       // There's one case we can widen to 512 bits and use VTRUNC.
34538       if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34539         In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34540                          DAG.getUNDEF(MVT::v4i64));
34541         Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34542         return;
34543       }
34544     }
34545     if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34546         getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34547         isTypeLegal(MVT::v4i64)) {
34548       // Input needs to be split and output needs to be widened. Let's use two
34549       // VTRUNCs, and shuffle their results together into the wider type.
34550       SDValue Lo, Hi;
34551       std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34552
34553       Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34554       Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34555       SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34556                                          { 0,  1,  2,  3, 16, 17, 18, 19,
34557                                           -1, -1, -1, -1, -1, -1, -1, -1 });
34558       Results.push_back(Res);
34559       return;
34560     }
34561
34562     return;
34563   }
34564   case ISD::ANY_EXTEND:
34565     // Right now, only MVT::v8i8 has Custom action for an illegal type.
34566     // It's intended to custom handle the input type.
34567     assert(N->getValueType(0) == MVT::v8i8 &&
34568            "Do not know how to legalize this Node");
34569     return;
34570   case ISD::SIGN_EXTEND:
34571   case ISD::ZERO_EXTEND: {
34572     EVT VT = N->getValueType(0);
34573     SDValue In = N->getOperand(0);
34574     EVT InVT = In.getValueType();
34575     if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34576         (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34577       assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
34578              "Unexpected type action!");
34579       assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
34580       // Custom split this so we can extend i8/i16->i32 invec. This is better
34581       // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34582       // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
34583       // we allow the sra from the extend to i32 to be shared by the split.
34584       In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34585
34586       // Fill a vector with sign bits for each element.
34587       SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34588       SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34589
34590       // Create an unpackl and unpackh to interleave the sign bits then bitcast
34591       // to v2i64.
34592       SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34593                                         {0, 4, 1, 5});
34594       Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34595       SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34596                                         {2, 6, 3, 7});
34597       Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34598
34599       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34600       Results.push_back(Res);
34601       return;
34602     }
34603
34604     if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34605       if (!InVT.is128BitVector()) {
34606         // Not a 128 bit vector, but maybe type legalization will promote
34607         // it to 128 bits.
34608         if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34609           return;
34610         InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34611         if (!InVT.is128BitVector())
34612           return;
34613
34614         // Promote the input to 128 bits. Type legalization will turn this into
34615         // zext_inreg/sext_inreg.
34616         In = DAG.getNode(N->getOpcode(), dl, InVT, In);
34617       }
34618
34619       // Perform custom splitting instead of the two stage extend we would get
34620       // by default.
34621       EVT LoVT, HiVT;
34622       std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34623       assert(isTypeLegal(LoVT) && "Split VT not legal?");
34624
34625       SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
34626
34627       // We need to shift the input over by half the number of elements.
34628       unsigned NumElts = InVT.getVectorNumElements();
34629       unsigned HalfNumElts = NumElts / 2;
34630       SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34631       for (unsigned i = 0; i != HalfNumElts; ++i)
34632         ShufMask[i] = i + HalfNumElts;
34633
34634       SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34635       Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
34636
34637       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34638       Results.push_back(Res);
34639     }
34640     return;
34641   }
34642   case ISD::FP_TO_SINT:
34643   case ISD::STRICT_FP_TO_SINT:
34644   case ISD::FP_TO_UINT:
34645   case ISD::STRICT_FP_TO_UINT: {
34646     bool IsStrict = N->isStrictFPOpcode();
34647     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
34648                     N->getOpcode() == ISD::STRICT_FP_TO_SINT;
34649     EVT VT = N->getValueType(0);
34650     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34651     SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34652     EVT SrcVT = Src.getValueType();
34653
34654     SDValue Res;
34655     if (isSoftFP16(SrcVT)) {
34656       EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34657       if (IsStrict) {
34658         Res =
34659             DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
34660                         {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34661                                             {NVT, MVT::Other}, {Chain, Src})});
34662         Chain = Res.getValue(1);
34663       } else {
34664         Res = DAG.getNode(N->getOpcode(), dl, VT,
34665                           DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34666       }
34667       Results.push_back(Res);
34668       if (IsStrict)
34669         Results.push_back(Chain);
34670
34671       return;
34672     }
34673
34674     if (VT.isVector() && Subtarget.hasFP16() &&
34675         SrcVT.getVectorElementType() == MVT::f16) {
34676       EVT EleVT = VT.getVectorElementType();
34677       EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34678
34679       if (SrcVT != MVT::v8f16) {
34680         SDValue Tmp =
34681             IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34682         SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34683         Ops[0] = Src;
34684         Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34685       }
34686
34687       if (IsStrict) {
34688         unsigned Opc =
34689             IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34690         Res =
34691             DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34692         Chain = Res.getValue(1);
34693       } else {
34694         unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34695         Res = DAG.getNode(Opc, dl, ResVT, Src);
34696       }
34697
34698       // TODO: Need to add exception check code for strict FP.
34699       if (EleVT.getSizeInBits() < 16) {
34700         MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34701         Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34702
34703         // Now widen to 128 bits.
34704         unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34705         MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34706         SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34707         ConcatOps[0] = Res;
34708         Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34709       }
34710
34711       Results.push_back(Res);
34712       if (IsStrict)
34713         Results.push_back(Chain);
34714
34715       return;
34716     }
34717
34718     if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34719       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34720              "Unexpected type action!");
34721
34722       // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34723       unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34724       MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34725                                        VT.getVectorNumElements());
34726       SDValue Res;
34727       SDValue Chain;
34728       if (IsStrict) {
34729         Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34730                           {N->getOperand(0), Src});
34731         Chain = Res.getValue(1);
34732       } else
34733         Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34734
34735       // Preserve what we know about the size of the original result. If the
34736       // result is v2i32, we have to manually widen the assert.
34737       if (PromoteVT == MVT::v2i32)
34738         Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34739                           DAG.getUNDEF(MVT::v2i32));
34740
34741       Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34742                         Res.getValueType(), Res,
34743                         DAG.getValueType(VT.getVectorElementType()));
34744
34745       if (PromoteVT == MVT::v2i32)
34746         Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34747                           DAG.getIntPtrConstant(0, dl));
34748
34749       // Truncate back to the original width.
34750       Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34751
34752       // Now widen to 128 bits.
34753       unsigned NumConcats = 128 / VT.getSizeInBits();
34754       MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34755                                       VT.getVectorNumElements() * NumConcats);
34756       SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34757       ConcatOps[0] = Res;
34758       Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34759       Results.push_back(Res);
34760       if (IsStrict)
34761         Results.push_back(Chain);
34762       return;
34763     }
34764
34765
34766     if (VT == MVT::v2i32) {
34767       assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34768              "Strict unsigned conversion requires AVX512");
34769       assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34770       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34771              "Unexpected type action!");
34772       if (Src.getValueType() == MVT::v2f64) {
34773         if (!IsSigned && !Subtarget.hasAVX512()) {
34774           SDValue Res =
34775               expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34776           Results.push_back(Res);
34777           return;
34778         }
34779
34780         unsigned Opc;
34781         if (IsStrict)
34782           Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34783         else
34784           Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34785
34786         // If we have VLX we can emit a target specific FP_TO_UINT node.
34787         if (!IsSigned && !Subtarget.hasVLX()) {
34788           // Otherwise we can defer to the generic legalizer which will widen
34789           // the input as well. This will be further widened during op
34790           // legalization to v8i32<-v8f64.
34791           // For strict nodes we'll need to widen ourselves.
34792           // FIXME: Fix the type legalizer to safely widen strict nodes?
34793           if (!IsStrict)
34794             return;
34795           Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34796                             DAG.getConstantFP(0.0, dl, MVT::v2f64));
34797           Opc = N->getOpcode();
34798         }
34799         SDValue Res;
34800         SDValue Chain;
34801         if (IsStrict) {
34802           Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34803                             {N->getOperand(0), Src});
34804           Chain = Res.getValue(1);
34805         } else {
34806           Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34807         }
34808         Results.push_back(Res);
34809         if (IsStrict)
34810           Results.push_back(Chain);
34811         return;
34812       }
34813
34814       // Custom widen strict v2f32->v2i32 by padding with zeros.
34815       // FIXME: Should generic type legalizer do this?
34816       if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34817         Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34818                           DAG.getConstantFP(0.0, dl, MVT::v2f32));
34819         SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
34820                                   {N->getOperand(0), Src});
34821         Results.push_back(Res);
34822         Results.push_back(Res.getValue(1));
34823         return;
34824       }
34825
34826       // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34827       // so early out here.
34828       return;
34829     }
34830
34831     assert(!VT.isVector() && "Vectors should have been handled above!");
34832
34833     if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34834          (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34835         (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34836       assert(!Subtarget.is64Bit() && "i64 should be legal");
34837       unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34838       // If we use a 128-bit result we might need to use a target specific node.
34839       unsigned SrcElts =
34840           std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34841       MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34842       MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34843       unsigned Opc = N->getOpcode();
34844       if (NumElts != SrcElts) {
34845         if (IsStrict)
34846           Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34847         else
34848           Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34849       }
34850
34851       SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
34852       SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34853                                 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34854                                 ZeroIdx);
34855       SDValue Chain;
34856       if (IsStrict) {
34857         SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34858         Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34859         Chain = Res.getValue(1);
34860       } else
34861         Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34862       Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34863       Results.push_back(Res);
34864       if (IsStrict)
34865         Results.push_back(Chain);
34866       return;
34867     }
34868
34869     if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34870       SDValue Chain;
34871       SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34872       Results.push_back(V);
34873       if (IsStrict)
34874         Results.push_back(Chain);
34875       return;
34876     }
34877
34878     if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34879       Results.push_back(V);
34880       if (IsStrict)
34881         Results.push_back(Chain);
34882     }
34883     return;
34884   }
34885   case ISD::LRINT:
34886   case ISD::LLRINT: {
34887     if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34888       Results.push_back(V);
34889     return;
34890   }
34891
34892   case ISD::SINT_TO_FP:
34893   case ISD::STRICT_SINT_TO_FP:
34894   case ISD::UINT_TO_FP:
34895   case ISD::STRICT_UINT_TO_FP: {
34896     bool IsStrict = N->isStrictFPOpcode();
34897     bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
34898                     N->getOpcode() == ISD::STRICT_SINT_TO_FP;
34899     EVT VT = N->getValueType(0);
34900     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34901     if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34902         Subtarget.hasVLX()) {
34903       if (Src.getValueType().getVectorElementType() == MVT::i16)
34904         return;
34905
34906       if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34907         Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34908                           IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34909                                    : DAG.getUNDEF(MVT::v2i32));
34910       if (IsStrict) {
34911         unsigned Opc =
34912             IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34913         SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34914                                   {N->getOperand(0), Src});
34915         Results.push_back(Res);
34916         Results.push_back(Res.getValue(1));
34917       } else {
34918         unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34919         Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34920       }
34921       return;
34922     }
34923     if (VT != MVT::v2f32)
34924       return;
34925     EVT SrcVT = Src.getValueType();
34926     if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34927       if (IsStrict) {
34928         unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34929                                 : X86ISD::STRICT_CVTUI2P;
34930         SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34931                                   {N->getOperand(0), Src});
34932         Results.push_back(Res);
34933         Results.push_back(Res.getValue(1));
34934       } else {
34935         unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34936         Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34937       }
34938       return;
34939     }
34940     if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34941         Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
34942       SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34943       SDValue One  = DAG.getConstant(1, dl, SrcVT);
34944       SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34945                                  DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34946                                  DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34947       SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34948       SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34949       SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34950       for (int i = 0; i != 2; ++i) {
34951         SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34952                                   SignSrc, DAG.getIntPtrConstant(i, dl));
34953         if (IsStrict)
34954           SignCvts[i] =
34955               DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34956                           {N->getOperand(0), Elt});
34957         else
34958           SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34959       };
34960       SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34961       SDValue Slow, Chain;
34962       if (IsStrict) {
34963         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34964                             SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34965         Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34966                            {Chain, SignCvt, SignCvt});
34967         Chain = Slow.getValue(1);
34968       } else {
34969         Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34970       }
34971       IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34972       IsNeg =
34973           DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34974       SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34975       Results.push_back(Cvt);
34976       if (IsStrict)
34977         Results.push_back(Chain);
34978       return;
34979     }
34980
34981     if (SrcVT != MVT::v2i32)
34982       return;
34983
34984     if (IsSigned || Subtarget.hasAVX512()) {
34985       if (!IsStrict)
34986         return;
34987
34988       // Custom widen strict v2i32->v2f32 to avoid scalarization.
34989       // FIXME: Should generic type legalizer do this?
34990       Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34991                         DAG.getConstant(0, dl, MVT::v2i32));
34992       SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
34993                                 {N->getOperand(0), Src});
34994       Results.push_back(Res);
34995       Results.push_back(Res.getValue(1));
34996       return;
34997     }
34998
34999     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
35000     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
35001     SDValue VBias = DAG.getConstantFP(
35002         llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
35003     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
35004                              DAG.getBitcast(MVT::v2i64, VBias));
35005     Or = DAG.getBitcast(MVT::v2f64, Or);
35006     if (IsStrict) {
35007       SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
35008                                 {N->getOperand(0), Or, VBias});
35009       SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
35010                                 {MVT::v4f32, MVT::Other},
35011                                 {Sub.getValue(1), Sub});
35012       Results.push_back(Res);
35013       Results.push_back(Res.getValue(1));
35014     } else {
35015       // TODO: Are there any fast-math-flags to propagate here?
35016       SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
35017       Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
35018     }
35019     return;
35020   }
35021   case ISD::STRICT_FP_ROUND:
35022   case ISD::FP_ROUND: {
35023     bool IsStrict = N->isStrictFPOpcode();
35024     SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
35025     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
35026     SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
35027     EVT SrcVT = Src.getValueType();
35028     EVT VT = N->getValueType(0);
35029     SDValue V;
35030     if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
35031       SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
35032                              : DAG.getUNDEF(MVT::v2f32);
35033       Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
35034     }
35035     if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
35036       assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
35037       if (SrcVT.getVectorElementType() != MVT::f32)
35038         return;
35039
35040       if (IsStrict)
35041         V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
35042                         {Chain, Src, Rnd});
35043       else
35044         V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
35045
35046       Results.push_back(DAG.getBitcast(MVT::v8f16, V));
35047       if (IsStrict)
35048         Results.push_back(V.getValue(1));
35049       return;
35050     }
35051     if (!isTypeLegal(Src.getValueType()))
35052       return;
35053     EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
35054     if (IsStrict)
35055       V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
35056                       {Chain, Src});
35057     else
35058       V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
35059     Results.push_back(V);
35060     if (IsStrict)
35061       Results.push_back(V.getValue(1));
35062     return;
35063   }
35064   case ISD::FP_EXTEND:
35065   case ISD::STRICT_FP_EXTEND: {
35066     // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
35067     // No other ValueType for FP_EXTEND should reach this point.
35068     assert(N->getValueType(0) == MVT::v2f32 &&
35069            "Do not know how to legalize this Node");
35070     if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
35071       return;
35072     bool IsStrict = N->isStrictFPOpcode();
35073     SDValue Src = N->getOperand(IsStrict ? 1 : 0);
35074     SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
35075                            : DAG.getUNDEF(MVT::v2f16);
35076     SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
35077     if (IsStrict)
35078       V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
35079                       {N->getOperand(0), V});
35080     else
35081       V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
35082     Results.push_back(V);
35083     if (IsStrict)
35084       Results.push_back(V.getValue(1));
35085     return;
35086   }
35087   case ISD::INTRINSIC_W_CHAIN: {
35088     unsigned IntNo = N->getConstantOperandVal(1);
35089     switch (IntNo) {
35090     default : llvm_unreachable("Do not know how to custom type "
35091                                "legalize this intrinsic operation!");
35092     case Intrinsic::x86_rdtsc:
35093       return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
35094                                      Results);
35095     case Intrinsic::x86_rdtscp:
35096       return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
35097                                      Results);
35098     case Intrinsic::x86_rdpmc:
35099       expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
35100                                   Results);
35101       return;
35102     case Intrinsic::x86_rdpru:
35103       expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
35104         Results);
35105       return;
35106     case Intrinsic::x86_xgetbv:
35107       expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
35108                                   Results);
35109       return;
35110     }
35111   }
35112   case ISD::READCYCLECOUNTER: {
35113     return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
35114   }
35115   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
35116     EVT T = N->getValueType(0);
35117     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
35118     bool Regs64bit = T == MVT::i128;
35119     assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
35120            "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
35121     MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
35122     SDValue cpInL, cpInH;
35123     std::tie(cpInL, cpInH) =
35124         DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
35125     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
35126                              Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
35127     cpInH =
35128         DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
35129                          cpInH, cpInL.getValue(1));
35130     SDValue swapInL, swapInH;
35131     std::tie(swapInL, swapInH) =
35132         DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
35133     swapInH =
35134         DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
35135                          swapInH, cpInH.getValue(1));
35136
35137     // In 64-bit mode we might need the base pointer in RBX, but we can't know
35138     // until later. So we keep the RBX input in a vreg and use a custom
35139     // inserter.
35140     // Since RBX will be a reserved register the register allocator will not
35141     // make sure its value will be properly saved and restored around this
35142     // live-range.
35143     SDValue Result;
35144     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
35145     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
35146     if (Regs64bit) {
35147       SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
35148                        swapInH.getValue(1)};
35149       Result =
35150           DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
35151     } else {
35152       swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
35153                                  swapInH.getValue(1));
35154       SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
35155                        swapInL.getValue(1)};
35156       Result =
35157           DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
35158     }
35159
35160     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
35161                                         Regs64bit ? X86::RAX : X86::EAX,
35162                                         HalfT, Result.getValue(1));
35163     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
35164                                         Regs64bit ? X86::RDX : X86::EDX,
35165                                         HalfT, cpOutL.getValue(2));
35166     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
35167
35168     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
35169                                         MVT::i32, cpOutH.getValue(2));
35170     SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
35171     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
35172
35173     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
35174     Results.push_back(Success);
35175     Results.push_back(EFLAGS.getValue(1));
35176     return;
35177   }
35178   case ISD::ATOMIC_LOAD: {
35179     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
35180     bool NoImplicitFloatOps =
35181         DAG.getMachineFunction().getFunction().hasFnAttribute(
35182             Attribute::NoImplicitFloat);
35183     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
35184       auto *Node = cast<AtomicSDNode>(N);
35185       if (Subtarget.hasSSE1()) {
35186         // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
35187         // Then extract the lower 64-bits.
35188         MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
35189         SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
35190         SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
35191         SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
35192                                              MVT::i64, Node->getMemOperand());
35193         if (Subtarget.hasSSE2()) {
35194           SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
35195                                     DAG.getIntPtrConstant(0, dl));
35196           Results.push_back(Res);
35197           Results.push_back(Ld.getValue(1));
35198           return;
35199         }
35200         // We use an alternative sequence for SSE1 that extracts as v2f32 and
35201         // then casts to i64. This avoids a 128-bit stack temporary being
35202         // created by type legalization if we were to cast v4f32->v2i64.
35203         SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
35204                                   DAG.getIntPtrConstant(0, dl));
35205         Res = DAG.getBitcast(MVT::i64, Res);
35206         Results.push_back(Res);
35207         Results.push_back(Ld.getValue(1));
35208         return;
35209       }
35210       if (Subtarget.hasX87()) {
35211         // First load this into an 80-bit X87 register. This will put the whole
35212         // integer into the significand.
35213         SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
35214         SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
35215         SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
35216                                                  dl, Tys, Ops, MVT::i64,
35217                                                  Node->getMemOperand());
35218         SDValue Chain = Result.getValue(1);
35219
35220         // Now store the X87 register to a stack temporary and convert to i64.
35221         // This store is not atomic and doesn't need to be.
35222         // FIXME: We don't need a stack temporary if the result of the load
35223         // is already being stored. We could just directly store there.
35224         SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
35225         int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
35226         MachinePointerInfo MPI =
35227             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
35228         SDValue StoreOps[] = { Chain, Result, StackPtr };
35229         Chain = DAG.getMemIntrinsicNode(
35230             X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
35231             MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
35232
35233         // Finally load the value back from the stack temporary and return it.
35234         // This load is not atomic and doesn't need to be.
35235         // This load will be further type legalized.
35236         Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
35237         Results.push_back(Result);
35238         Results.push_back(Result.getValue(1));
35239         return;
35240       }
35241     }
35242     // TODO: Use MOVLPS when SSE1 is available?
35243     // Delegate to generic TypeLegalization. Situations we can really handle
35244     // should have already been dealt with by AtomicExpandPass.cpp.
35245     break;
35246   }
35247   case ISD::ATOMIC_SWAP:
35248   case ISD::ATOMIC_LOAD_ADD:
35249   case ISD::ATOMIC_LOAD_SUB:
35250   case ISD::ATOMIC_LOAD_AND:
35251   case ISD::ATOMIC_LOAD_OR:
35252   case ISD::ATOMIC_LOAD_XOR:
35253   case ISD::ATOMIC_LOAD_NAND:
35254   case ISD::ATOMIC_LOAD_MIN:
35255   case ISD::ATOMIC_LOAD_MAX:
35256   case ISD::ATOMIC_LOAD_UMIN:
35257   case ISD::ATOMIC_LOAD_UMAX:
35258     // Delegate to generic TypeLegalization. Situations we can really handle
35259     // should have already been dealt with by AtomicExpandPass.cpp.
35260     break;
35261
35262   case ISD::BITCAST: {
35263     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
35264     EVT DstVT = N->getValueType(0);
35265     EVT SrcVT = N->getOperand(0).getValueType();
35266
35267     // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
35268     // we can split using the k-register rather than memory.
35269     if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
35270       assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
35271       SDValue Lo, Hi;
35272       std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
35273       Lo = DAG.getBitcast(MVT::i32, Lo);
35274       Hi = DAG.getBitcast(MVT::i32, Hi);
35275       SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
35276       Results.push_back(Res);
35277       return;
35278     }
35279
35280     if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
35281       // FIXME: Use v4f32 for SSE1?
35282       assert(Subtarget.hasSSE2() && "Requires SSE2");
35283       assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
35284              "Unexpected type action!");
35285       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
35286       SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
35287                                 N->getOperand(0));
35288       Res = DAG.getBitcast(WideVT, Res);
35289       Results.push_back(Res);
35290       return;
35291     }
35292
35293     return;
35294   }
35295   case ISD::MGATHER: {
35296     EVT VT = N->getValueType(0);
35297     if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
35298         (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
35299       auto *Gather = cast<MaskedGatherSDNode>(N);
35300       SDValue Index = Gather->getIndex();
35301       if (Index.getValueType() != MVT::v2i64)
35302         return;
35303       assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
35304              "Unexpected type action!");
35305       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
35306       SDValue Mask = Gather->getMask();
35307       assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
35308       SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
35309                                      Gather->getPassThru(),
35310                                      DAG.getUNDEF(VT));
35311       if (!Subtarget.hasVLX()) {
35312         // We need to widen the mask, but the instruction will only use 2
35313         // of its elements. So we can use undef.
35314         Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
35315                            DAG.getUNDEF(MVT::v2i1));
35316         Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
35317       }
35318       SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
35319                         Gather->getBasePtr(), Index, Gather->getScale() };
35320       SDValue Res = DAG.getMemIntrinsicNode(
35321           X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
35322           Gather->getMemoryVT(), Gather->getMemOperand());
35323       Results.push_back(Res);
35324       Results.push_back(Res.getValue(1));
35325       return;
35326     }
35327     return;
35328   }
35329   case ISD::LOAD: {
35330     // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
35331     // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
35332     // cast since type legalization will try to use an i64 load.
35333     MVT VT = N->getSimpleValueType(0);
35334     assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
35335     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
35336            "Unexpected type action!");
35337     if (!ISD::isNON_EXTLoad(N))
35338       return;
35339     auto *Ld = cast<LoadSDNode>(N);
35340     if (Subtarget.hasSSE2()) {
35341       MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
35342       SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
35343                                 Ld->getPointerInfo(), Ld->getOriginalAlign(),
35344                                 Ld->getMemOperand()->getFlags());
35345       SDValue Chain = Res.getValue(1);
35346       MVT VecVT = MVT::getVectorVT(LdVT, 2);
35347       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
35348       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
35349       Res = DAG.getBitcast(WideVT, Res);
35350       Results.push_back(Res);
35351       Results.push_back(Chain);
35352       return;
35353     }
35354     assert(Subtarget.hasSSE1() && "Expected SSE");
35355     SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
35356     SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
35357     SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
35358                                           MVT::i64, Ld->getMemOperand());
35359     Results.push_back(Res);
35360     Results.push_back(Res.getValue(1));
35361     return;
35362   }
35363   case ISD::ADDRSPACECAST: {
35364     SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
35365     Results.push_back(V);
35366     return;
35367   }
35368   case ISD::BITREVERSE: {
35369     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
35370     assert(Subtarget.hasXOP() && "Expected XOP");
35371     // We can use VPPERM by copying to a vector register and back. We'll need
35372     // to move the scalar in two i32 pieces.
35373     Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
35374     return;
35375   }
35376   case ISD::EXTRACT_VECTOR_ELT: {
35377     // f16 = extract vXf16 %vec, i64 %idx
35378     assert(N->getSimpleValueType(0) == MVT::f16 &&
35379            "Unexpected Value type of EXTRACT_VECTOR_ELT!");
35380     assert(Subtarget.hasFP16() && "Expected FP16");
35381     SDValue VecOp = N->getOperand(0);
35382     EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
35383     SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
35384     Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
35385                         N->getOperand(1));
35386     Split = DAG.getBitcast(MVT::f16, Split);
35387     Results.push_back(Split);
35388     return;
35389   }
35390   }
35391 }
35392
35393 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
35394   switch ((X86ISD::NodeType)Opcode) {
35395   case X86ISD::FIRST_NUMBER:       break;
35396 #define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
35397   NODE_NAME_CASE(BSF)
35398   NODE_NAME_CASE(BSR)
35399   NODE_NAME_CASE(FSHL)
35400   NODE_NAME_CASE(FSHR)
35401   NODE_NAME_CASE(FAND)
35402   NODE_NAME_CASE(FANDN)
35403   NODE_NAME_CASE(FOR)
35404   NODE_NAME_CASE(FXOR)
35405   NODE_NAME_CASE(FILD)
35406   NODE_NAME_CASE(FIST)
35407   NODE_NAME_CASE(FP_TO_INT_IN_MEM)
35408   NODE_NAME_CASE(FLD)
35409   NODE_NAME_CASE(FST)
35410   NODE_NAME_CASE(CALL)
35411   NODE_NAME_CASE(CALL_RVMARKER)
35412   NODE_NAME_CASE(BT)
35413   NODE_NAME_CASE(CMP)
35414   NODE_NAME_CASE(FCMP)
35415   NODE_NAME_CASE(STRICT_FCMP)
35416   NODE_NAME_CASE(STRICT_FCMPS)
35417   NODE_NAME_CASE(COMI)
35418   NODE_NAME_CASE(UCOMI)
35419   NODE_NAME_CASE(CMPM)
35420   NODE_NAME_CASE(CMPMM)
35421   NODE_NAME_CASE(STRICT_CMPM)
35422   NODE_NAME_CASE(CMPMM_SAE)
35423   NODE_NAME_CASE(SETCC)
35424   NODE_NAME_CASE(SETCC_CARRY)
35425   NODE_NAME_CASE(FSETCC)
35426   NODE_NAME_CASE(FSETCCM)
35427   NODE_NAME_CASE(FSETCCM_SAE)
35428   NODE_NAME_CASE(CMOV)
35429   NODE_NAME_CASE(BRCOND)
35430   NODE_NAME_CASE(RET_GLUE)
35431   NODE_NAME_CASE(IRET)
35432   NODE_NAME_CASE(REP_STOS)
35433   NODE_NAME_CASE(REP_MOVS)
35434   NODE_NAME_CASE(GlobalBaseReg)
35435   NODE_NAME_CASE(Wrapper)
35436   NODE_NAME_CASE(WrapperRIP)
35437   NODE_NAME_CASE(MOVQ2DQ)
35438   NODE_NAME_CASE(MOVDQ2Q)
35439   NODE_NAME_CASE(MMX_MOVD2W)
35440   NODE_NAME_CASE(MMX_MOVW2D)
35441   NODE_NAME_CASE(PEXTRB)
35442   NODE_NAME_CASE(PEXTRW)
35443   NODE_NAME_CASE(INSERTPS)
35444   NODE_NAME_CASE(PINSRB)
35445   NODE_NAME_CASE(PINSRW)
35446   NODE_NAME_CASE(PSHUFB)
35447   NODE_NAME_CASE(ANDNP)
35448   NODE_NAME_CASE(BLENDI)
35449   NODE_NAME_CASE(BLENDV)
35450   NODE_NAME_CASE(HADD)
35451   NODE_NAME_CASE(HSUB)
35452   NODE_NAME_CASE(FHADD)
35453   NODE_NAME_CASE(FHSUB)
35454   NODE_NAME_CASE(CONFLICT)
35455   NODE_NAME_CASE(FMAX)
35456   NODE_NAME_CASE(FMAXS)
35457   NODE_NAME_CASE(FMAX_SAE)
35458   NODE_NAME_CASE(FMAXS_SAE)
35459   NODE_NAME_CASE(FMIN)
35460   NODE_NAME_CASE(FMINS)
35461   NODE_NAME_CASE(FMIN_SAE)
35462   NODE_NAME_CASE(FMINS_SAE)
35463   NODE_NAME_CASE(FMAXC)
35464   NODE_NAME_CASE(FMINC)
35465   NODE_NAME_CASE(FRSQRT)
35466   NODE_NAME_CASE(FRCP)
35467   NODE_NAME_CASE(EXTRQI)
35468   NODE_NAME_CASE(INSERTQI)
35469   NODE_NAME_CASE(TLSADDR)
35470   NODE_NAME_CASE(TLSBASEADDR)
35471   NODE_NAME_CASE(TLSCALL)
35472   NODE_NAME_CASE(EH_SJLJ_SETJMP)
35473   NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35474   NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35475   NODE_NAME_CASE(EH_RETURN)
35476   NODE_NAME_CASE(TC_RETURN)
35477   NODE_NAME_CASE(FNSTCW16m)
35478   NODE_NAME_CASE(FLDCW16m)
35479   NODE_NAME_CASE(LCMPXCHG_DAG)
35480   NODE_NAME_CASE(LCMPXCHG8_DAG)
35481   NODE_NAME_CASE(LCMPXCHG16_DAG)
35482   NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35483   NODE_NAME_CASE(LADD)
35484   NODE_NAME_CASE(LSUB)
35485   NODE_NAME_CASE(LOR)
35486   NODE_NAME_CASE(LXOR)
35487   NODE_NAME_CASE(LAND)
35488   NODE_NAME_CASE(LBTS)
35489   NODE_NAME_CASE(LBTC)
35490   NODE_NAME_CASE(LBTR)
35491   NODE_NAME_CASE(LBTS_RM)
35492   NODE_NAME_CASE(LBTC_RM)
35493   NODE_NAME_CASE(LBTR_RM)
35494   NODE_NAME_CASE(AADD)
35495   NODE_NAME_CASE(AOR)
35496   NODE_NAME_CASE(AXOR)
35497   NODE_NAME_CASE(AAND)
35498   NODE_NAME_CASE(VZEXT_MOVL)
35499   NODE_NAME_CASE(VZEXT_LOAD)
35500   NODE_NAME_CASE(VEXTRACT_STORE)
35501   NODE_NAME_CASE(VTRUNC)
35502   NODE_NAME_CASE(VTRUNCS)
35503   NODE_NAME_CASE(VTRUNCUS)
35504   NODE_NAME_CASE(VMTRUNC)
35505   NODE_NAME_CASE(VMTRUNCS)
35506   NODE_NAME_CASE(VMTRUNCUS)
35507   NODE_NAME_CASE(VTRUNCSTORES)
35508   NODE_NAME_CASE(VTRUNCSTOREUS)
35509   NODE_NAME_CASE(VMTRUNCSTORES)
35510   NODE_NAME_CASE(VMTRUNCSTOREUS)
35511   NODE_NAME_CASE(VFPEXT)
35512   NODE_NAME_CASE(STRICT_VFPEXT)
35513   NODE_NAME_CASE(VFPEXT_SAE)
35514   NODE_NAME_CASE(VFPEXTS)
35515   NODE_NAME_CASE(VFPEXTS_SAE)
35516   NODE_NAME_CASE(VFPROUND)
35517   NODE_NAME_CASE(STRICT_VFPROUND)
35518   NODE_NAME_CASE(VMFPROUND)
35519   NODE_NAME_CASE(VFPROUND_RND)
35520   NODE_NAME_CASE(VFPROUNDS)
35521   NODE_NAME_CASE(VFPROUNDS_RND)
35522   NODE_NAME_CASE(VSHLDQ)
35523   NODE_NAME_CASE(VSRLDQ)
35524   NODE_NAME_CASE(VSHL)
35525   NODE_NAME_CASE(VSRL)
35526   NODE_NAME_CASE(VSRA)
35527   NODE_NAME_CASE(VSHLI)
35528   NODE_NAME_CASE(VSRLI)
35529   NODE_NAME_CASE(VSRAI)
35530   NODE_NAME_CASE(VSHLV)
35531   NODE_NAME_CASE(VSRLV)
35532   NODE_NAME_CASE(VSRAV)
35533   NODE_NAME_CASE(VROTLI)
35534   NODE_NAME_CASE(VROTRI)
35535   NODE_NAME_CASE(VPPERM)
35536   NODE_NAME_CASE(CMPP)
35537   NODE_NAME_CASE(STRICT_CMPP)
35538   NODE_NAME_CASE(PCMPEQ)
35539   NODE_NAME_CASE(PCMPGT)
35540   NODE_NAME_CASE(PHMINPOS)
35541   NODE_NAME_CASE(ADD)
35542   NODE_NAME_CASE(SUB)
35543   NODE_NAME_CASE(ADC)
35544   NODE_NAME_CASE(SBB)
35545   NODE_NAME_CASE(SMUL)
35546   NODE_NAME_CASE(UMUL)
35547   NODE_NAME_CASE(OR)
35548   NODE_NAME_CASE(XOR)
35549   NODE_NAME_CASE(AND)
35550   NODE_NAME_CASE(BEXTR)
35551   NODE_NAME_CASE(BEXTRI)
35552   NODE_NAME_CASE(BZHI)
35553   NODE_NAME_CASE(PDEP)
35554   NODE_NAME_CASE(PEXT)
35555   NODE_NAME_CASE(MUL_IMM)
35556   NODE_NAME_CASE(MOVMSK)
35557   NODE_NAME_CASE(PTEST)
35558   NODE_NAME_CASE(TESTP)
35559   NODE_NAME_CASE(KORTEST)
35560   NODE_NAME_CASE(KTEST)
35561   NODE_NAME_CASE(KADD)
35562   NODE_NAME_CASE(KSHIFTL)
35563   NODE_NAME_CASE(KSHIFTR)
35564   NODE_NAME_CASE(PACKSS)
35565   NODE_NAME_CASE(PACKUS)
35566   NODE_NAME_CASE(PALIGNR)
35567   NODE_NAME_CASE(VALIGN)
35568   NODE_NAME_CASE(VSHLD)
35569   NODE_NAME_CASE(VSHRD)
35570   NODE_NAME_CASE(VSHLDV)
35571   NODE_NAME_CASE(VSHRDV)
35572   NODE_NAME_CASE(PSHUFD)
35573   NODE_NAME_CASE(PSHUFHW)
35574   NODE_NAME_CASE(PSHUFLW)
35575   NODE_NAME_CASE(SHUFP)
35576   NODE_NAME_CASE(SHUF128)
35577   NODE_NAME_CASE(MOVLHPS)
35578   NODE_NAME_CASE(MOVHLPS)
35579   NODE_NAME_CASE(MOVDDUP)
35580   NODE_NAME_CASE(MOVSHDUP)
35581   NODE_NAME_CASE(MOVSLDUP)
35582   NODE_NAME_CASE(MOVSD)
35583   NODE_NAME_CASE(MOVSS)
35584   NODE_NAME_CASE(MOVSH)
35585   NODE_NAME_CASE(UNPCKL)
35586   NODE_NAME_CASE(UNPCKH)
35587   NODE_NAME_CASE(VBROADCAST)
35588   NODE_NAME_CASE(VBROADCAST_LOAD)
35589   NODE_NAME_CASE(VBROADCASTM)
35590   NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35591   NODE_NAME_CASE(VPERMILPV)
35592   NODE_NAME_CASE(VPERMILPI)
35593   NODE_NAME_CASE(VPERM2X128)
35594   NODE_NAME_CASE(VPERMV)
35595   NODE_NAME_CASE(VPERMV3)
35596   NODE_NAME_CASE(VPERMI)
35597   NODE_NAME_CASE(VPTERNLOG)
35598   NODE_NAME_CASE(VFIXUPIMM)
35599   NODE_NAME_CASE(VFIXUPIMM_SAE)
35600   NODE_NAME_CASE(VFIXUPIMMS)
35601   NODE_NAME_CASE(VFIXUPIMMS_SAE)
35602   NODE_NAME_CASE(VRANGE)
35603   NODE_NAME_CASE(VRANGE_SAE)
35604   NODE_NAME_CASE(VRANGES)
35605   NODE_NAME_CASE(VRANGES_SAE)
35606   NODE_NAME_CASE(PMULUDQ)
35607   NODE_NAME_CASE(PMULDQ)
35608   NODE_NAME_CASE(PSADBW)
35609   NODE_NAME_CASE(DBPSADBW)
35610   NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35611   NODE_NAME_CASE(VAARG_64)
35612   NODE_NAME_CASE(VAARG_X32)
35613   NODE_NAME_CASE(DYN_ALLOCA)
35614   NODE_NAME_CASE(MFENCE)
35615   NODE_NAME_CASE(SEG_ALLOCA)
35616   NODE_NAME_CASE(PROBED_ALLOCA)
35617   NODE_NAME_CASE(RDRAND)
35618   NODE_NAME_CASE(RDSEED)
35619   NODE_NAME_CASE(RDPKRU)
35620   NODE_NAME_CASE(WRPKRU)
35621   NODE_NAME_CASE(VPMADDUBSW)
35622   NODE_NAME_CASE(VPMADDWD)
35623   NODE_NAME_CASE(VPSHA)
35624   NODE_NAME_CASE(VPSHL)
35625   NODE_NAME_CASE(VPCOM)
35626   NODE_NAME_CASE(VPCOMU)
35627   NODE_NAME_CASE(VPERMIL2)
35628   NODE_NAME_CASE(FMSUB)
35629   NODE_NAME_CASE(STRICT_FMSUB)
35630   NODE_NAME_CASE(FNMADD)
35631   NODE_NAME_CASE(STRICT_FNMADD)
35632   NODE_NAME_CASE(FNMSUB)
35633   NODE_NAME_CASE(STRICT_FNMSUB)
35634   NODE_NAME_CASE(FMADDSUB)
35635   NODE_NAME_CASE(FMSUBADD)
35636   NODE_NAME_CASE(FMADD_RND)
35637   NODE_NAME_CASE(FNMADD_RND)
35638   NODE_NAME_CASE(FMSUB_RND)
35639   NODE_NAME_CASE(FNMSUB_RND)
35640   NODE_NAME_CASE(FMADDSUB_RND)
35641   NODE_NAME_CASE(FMSUBADD_RND)
35642   NODE_NAME_CASE(VFMADDC)
35643   NODE_NAME_CASE(VFMADDC_RND)
35644   NODE_NAME_CASE(VFCMADDC)
35645   NODE_NAME_CASE(VFCMADDC_RND)
35646   NODE_NAME_CASE(VFMULC)
35647   NODE_NAME_CASE(VFMULC_RND)
35648   NODE_NAME_CASE(VFCMULC)
35649   NODE_NAME_CASE(VFCMULC_RND)
35650   NODE_NAME_CASE(VFMULCSH)
35651   NODE_NAME_CASE(VFMULCSH_RND)
35652   NODE_NAME_CASE(VFCMULCSH)
35653   NODE_NAME_CASE(VFCMULCSH_RND)
35654   NODE_NAME_CASE(VFMADDCSH)
35655   NODE_NAME_CASE(VFMADDCSH_RND)
35656   NODE_NAME_CASE(VFCMADDCSH)
35657   NODE_NAME_CASE(VFCMADDCSH_RND)
35658   NODE_NAME_CASE(VPMADD52H)
35659   NODE_NAME_CASE(VPMADD52L)
35660   NODE_NAME_CASE(VRNDSCALE)
35661   NODE_NAME_CASE(STRICT_VRNDSCALE)
35662   NODE_NAME_CASE(VRNDSCALE_SAE)
35663   NODE_NAME_CASE(VRNDSCALES)
35664   NODE_NAME_CASE(VRNDSCALES_SAE)
35665   NODE_NAME_CASE(VREDUCE)
35666   NODE_NAME_CASE(VREDUCE_SAE)
35667   NODE_NAME_CASE(VREDUCES)
35668   NODE_NAME_CASE(VREDUCES_SAE)
35669   NODE_NAME_CASE(VGETMANT)
35670   NODE_NAME_CASE(VGETMANT_SAE)
35671   NODE_NAME_CASE(VGETMANTS)
35672   NODE_NAME_CASE(VGETMANTS_SAE)
35673   NODE_NAME_CASE(PCMPESTR)
35674   NODE_NAME_CASE(PCMPISTR)
35675   NODE_NAME_CASE(XTEST)
35676   NODE_NAME_CASE(COMPRESS)
35677   NODE_NAME_CASE(EXPAND)
35678   NODE_NAME_CASE(SELECTS)
35679   NODE_NAME_CASE(ADDSUB)
35680   NODE_NAME_CASE(RCP14)
35681   NODE_NAME_CASE(RCP14S)
35682   NODE_NAME_CASE(RCP28)
35683   NODE_NAME_CASE(RCP28_SAE)
35684   NODE_NAME_CASE(RCP28S)
35685   NODE_NAME_CASE(RCP28S_SAE)
35686   NODE_NAME_CASE(EXP2)
35687   NODE_NAME_CASE(EXP2_SAE)
35688   NODE_NAME_CASE(RSQRT14)
35689   NODE_NAME_CASE(RSQRT14S)
35690   NODE_NAME_CASE(RSQRT28)
35691   NODE_NAME_CASE(RSQRT28_SAE)
35692   NODE_NAME_CASE(RSQRT28S)
35693   NODE_NAME_CASE(RSQRT28S_SAE)
35694   NODE_NAME_CASE(FADD_RND)
35695   NODE_NAME_CASE(FADDS)
35696   NODE_NAME_CASE(FADDS_RND)
35697   NODE_NAME_CASE(FSUB_RND)
35698   NODE_NAME_CASE(FSUBS)
35699   NODE_NAME_CASE(FSUBS_RND)
35700   NODE_NAME_CASE(FMUL_RND)
35701   NODE_NAME_CASE(FMULS)
35702   NODE_NAME_CASE(FMULS_RND)
35703   NODE_NAME_CASE(FDIV_RND)
35704   NODE_NAME_CASE(FDIVS)
35705   NODE_NAME_CASE(FDIVS_RND)
35706   NODE_NAME_CASE(FSQRT_RND)
35707   NODE_NAME_CASE(FSQRTS)
35708   NODE_NAME_CASE(FSQRTS_RND)
35709   NODE_NAME_CASE(FGETEXP)
35710   NODE_NAME_CASE(FGETEXP_SAE)
35711   NODE_NAME_CASE(FGETEXPS)
35712   NODE_NAME_CASE(FGETEXPS_SAE)
35713   NODE_NAME_CASE(SCALEF)
35714   NODE_NAME_CASE(SCALEF_RND)
35715   NODE_NAME_CASE(SCALEFS)
35716   NODE_NAME_CASE(SCALEFS_RND)
35717   NODE_NAME_CASE(MULHRS)
35718   NODE_NAME_CASE(SINT_TO_FP_RND)
35719   NODE_NAME_CASE(UINT_TO_FP_RND)
35720   NODE_NAME_CASE(CVTTP2SI)
35721   NODE_NAME_CASE(CVTTP2UI)
35722   NODE_NAME_CASE(STRICT_CVTTP2SI)
35723   NODE_NAME_CASE(STRICT_CVTTP2UI)
35724   NODE_NAME_CASE(MCVTTP2SI)
35725   NODE_NAME_CASE(MCVTTP2UI)
35726   NODE_NAME_CASE(CVTTP2SI_SAE)
35727   NODE_NAME_CASE(CVTTP2UI_SAE)
35728   NODE_NAME_CASE(CVTTS2SI)
35729   NODE_NAME_CASE(CVTTS2UI)
35730   NODE_NAME_CASE(CVTTS2SI_SAE)
35731   NODE_NAME_CASE(CVTTS2UI_SAE)
35732   NODE_NAME_CASE(CVTSI2P)
35733   NODE_NAME_CASE(CVTUI2P)
35734   NODE_NAME_CASE(STRICT_CVTSI2P)
35735   NODE_NAME_CASE(STRICT_CVTUI2P)
35736   NODE_NAME_CASE(MCVTSI2P)
35737   NODE_NAME_CASE(MCVTUI2P)
35738   NODE_NAME_CASE(VFPCLASS)
35739   NODE_NAME_CASE(VFPCLASSS)
35740   NODE_NAME_CASE(MULTISHIFT)
35741   NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35742   NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35743   NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35744   NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35745   NODE_NAME_CASE(CVTPS2PH)
35746   NODE_NAME_CASE(STRICT_CVTPS2PH)
35747   NODE_NAME_CASE(CVTPS2PH_SAE)
35748   NODE_NAME_CASE(MCVTPS2PH)
35749   NODE_NAME_CASE(MCVTPS2PH_SAE)
35750   NODE_NAME_CASE(CVTPH2PS)
35751   NODE_NAME_CASE(STRICT_CVTPH2PS)
35752   NODE_NAME_CASE(CVTPH2PS_SAE)
35753   NODE_NAME_CASE(CVTP2SI)
35754   NODE_NAME_CASE(CVTP2UI)
35755   NODE_NAME_CASE(MCVTP2SI)
35756   NODE_NAME_CASE(MCVTP2UI)
35757   NODE_NAME_CASE(CVTP2SI_RND)
35758   NODE_NAME_CASE(CVTP2UI_RND)
35759   NODE_NAME_CASE(CVTS2SI)
35760   NODE_NAME_CASE(CVTS2UI)
35761   NODE_NAME_CASE(CVTS2SI_RND)
35762   NODE_NAME_CASE(CVTS2UI_RND)
35763   NODE_NAME_CASE(CVTNE2PS2BF16)
35764   NODE_NAME_CASE(CVTNEPS2BF16)
35765   NODE_NAME_CASE(MCVTNEPS2BF16)
35766   NODE_NAME_CASE(DPBF16PS)
35767   NODE_NAME_CASE(LWPINS)
35768   NODE_NAME_CASE(MGATHER)
35769   NODE_NAME_CASE(MSCATTER)
35770   NODE_NAME_CASE(VPDPBUSD)
35771   NODE_NAME_CASE(VPDPBUSDS)
35772   NODE_NAME_CASE(VPDPWSSD)
35773   NODE_NAME_CASE(VPDPWSSDS)
35774   NODE_NAME_CASE(VPSHUFBITQMB)
35775   NODE_NAME_CASE(GF2P8MULB)
35776   NODE_NAME_CASE(GF2P8AFFINEQB)
35777   NODE_NAME_CASE(GF2P8AFFINEINVQB)
35778   NODE_NAME_CASE(NT_CALL)
35779   NODE_NAME_CASE(NT_BRIND)
35780   NODE_NAME_CASE(UMWAIT)
35781   NODE_NAME_CASE(TPAUSE)
35782   NODE_NAME_CASE(ENQCMD)
35783   NODE_NAME_CASE(ENQCMDS)
35784   NODE_NAME_CASE(VP2INTERSECT)
35785   NODE_NAME_CASE(VPDPBSUD)
35786   NODE_NAME_CASE(VPDPBSUDS)
35787   NODE_NAME_CASE(VPDPBUUD)
35788   NODE_NAME_CASE(VPDPBUUDS)
35789   NODE_NAME_CASE(VPDPBSSD)
35790   NODE_NAME_CASE(VPDPBSSDS)
35791   NODE_NAME_CASE(AESENC128KL)
35792   NODE_NAME_CASE(AESDEC128KL)
35793   NODE_NAME_CASE(AESENC256KL)
35794   NODE_NAME_CASE(AESDEC256KL)
35795   NODE_NAME_CASE(AESENCWIDE128KL)
35796   NODE_NAME_CASE(AESDECWIDE128KL)
35797   NODE_NAME_CASE(AESENCWIDE256KL)
35798   NODE_NAME_CASE(AESDECWIDE256KL)
35799   NODE_NAME_CASE(CMPCCXADD)
35800   NODE_NAME_CASE(TESTUI)
35801   NODE_NAME_CASE(FP80_ADD)
35802   NODE_NAME_CASE(STRICT_FP80_ADD)
35803   }
35804   return nullptr;
35805 #undef NODE_NAME_CASE
35806 }
35807
35808 /// Return true if the addressing mode represented by AM is legal for this
35809 /// target, for a load/store of the specified type.
35810 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35811                                               const AddrMode &AM, Type *Ty,
35812                                               unsigned AS,
35813                                               Instruction *I) const {
35814   // X86 supports extremely general addressing modes.
35815   CodeModel::Model M = getTargetMachine().getCodeModel();
35816
35817   // X86 allows a sign-extended 32-bit immediate field as a displacement.
35818   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35819     return false;
35820
35821   if (AM.BaseGV) {
35822     unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35823
35824     // If a reference to this global requires an extra load, we can't fold it.
35825     if (isGlobalStubReference(GVFlags))
35826       return false;
35827
35828     // If BaseGV requires a register for the PIC base, we cannot also have a
35829     // BaseReg specified.
35830     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35831       return false;
35832
35833     // If lower 4G is not available, then we must use rip-relative addressing.
35834     if ((M != CodeModel::Small || isPositionIndependent()) &&
35835         Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35836       return false;
35837   }
35838
35839   switch (AM.Scale) {
35840   case 0:
35841   case 1:
35842   case 2:
35843   case 4:
35844   case 8:
35845     // These scales always work.
35846     break;
35847   case 3:
35848   case 5:
35849   case 9:
35850     // These scales are formed with basereg+scalereg.  Only accept if there is
35851     // no basereg yet.
35852     if (AM.HasBaseReg)
35853       return false;
35854     break;
35855   default:  // Other stuff never works.
35856     return false;
35857   }
35858
35859   return true;
35860 }
35861
35862 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
35863   unsigned Bits = Ty->getScalarSizeInBits();
35864
35865   // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
35866   // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
35867   if (Subtarget.hasXOP() &&
35868       (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
35869     return false;
35870
35871   // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
35872   // shifts just as cheap as scalar ones.
35873   if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
35874     return false;
35875
35876   // AVX512BW has shifts such as vpsllvw.
35877   if (Subtarget.hasBWI() && Bits == 16)
35878     return false;
35879
35880   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
35881   // fully general vector.
35882   return true;
35883 }
35884
35885 bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35886   switch (Opcode) {
35887   // These are non-commutative binops.
35888   // TODO: Add more X86ISD opcodes once we have test coverage.
35889   case X86ISD::ANDNP:
35890   case X86ISD::PCMPGT:
35891   case X86ISD::FMAX:
35892   case X86ISD::FMIN:
35893   case X86ISD::FANDN:
35894   case X86ISD::VPSHA:
35895   case X86ISD::VPSHL:
35896   case X86ISD::VSHLV:
35897   case X86ISD::VSRLV:
35898   case X86ISD::VSRAV:
35899     return true;
35900   }
35901
35902   return TargetLoweringBase::isBinOp(Opcode);
35903 }
35904
35905 bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35906   switch (Opcode) {
35907   // TODO: Add more X86ISD opcodes once we have test coverage.
35908   case X86ISD::PCMPEQ:
35909   case X86ISD::PMULDQ:
35910   case X86ISD::PMULUDQ:
35911   case X86ISD::FMAXC:
35912   case X86ISD::FMINC:
35913   case X86ISD::FAND:
35914   case X86ISD::FOR:
35915   case X86ISD::FXOR:
35916     return true;
35917   }
35918
35919   return TargetLoweringBase::isCommutativeBinOp(Opcode);
35920 }
35921
35922 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35923   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35924     return false;
35925   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35926   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35927   return NumBits1 > NumBits2;
35928 }
35929
35930 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35931   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35932     return false;
35933
35934   if (!isTypeLegal(EVT::getEVT(Ty1)))
35935     return false;
35936
35937   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35938
35939   // Assuming the caller doesn't have a zeroext or signext return parameter,
35940   // truncation all the way down to i1 is valid.
35941   return true;
35942 }
35943
35944 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35945   return isInt<32>(Imm);
35946 }
35947
35948 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35949   // Can also use sub to handle negated immediates.
35950   return isInt<32>(Imm);
35951 }
35952
35953 bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35954   return isInt<32>(Imm);
35955 }
35956
35957 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35958   if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35959     return false;
35960   unsigned NumBits1 = VT1.getSizeInBits();
35961   unsigned NumBits2 = VT2.getSizeInBits();
35962   return NumBits1 > NumBits2;
35963 }
35964
35965 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35966   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35967   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35968 }
35969
35970 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35971   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35972   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35973 }
35974
35975 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35976   EVT VT1 = Val.getValueType();
35977   if (isZExtFree(VT1, VT2))
35978     return true;
35979
35980   if (Val.getOpcode() != ISD::LOAD)
35981     return false;
35982
35983   if (!VT1.isSimple() || !VT1.isInteger() ||
35984       !VT2.isSimple() || !VT2.isInteger())
35985     return false;
35986
35987   switch (VT1.getSimpleVT().SimpleTy) {
35988   default: break;
35989   case MVT::i8:
35990   case MVT::i16:
35991   case MVT::i32:
35992     // X86 has 8, 16, and 32-bit zero-extending loads.
35993     return true;
35994   }
35995
35996   return false;
35997 }
35998
35999 bool X86TargetLowering::shouldSinkOperands(Instruction *I,
36000                                            SmallVectorImpl<Use *> &Ops) const {
36001   using namespace llvm::PatternMatch;
36002
36003   FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
36004   if (!VTy)
36005     return false;
36006
36007   if (I->getOpcode() == Instruction::Mul &&
36008       VTy->getElementType()->isIntegerTy(64)) {
36009     for (auto &Op : I->operands()) {
36010       // Make sure we are not already sinking this operand
36011       if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
36012         continue;
36013
36014       // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
36015       // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
36016       if (Subtarget.hasSSE41() &&
36017           match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
36018                                  m_SpecificInt(32)))) {
36019         Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
36020         Ops.push_back(&Op);
36021       } else if (Subtarget.hasSSE2() &&
36022                  match(Op.get(),
36023                        m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
36024         Ops.push_back(&Op);
36025       }
36026     }
36027
36028     return !Ops.empty();
36029   }
36030
36031   // A uniform shift amount in a vector shift or funnel shift may be much
36032   // cheaper than a generic variable vector shift, so make that pattern visible
36033   // to SDAG by sinking the shuffle instruction next to the shift.
36034   int ShiftAmountOpNum = -1;
36035   if (I->isShift())
36036     ShiftAmountOpNum = 1;
36037   else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
36038     if (II->getIntrinsicID() == Intrinsic::fshl ||
36039         II->getIntrinsicID() == Intrinsic::fshr)
36040       ShiftAmountOpNum = 2;
36041   }
36042
36043   if (ShiftAmountOpNum == -1)
36044     return false;
36045
36046   auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
36047   if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
36048       isVectorShiftByScalarCheap(I->getType())) {
36049     Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
36050     return true;
36051   }
36052
36053   return false;
36054 }
36055
36056 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
36057   if (!Subtarget.is64Bit())
36058     return false;
36059   return TargetLowering::shouldConvertPhiType(From, To);
36060 }
36061
36062 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
36063   if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
36064     return false;
36065
36066   EVT SrcVT = ExtVal.getOperand(0).getValueType();
36067
36068   // There is no extending load for vXi1.
36069   if (SrcVT.getScalarType() == MVT::i1)
36070     return false;
36071
36072   return true;
36073 }
36074
36075 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
36076                                                    EVT VT) const {
36077   if (!Subtarget.hasAnyFMA())
36078     return false;
36079
36080   VT = VT.getScalarType();
36081
36082   if (!VT.isSimple())
36083     return false;
36084
36085   switch (VT.getSimpleVT().SimpleTy) {
36086   case MVT::f16:
36087     return Subtarget.hasFP16();
36088   case MVT::f32:
36089   case MVT::f64:
36090     return true;
36091   default:
36092     break;
36093   }
36094
36095   return false;
36096 }
36097
36098 bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
36099   // i16 instructions are longer (0x66 prefix) and potentially slower.
36100   return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
36101 }
36102
36103 bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
36104                                                              EVT VT) const {
36105   // TODO: This is too general. There are cases where pre-AVX512 codegen would
36106   //       benefit. The transform may also be profitable for scalar code.
36107   if (!Subtarget.hasAVX512())
36108     return false;
36109   if (!Subtarget.hasVLX() && !VT.is512BitVector())
36110     return false;
36111   if (!VT.isVector() || VT.getScalarType() == MVT::i1)
36112     return false;
36113
36114   return true;
36115 }
36116
36117 /// Targets can use this to indicate that they only support *some*
36118 /// VECTOR_SHUFFLE operations, those with specific masks.
36119 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
36120 /// are assumed to be legal.
36121 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
36122   if (!VT.isSimple())
36123     return false;
36124
36125   // Not for i1 vectors
36126   if (VT.getSimpleVT().getScalarType() == MVT::i1)
36127     return false;
36128
36129   // Very little shuffling can be done for 64-bit vectors right now.
36130   if (VT.getSimpleVT().getSizeInBits() == 64)
36131     return false;
36132
36133   // We only care that the types being shuffled are legal. The lowering can
36134   // handle any possible shuffle mask that results.
36135   return isTypeLegal(VT.getSimpleVT());
36136 }
36137
36138 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
36139                                                EVT VT) const {
36140   // Don't convert an 'and' into a shuffle that we don't directly support.
36141   // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
36142   if (!Subtarget.hasAVX2())
36143     if (VT == MVT::v32i8 || VT == MVT::v16i16)
36144       return false;
36145
36146   // Just delegate to the generic legality, clear masks aren't special.
36147   return isShuffleMaskLegal(Mask, VT);
36148 }
36149
36150 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
36151   // If the subtarget is using thunks, we need to not generate jump tables.
36152   if (Subtarget.useIndirectThunkBranches())
36153     return false;
36154
36155   // Otherwise, fallback on the generic logic.
36156   return TargetLowering::areJTsAllowed(Fn);
36157 }
36158
36159 MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
36160                                                        EVT ConditionVT) const {
36161   // Avoid 8 and 16 bit types because they increase the chance for unnecessary
36162   // zero-extensions.
36163   if (ConditionVT.getSizeInBits() < 32)
36164     return MVT::i32;
36165   return TargetLoweringBase::getPreferredSwitchConditionType(Context,
36166                                                              ConditionVT);
36167 }
36168
36169 //===----------------------------------------------------------------------===//
36170 //                           X86 Scheduler Hooks
36171 //===----------------------------------------------------------------------===//
36172
36173 // Returns true if EFLAG is consumed after this iterator in the rest of the
36174 // basic block or any successors of the basic block.
36175 static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
36176                               MachineBasicBlock *BB) {
36177   // Scan forward through BB for a use/def of EFLAGS.
36178   for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
36179     if (mi.readsRegister(X86::EFLAGS))
36180       return true;
36181     // If we found a def, we can stop searching.
36182     if (mi.definesRegister(X86::EFLAGS))
36183       return false;
36184   }
36185
36186   // If we hit the end of the block, check whether EFLAGS is live into a
36187   // successor.
36188   for (MachineBasicBlock *Succ : BB->successors())
36189     if (Succ->isLiveIn(X86::EFLAGS))
36190       return true;
36191
36192   return false;
36193 }
36194
36195 /// Utility function to emit xbegin specifying the start of an RTM region.
36196 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
36197                                      const TargetInstrInfo *TII) {
36198   const DebugLoc &DL = MI.getDebugLoc();
36199
36200   const BasicBlock *BB = MBB->getBasicBlock();
36201   MachineFunction::iterator I = ++MBB->getIterator();
36202
36203   // For the v = xbegin(), we generate
36204   //
36205   // thisMBB:
36206   //  xbegin sinkMBB
36207   //
36208   // mainMBB:
36209   //  s0 = -1
36210   //
36211   // fallBB:
36212   //  eax = # XABORT_DEF
36213   //  s1 = eax
36214   //
36215   // sinkMBB:
36216   //  v = phi(s0/mainBB, s1/fallBB)
36217
36218   MachineBasicBlock *thisMBB = MBB;
36219   MachineFunction *MF = MBB->getParent();
36220   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
36221   MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
36222   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36223   MF->insert(I, mainMBB);
36224   MF->insert(I, fallMBB);
36225   MF->insert(I, sinkMBB);
36226
36227   if (isEFLAGSLiveAfter(MI, MBB)) {
36228     mainMBB->addLiveIn(X86::EFLAGS);
36229     fallMBB->addLiveIn(X86::EFLAGS);
36230     sinkMBB->addLiveIn(X86::EFLAGS);
36231   }
36232
36233   // Transfer the remainder of BB and its successor edges to sinkMBB.
36234   sinkMBB->splice(sinkMBB->begin(), MBB,
36235                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36236   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36237
36238   MachineRegisterInfo &MRI = MF->getRegInfo();
36239   Register DstReg = MI.getOperand(0).getReg();
36240   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
36241   Register mainDstReg = MRI.createVirtualRegister(RC);
36242   Register fallDstReg = MRI.createVirtualRegister(RC);
36243
36244   // thisMBB:
36245   //  xbegin fallMBB
36246   //  # fallthrough to mainMBB
36247   //  # abortion to fallMBB
36248   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
36249   thisMBB->addSuccessor(mainMBB);
36250   thisMBB->addSuccessor(fallMBB);
36251
36252   // mainMBB:
36253   //  mainDstReg := -1
36254   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
36255   BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
36256   mainMBB->addSuccessor(sinkMBB);
36257
36258   // fallMBB:
36259   //  ; pseudo instruction to model hardware's definition from XABORT
36260   //  EAX := XABORT_DEF
36261   //  fallDstReg := EAX
36262   BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
36263   BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
36264       .addReg(X86::EAX);
36265   fallMBB->addSuccessor(sinkMBB);
36266
36267   // sinkMBB:
36268   //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
36269   BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
36270       .addReg(mainDstReg).addMBB(mainMBB)
36271       .addReg(fallDstReg).addMBB(fallMBB);
36272
36273   MI.eraseFromParent();
36274   return sinkMBB;
36275 }
36276
36277 MachineBasicBlock *
36278 X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
36279                                                MachineBasicBlock *MBB) const {
36280   // Emit va_arg instruction on X86-64.
36281
36282   // Operands to this pseudo-instruction:
36283   // 0  ) Output        : destination address (reg)
36284   // 1-5) Input         : va_list address (addr, i64mem)
36285   // 6  ) ArgSize       : Size (in bytes) of vararg type
36286   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
36287   // 8  ) Align         : Alignment of type
36288   // 9  ) EFLAGS (implicit-def)
36289
36290   assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
36291   static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
36292
36293   Register DestReg = MI.getOperand(0).getReg();
36294   MachineOperand &Base = MI.getOperand(1);
36295   MachineOperand &Scale = MI.getOperand(2);
36296   MachineOperand &Index = MI.getOperand(3);
36297   MachineOperand &Disp = MI.getOperand(4);
36298   MachineOperand &Segment = MI.getOperand(5);
36299   unsigned ArgSize = MI.getOperand(6).getImm();
36300   unsigned ArgMode = MI.getOperand(7).getImm();
36301   Align Alignment = Align(MI.getOperand(8).getImm());
36302
36303   MachineFunction *MF = MBB->getParent();
36304
36305   // Memory Reference
36306   assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
36307
36308   MachineMemOperand *OldMMO = MI.memoperands().front();
36309
36310   // Clone the MMO into two separate MMOs for loading and storing
36311   MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
36312       OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
36313   MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
36314       OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
36315
36316   // Machine Information
36317   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36318   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
36319   const TargetRegisterClass *AddrRegClass =
36320       getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
36321   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
36322   const DebugLoc &DL = MI.getDebugLoc();
36323
36324   // struct va_list {
36325   //   i32   gp_offset
36326   //   i32   fp_offset
36327   //   i64   overflow_area (address)
36328   //   i64   reg_save_area (address)
36329   // }
36330   // sizeof(va_list) = 24
36331   // alignment(va_list) = 8
36332
36333   unsigned TotalNumIntRegs = 6;
36334   unsigned TotalNumXMMRegs = 8;
36335   bool UseGPOffset = (ArgMode == 1);
36336   bool UseFPOffset = (ArgMode == 2);
36337   unsigned MaxOffset = TotalNumIntRegs * 8 +
36338                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
36339
36340   /* Align ArgSize to a multiple of 8 */
36341   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
36342   bool NeedsAlign = (Alignment > 8);
36343
36344   MachineBasicBlock *thisMBB = MBB;
36345   MachineBasicBlock *overflowMBB;
36346   MachineBasicBlock *offsetMBB;
36347   MachineBasicBlock *endMBB;
36348
36349   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
36350   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
36351   unsigned OffsetReg = 0;
36352
36353   if (!UseGPOffset && !UseFPOffset) {
36354     // If we only pull from the overflow region, we don't create a branch.
36355     // We don't need to alter control flow.
36356     OffsetDestReg = 0; // unused
36357     OverflowDestReg = DestReg;
36358
36359     offsetMBB = nullptr;
36360     overflowMBB = thisMBB;
36361     endMBB = thisMBB;
36362   } else {
36363     // First emit code to check if gp_offset (or fp_offset) is below the bound.
36364     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
36365     // If not, pull from overflow_area. (branch to overflowMBB)
36366     //
36367     //       thisMBB
36368     //         |     .
36369     //         |        .
36370     //     offsetMBB   overflowMBB
36371     //         |        .
36372     //         |     .
36373     //        endMBB
36374
36375     // Registers for the PHI in endMBB
36376     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
36377     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
36378
36379     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36380     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36381     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36382     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36383
36384     MachineFunction::iterator MBBIter = ++MBB->getIterator();
36385
36386     // Insert the new basic blocks
36387     MF->insert(MBBIter, offsetMBB);
36388     MF->insert(MBBIter, overflowMBB);
36389     MF->insert(MBBIter, endMBB);
36390
36391     // Transfer the remainder of MBB and its successor edges to endMBB.
36392     endMBB->splice(endMBB->begin(), thisMBB,
36393                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
36394     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
36395
36396     // Make offsetMBB and overflowMBB successors of thisMBB
36397     thisMBB->addSuccessor(offsetMBB);
36398     thisMBB->addSuccessor(overflowMBB);
36399
36400     // endMBB is a successor of both offsetMBB and overflowMBB
36401     offsetMBB->addSuccessor(endMBB);
36402     overflowMBB->addSuccessor(endMBB);
36403
36404     // Load the offset value into a register
36405     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36406     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
36407         .add(Base)
36408         .add(Scale)
36409         .add(Index)
36410         .addDisp(Disp, UseFPOffset ? 4 : 0)
36411         .add(Segment)
36412         .setMemRefs(LoadOnlyMMO);
36413
36414     // Check if there is enough room left to pull this argument.
36415     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
36416       .addReg(OffsetReg)
36417       .addImm(MaxOffset + 8 - ArgSizeA8);
36418
36419     // Branch to "overflowMBB" if offset >= max
36420     // Fall through to "offsetMBB" otherwise
36421     BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
36422       .addMBB(overflowMBB).addImm(X86::COND_AE);
36423   }
36424
36425   // In offsetMBB, emit code to use the reg_save_area.
36426   if (offsetMBB) {
36427     assert(OffsetReg != 0);
36428
36429     // Read the reg_save_area address.
36430     Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
36431     BuildMI(
36432         offsetMBB, DL,
36433         TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36434         RegSaveReg)
36435         .add(Base)
36436         .add(Scale)
36437         .add(Index)
36438         .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
36439         .add(Segment)
36440         .setMemRefs(LoadOnlyMMO);
36441
36442     if (Subtarget.isTarget64BitLP64()) {
36443       // Zero-extend the offset
36444       Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
36445       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
36446           .addImm(0)
36447           .addReg(OffsetReg)
36448           .addImm(X86::sub_32bit);
36449
36450       // Add the offset to the reg_save_area to get the final address.
36451       BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
36452           .addReg(OffsetReg64)
36453           .addReg(RegSaveReg);
36454     } else {
36455       // Add the offset to the reg_save_area to get the final address.
36456       BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
36457           .addReg(OffsetReg)
36458           .addReg(RegSaveReg);
36459     }
36460
36461     // Compute the offset for the next argument
36462     Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36463     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
36464       .addReg(OffsetReg)
36465       .addImm(UseFPOffset ? 16 : 8);
36466
36467     // Store it back into the va_list.
36468     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
36469         .add(Base)
36470         .add(Scale)
36471         .add(Index)
36472         .addDisp(Disp, UseFPOffset ? 4 : 0)
36473         .add(Segment)
36474         .addReg(NextOffsetReg)
36475         .setMemRefs(StoreOnlyMMO);
36476
36477     // Jump to endMBB
36478     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
36479       .addMBB(endMBB);
36480   }
36481
36482   //
36483   // Emit code to use overflow area
36484   //
36485
36486   // Load the overflow_area address into a register.
36487   Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36488   BuildMI(overflowMBB, DL,
36489           TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36490           OverflowAddrReg)
36491       .add(Base)
36492       .add(Scale)
36493       .add(Index)
36494       .addDisp(Disp, 8)
36495       .add(Segment)
36496       .setMemRefs(LoadOnlyMMO);
36497
36498   // If we need to align it, do so. Otherwise, just copy the address
36499   // to OverflowDestReg.
36500   if (NeedsAlign) {
36501     // Align the overflow address
36502     Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36503
36504     // aligned_addr = (addr + (align-1)) & ~(align-1)
36505     BuildMI(
36506         overflowMBB, DL,
36507         TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36508         TmpReg)
36509         .addReg(OverflowAddrReg)
36510         .addImm(Alignment.value() - 1);
36511
36512     BuildMI(
36513         overflowMBB, DL,
36514         TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36515         OverflowDestReg)
36516         .addReg(TmpReg)
36517         .addImm(~(uint64_t)(Alignment.value() - 1));
36518   } else {
36519     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
36520       .addReg(OverflowAddrReg);
36521   }
36522
36523   // Compute the next overflow address after this argument.
36524   // (the overflow address should be kept 8-byte aligned)
36525   Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36526   BuildMI(
36527       overflowMBB, DL,
36528       TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36529       NextAddrReg)
36530       .addReg(OverflowDestReg)
36531       .addImm(ArgSizeA8);
36532
36533   // Store the new overflow address.
36534   BuildMI(overflowMBB, DL,
36535           TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36536       .add(Base)
36537       .add(Scale)
36538       .add(Index)
36539       .addDisp(Disp, 8)
36540       .add(Segment)
36541       .addReg(NextAddrReg)
36542       .setMemRefs(StoreOnlyMMO);
36543
36544   // If we branched, emit the PHI to the front of endMBB.
36545   if (offsetMBB) {
36546     BuildMI(*endMBB, endMBB->begin(), DL,
36547             TII->get(X86::PHI), DestReg)
36548       .addReg(OffsetDestReg).addMBB(offsetMBB)
36549       .addReg(OverflowDestReg).addMBB(overflowMBB);
36550   }
36551
36552   // Erase the pseudo instruction
36553   MI.eraseFromParent();
36554
36555   return endMBB;
36556 }
36557
36558 // The EFLAGS operand of SelectItr might be missing a kill marker
36559 // because there were multiple uses of EFLAGS, and ISel didn't know
36560 // which to mark. Figure out whether SelectItr should have had a
36561 // kill marker, and set it if it should. Returns the correct kill
36562 // marker value.
36563 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36564                                      MachineBasicBlock* BB,
36565                                      const TargetRegisterInfo* TRI) {
36566   if (isEFLAGSLiveAfter(SelectItr, BB))
36567     return false;
36568
36569   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36570   // out. SelectMI should have a kill flag on EFLAGS.
36571   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36572   return true;
36573 }
36574
36575 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36576 // together with other CMOV pseudo-opcodes into a single basic-block with
36577 // conditional jump around it.
36578 static bool isCMOVPseudo(MachineInstr &MI) {
36579   switch (MI.getOpcode()) {
36580   case X86::CMOV_FR16:
36581   case X86::CMOV_FR16X:
36582   case X86::CMOV_FR32:
36583   case X86::CMOV_FR32X:
36584   case X86::CMOV_FR64:
36585   case X86::CMOV_FR64X:
36586   case X86::CMOV_GR8:
36587   case X86::CMOV_GR16:
36588   case X86::CMOV_GR32:
36589   case X86::CMOV_RFP32:
36590   case X86::CMOV_RFP64:
36591   case X86::CMOV_RFP80:
36592   case X86::CMOV_VR64:
36593   case X86::CMOV_VR128:
36594   case X86::CMOV_VR128X:
36595   case X86::CMOV_VR256:
36596   case X86::CMOV_VR256X:
36597   case X86::CMOV_VR512:
36598   case X86::CMOV_VK1:
36599   case X86::CMOV_VK2:
36600   case X86::CMOV_VK4:
36601   case X86::CMOV_VK8:
36602   case X86::CMOV_VK16:
36603   case X86::CMOV_VK32:
36604   case X86::CMOV_VK64:
36605     return true;
36606
36607   default:
36608     return false;
36609   }
36610 }
36611
36612 // Helper function, which inserts PHI functions into SinkMBB:
36613 //   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36614 // where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
36615 // in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36616 // the last PHI function inserted.
36617 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36618     MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36619     MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36620     MachineBasicBlock *SinkMBB) {
36621   MachineFunction *MF = TrueMBB->getParent();
36622   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36623   const DebugLoc &DL = MIItBegin->getDebugLoc();
36624
36625   X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36626   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36627
36628   MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36629
36630   // As we are creating the PHIs, we have to be careful if there is more than
36631   // one.  Later CMOVs may reference the results of earlier CMOVs, but later
36632   // PHIs have to reference the individual true/false inputs from earlier PHIs.
36633   // That also means that PHI construction must work forward from earlier to
36634   // later, and that the code must maintain a mapping from earlier PHI's
36635   // destination registers, and the registers that went into the PHI.
36636   DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
36637   MachineInstrBuilder MIB;
36638
36639   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36640     Register DestReg = MIIt->getOperand(0).getReg();
36641     Register Op1Reg = MIIt->getOperand(1).getReg();
36642     Register Op2Reg = MIIt->getOperand(2).getReg();
36643
36644     // If this CMOV we are generating is the opposite condition from
36645     // the jump we generated, then we have to swap the operands for the
36646     // PHI that is going to be generated.
36647     if (MIIt->getOperand(3).getImm() == OppCC)
36648       std::swap(Op1Reg, Op2Reg);
36649
36650     if (RegRewriteTable.contains(Op1Reg))
36651       Op1Reg = RegRewriteTable[Op1Reg].first;
36652
36653     if (RegRewriteTable.contains(Op2Reg))
36654       Op2Reg = RegRewriteTable[Op2Reg].second;
36655
36656     MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
36657               .addReg(Op1Reg)
36658               .addMBB(FalseMBB)
36659               .addReg(Op2Reg)
36660               .addMBB(TrueMBB);
36661
36662     // Add this PHI to the rewrite table.
36663     RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36664   }
36665
36666   return MIB;
36667 }
36668
36669 // Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
36670 MachineBasicBlock *
36671 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36672                                              MachineInstr &SecondCascadedCMOV,
36673                                              MachineBasicBlock *ThisMBB) const {
36674   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36675   const DebugLoc &DL = FirstCMOV.getDebugLoc();
36676
36677   // We lower cascaded CMOVs such as
36678   //
36679   //   (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36680   //
36681   // to two successive branches.
36682   //
36683   // Without this, we would add a PHI between the two jumps, which ends up
36684   // creating a few copies all around. For instance, for
36685   //
36686   //    (sitofp (zext (fcmp une)))
36687   //
36688   // we would generate:
36689   //
36690   //         ucomiss %xmm1, %xmm0
36691   //         movss  <1.0f>, %xmm0
36692   //         movaps  %xmm0, %xmm1
36693   //         jne     .LBB5_2
36694   //         xorps   %xmm1, %xmm1
36695   // .LBB5_2:
36696   //         jp      .LBB5_4
36697   //         movaps  %xmm1, %xmm0
36698   // .LBB5_4:
36699   //         retq
36700   //
36701   // because this custom-inserter would have generated:
36702   //
36703   //   A
36704   //   | \
36705   //   |  B
36706   //   | /
36707   //   C
36708   //   | \
36709   //   |  D
36710   //   | /
36711   //   E
36712   //
36713   // A: X = ...; Y = ...
36714   // B: empty
36715   // C: Z = PHI [X, A], [Y, B]
36716   // D: empty
36717   // E: PHI [X, C], [Z, D]
36718   //
36719   // If we lower both CMOVs in a single step, we can instead generate:
36720   //
36721   //   A
36722   //   | \
36723   //   |  C
36724   //   | /|
36725   //   |/ |
36726   //   |  |
36727   //   |  D
36728   //   | /
36729   //   E
36730   //
36731   // A: X = ...; Y = ...
36732   // D: empty
36733   // E: PHI [X, A], [X, C], [Y, D]
36734   //
36735   // Which, in our sitofp/fcmp example, gives us something like:
36736   //
36737   //         ucomiss %xmm1, %xmm0
36738   //         movss  <1.0f>, %xmm0
36739   //         jne     .LBB5_4
36740   //         jp      .LBB5_4
36741   //         xorps   %xmm0, %xmm0
36742   // .LBB5_4:
36743   //         retq
36744   //
36745
36746   // We lower cascaded CMOV into two successive branches to the same block.
36747   // EFLAGS is used by both, so mark it as live in the second.
36748   const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36749   MachineFunction *F = ThisMBB->getParent();
36750   MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36751   MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36752   MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36753
36754   MachineFunction::iterator It = ++ThisMBB->getIterator();
36755   F->insert(It, FirstInsertedMBB);
36756   F->insert(It, SecondInsertedMBB);
36757   F->insert(It, SinkMBB);
36758
36759   // For a cascaded CMOV, we lower it to two successive branches to
36760   // the same block (SinkMBB).  EFLAGS is used by both, so mark it as live in
36761   // the FirstInsertedMBB.
36762   FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36763
36764   // If the EFLAGS register isn't dead in the terminator, then claim that it's
36765   // live into the sink and copy blocks.
36766   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36767   if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
36768       !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36769     SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36770     SinkMBB->addLiveIn(X86::EFLAGS);
36771   }
36772
36773   // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36774   SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36775                   std::next(MachineBasicBlock::iterator(FirstCMOV)),
36776                   ThisMBB->end());
36777   SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36778
36779   // Fallthrough block for ThisMBB.
36780   ThisMBB->addSuccessor(FirstInsertedMBB);
36781   // The true block target of the first branch is always SinkMBB.
36782   ThisMBB->addSuccessor(SinkMBB);
36783   // Fallthrough block for FirstInsertedMBB.
36784   FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36785   // The true block for the branch of FirstInsertedMBB.
36786   FirstInsertedMBB->addSuccessor(SinkMBB);
36787   // This is fallthrough.
36788   SecondInsertedMBB->addSuccessor(SinkMBB);
36789
36790   // Create the conditional branch instructions.
36791   X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36792   BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36793
36794   X86::CondCode SecondCC =
36795       X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36796   BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
36797
36798   //  SinkMBB:
36799   //   %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36800   Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36801   Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36802   Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36803   MachineInstrBuilder MIB =
36804       BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
36805           .addReg(Op1Reg)
36806           .addMBB(SecondInsertedMBB)
36807           .addReg(Op2Reg)
36808           .addMBB(ThisMBB);
36809
36810   // The second SecondInsertedMBB provides the same incoming value as the
36811   // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
36812   MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36813
36814   // Now remove the CMOVs.
36815   FirstCMOV.eraseFromParent();
36816   SecondCascadedCMOV.eraseFromParent();
36817
36818   return SinkMBB;
36819 }
36820
36821 MachineBasicBlock *
36822 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36823                                      MachineBasicBlock *ThisMBB) const {
36824   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36825   const DebugLoc &DL = MI.getDebugLoc();
36826
36827   // To "insert" a SELECT_CC instruction, we actually have to insert the
36828   // diamond control-flow pattern.  The incoming instruction knows the
36829   // destination vreg to set, the condition code register to branch on, the
36830   // true/false values to select between and a branch opcode to use.
36831
36832   //  ThisMBB:
36833   //  ...
36834   //   TrueVal = ...
36835   //   cmpTY ccX, r1, r2
36836   //   bCC copy1MBB
36837   //   fallthrough --> FalseMBB
36838
36839   // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36840   // as described above, by inserting a BB, and then making a PHI at the join
36841   // point to select the true and false operands of the CMOV in the PHI.
36842   //
36843   // The code also handles two different cases of multiple CMOV opcodes
36844   // in a row.
36845   //
36846   // Case 1:
36847   // In this case, there are multiple CMOVs in a row, all which are based on
36848   // the same condition setting (or the exact opposite condition setting).
36849   // In this case we can lower all the CMOVs using a single inserted BB, and
36850   // then make a number of PHIs at the join point to model the CMOVs. The only
36851   // trickiness here, is that in a case like:
36852   //
36853   // t2 = CMOV cond1 t1, f1
36854   // t3 = CMOV cond1 t2, f2
36855   //
36856   // when rewriting this into PHIs, we have to perform some renaming on the
36857   // temps since you cannot have a PHI operand refer to a PHI result earlier
36858   // in the same block.  The "simple" but wrong lowering would be:
36859   //
36860   // t2 = PHI t1(BB1), f1(BB2)
36861   // t3 = PHI t2(BB1), f2(BB2)
36862   //
36863   // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36864   // renaming is to note that on the path through BB1, t2 is really just a
36865   // copy of t1, and do that renaming, properly generating:
36866   //
36867   // t2 = PHI t1(BB1), f1(BB2)
36868   // t3 = PHI t1(BB1), f2(BB2)
36869   //
36870   // Case 2:
36871   // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36872   // function - EmitLoweredCascadedSelect.
36873
36874   X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36875   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36876   MachineInstr *LastCMOV = &MI;
36877   MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36878
36879   // Check for case 1, where there are multiple CMOVs with the same condition
36880   // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
36881   // number of jumps the most.
36882
36883   if (isCMOVPseudo(MI)) {
36884     // See if we have a string of CMOVS with the same condition. Skip over
36885     // intervening debug insts.
36886     while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36887            (NextMIIt->getOperand(3).getImm() == CC ||
36888             NextMIIt->getOperand(3).getImm() == OppCC)) {
36889       LastCMOV = &*NextMIIt;
36890       NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36891     }
36892   }
36893
36894   // This checks for case 2, but only do this if we didn't already find
36895   // case 1, as indicated by LastCMOV == MI.
36896   if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36897       NextMIIt->getOpcode() == MI.getOpcode() &&
36898       NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36899       NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36900       NextMIIt->getOperand(1).isKill()) {
36901     return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36902   }
36903
36904   const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36905   MachineFunction *F = ThisMBB->getParent();
36906   MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36907   MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36908
36909   MachineFunction::iterator It = ++ThisMBB->getIterator();
36910   F->insert(It, FalseMBB);
36911   F->insert(It, SinkMBB);
36912
36913   // If the EFLAGS register isn't dead in the terminator, then claim that it's
36914   // live into the sink and copy blocks.
36915   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36916   if (!LastCMOV->killsRegister(X86::EFLAGS) &&
36917       !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36918     FalseMBB->addLiveIn(X86::EFLAGS);
36919     SinkMBB->addLiveIn(X86::EFLAGS);
36920   }
36921
36922   // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36923   auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36924                                    MachineBasicBlock::iterator(LastCMOV));
36925   for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36926     if (MI.isDebugInstr())
36927       SinkMBB->push_back(MI.removeFromParent());
36928
36929   // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36930   SinkMBB->splice(SinkMBB->end(), ThisMBB,
36931                   std::next(MachineBasicBlock::iterator(LastCMOV)),
36932                   ThisMBB->end());
36933   SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36934
36935   // Fallthrough block for ThisMBB.
36936   ThisMBB->addSuccessor(FalseMBB);
36937   // The true block target of the first (or only) branch is always a SinkMBB.
36938   ThisMBB->addSuccessor(SinkMBB);
36939   // Fallthrough block for FalseMBB.
36940   FalseMBB->addSuccessor(SinkMBB);
36941
36942   // Create the conditional branch instruction.
36943   BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36944
36945   //  SinkMBB:
36946   //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36947   //  ...
36948   MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36949   MachineBasicBlock::iterator MIItEnd =
36950       std::next(MachineBasicBlock::iterator(LastCMOV));
36951   createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36952
36953   // Now remove the CMOV(s).
36954   ThisMBB->erase(MIItBegin, MIItEnd);
36955
36956   return SinkMBB;
36957 }
36958
36959 static unsigned getSUBriOpcode(bool IsLP64) {
36960   if (IsLP64)
36961     return X86::SUB64ri32;
36962   else
36963     return X86::SUB32ri;
36964 }
36965
36966 MachineBasicBlock *
36967 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36968                                            MachineBasicBlock *MBB) const {
36969   MachineFunction *MF = MBB->getParent();
36970   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36971   const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36972   const DebugLoc &DL = MI.getDebugLoc();
36973   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36974
36975   const unsigned ProbeSize = getStackProbeSize(*MF);
36976
36977   MachineRegisterInfo &MRI = MF->getRegInfo();
36978   MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36979   MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36980   MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36981
36982   MachineFunction::iterator MBBIter = ++MBB->getIterator();
36983   MF->insert(MBBIter, testMBB);
36984   MF->insert(MBBIter, blockMBB);
36985   MF->insert(MBBIter, tailMBB);
36986
36987   Register sizeVReg = MI.getOperand(1).getReg();
36988
36989   Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36990
36991   Register TmpStackPtr = MRI.createVirtualRegister(
36992       TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36993   Register FinalStackPtr = MRI.createVirtualRegister(
36994       TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36995
36996   BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
36997       .addReg(physSPReg);
36998   {
36999     const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
37000     BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
37001         .addReg(TmpStackPtr)
37002         .addReg(sizeVReg);
37003   }
37004
37005   // test rsp size
37006
37007   BuildMI(testMBB, DL,
37008           TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
37009       .addReg(FinalStackPtr)
37010       .addReg(physSPReg);
37011
37012   BuildMI(testMBB, DL, TII->get(X86::JCC_1))
37013       .addMBB(tailMBB)
37014       .addImm(X86::COND_GE);
37015   testMBB->addSuccessor(blockMBB);
37016   testMBB->addSuccessor(tailMBB);
37017
37018   // Touch the block then extend it. This is done on the opposite side of
37019   // static probe where we allocate then touch, to avoid the need of probing the
37020   // tail of the static alloca. Possible scenarios are:
37021   //
37022   //       + ---- <- ------------ <- ------------- <- ------------ +
37023   //       |                                                       |
37024   // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
37025   //                                                               |                                                               |
37026   //                                                               + <- ----------- <- ------------ <- ----------- <- ------------ +
37027   //
37028   // The property we want to enforce is to never have more than [page alloc] between two probes.
37029
37030   const unsigned XORMIOpc =
37031       TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
37032   addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
37033       .addImm(0);
37034
37035   BuildMI(blockMBB, DL,
37036           TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)), physSPReg)
37037       .addReg(physSPReg)
37038       .addImm(ProbeSize);
37039
37040
37041   BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
37042   blockMBB->addSuccessor(testMBB);
37043
37044   // Replace original instruction by the expected stack ptr
37045   BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
37046       .addReg(FinalStackPtr);
37047
37048   tailMBB->splice(tailMBB->end(), MBB,
37049                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37050   tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
37051   MBB->addSuccessor(testMBB);
37052
37053   // Delete the original pseudo instruction.
37054   MI.eraseFromParent();
37055
37056   // And we're done.
37057   return tailMBB;
37058 }
37059
37060 MachineBasicBlock *
37061 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
37062                                         MachineBasicBlock *BB) const {
37063   MachineFunction *MF = BB->getParent();
37064   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37065   const DebugLoc &DL = MI.getDebugLoc();
37066   const BasicBlock *LLVM_BB = BB->getBasicBlock();
37067
37068   assert(MF->shouldSplitStack());
37069
37070   const bool Is64Bit = Subtarget.is64Bit();
37071   const bool IsLP64 = Subtarget.isTarget64BitLP64();
37072
37073   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
37074   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
37075
37076   // BB:
37077   //  ... [Till the alloca]
37078   // If stacklet is not large enough, jump to mallocMBB
37079   //
37080   // bumpMBB:
37081   //  Allocate by subtracting from RSP
37082   //  Jump to continueMBB
37083   //
37084   // mallocMBB:
37085   //  Allocate by call to runtime
37086   //
37087   // continueMBB:
37088   //  ...
37089   //  [rest of original BB]
37090   //
37091
37092   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
37093   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
37094   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
37095
37096   MachineRegisterInfo &MRI = MF->getRegInfo();
37097   const TargetRegisterClass *AddrRegClass =
37098       getRegClassFor(getPointerTy(MF->getDataLayout()));
37099
37100   Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
37101            bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
37102            tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
37103            SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
37104            sizeVReg = MI.getOperand(1).getReg(),
37105            physSPReg =
37106                IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
37107
37108   MachineFunction::iterator MBBIter = ++BB->getIterator();
37109
37110   MF->insert(MBBIter, bumpMBB);
37111   MF->insert(MBBIter, mallocMBB);
37112   MF->insert(MBBIter, continueMBB);
37113
37114   continueMBB->splice(continueMBB->begin(), BB,
37115                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
37116   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
37117
37118   // Add code to the main basic block to check if the stack limit has been hit,
37119   // and if so, jump to mallocMBB otherwise to bumpMBB.
37120   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
37121   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
37122     .addReg(tmpSPVReg).addReg(sizeVReg);
37123   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
37124     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
37125     .addReg(SPLimitVReg);
37126   BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
37127
37128   // bumpMBB simply decreases the stack pointer, since we know the current
37129   // stacklet has enough space.
37130   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
37131     .addReg(SPLimitVReg);
37132   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
37133     .addReg(SPLimitVReg);
37134   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
37135
37136   // Calls into a routine in libgcc to allocate more space from the heap.
37137   const uint32_t *RegMask =
37138       Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
37139   if (IsLP64) {
37140     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
37141       .addReg(sizeVReg);
37142     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
37143       .addExternalSymbol("__morestack_allocate_stack_space")
37144       .addRegMask(RegMask)
37145       .addReg(X86::RDI, RegState::Implicit)
37146       .addReg(X86::RAX, RegState::ImplicitDefine);
37147   } else if (Is64Bit) {
37148     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
37149       .addReg(sizeVReg);
37150     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
37151       .addExternalSymbol("__morestack_allocate_stack_space")
37152       .addRegMask(RegMask)
37153       .addReg(X86::EDI, RegState::Implicit)
37154       .addReg(X86::EAX, RegState::ImplicitDefine);
37155   } else {
37156     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
37157       .addImm(12);
37158     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
37159     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
37160       .addExternalSymbol("__morestack_allocate_stack_space")
37161       .addRegMask(RegMask)
37162       .addReg(X86::EAX, RegState::ImplicitDefine);
37163   }
37164
37165   if (!Is64Bit)
37166     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
37167       .addImm(16);
37168
37169   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
37170     .addReg(IsLP64 ? X86::RAX : X86::EAX);
37171   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
37172
37173   // Set up the CFG correctly.
37174   BB->addSuccessor(bumpMBB);
37175   BB->addSuccessor(mallocMBB);
37176   mallocMBB->addSuccessor(continueMBB);
37177   bumpMBB->addSuccessor(continueMBB);
37178
37179   // Take care of the PHI nodes.
37180   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
37181           MI.getOperand(0).getReg())
37182       .addReg(mallocPtrVReg)
37183       .addMBB(mallocMBB)
37184       .addReg(bumpSPPtrVReg)
37185       .addMBB(bumpMBB);
37186
37187   // Delete the original pseudo instruction.
37188   MI.eraseFromParent();
37189
37190   // And we're done.
37191   return continueMBB;
37192 }
37193
37194 MachineBasicBlock *
37195 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
37196                                        MachineBasicBlock *BB) const {
37197   MachineFunction *MF = BB->getParent();
37198   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37199   MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
37200   const DebugLoc &DL = MI.getDebugLoc();
37201
37202   assert(!isAsynchronousEHPersonality(
37203              classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
37204          "SEH does not use catchret!");
37205
37206   // Only 32-bit EH needs to worry about manually restoring stack pointers.
37207   if (!Subtarget.is32Bit())
37208     return BB;
37209
37210   // C++ EH creates a new target block to hold the restore code, and wires up
37211   // the new block to the return destination with a normal JMP_4.
37212   MachineBasicBlock *RestoreMBB =
37213       MF->CreateMachineBasicBlock(BB->getBasicBlock());
37214   assert(BB->succ_size() == 1);
37215   MF->insert(std::next(BB->getIterator()), RestoreMBB);
37216   RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
37217   BB->addSuccessor(RestoreMBB);
37218   MI.getOperand(0).setMBB(RestoreMBB);
37219
37220   // Marking this as an EH pad but not a funclet entry block causes PEI to
37221   // restore stack pointers in the block.
37222   RestoreMBB->setIsEHPad(true);
37223
37224   auto RestoreMBBI = RestoreMBB->begin();
37225   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
37226   return BB;
37227 }
37228
37229 MachineBasicBlock *
37230 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
37231                                       MachineBasicBlock *BB) const {
37232   // So, here we replace TLSADDR with the sequence:
37233   // adjust_stackdown -> TLSADDR -> adjust_stackup.
37234   // We need this because TLSADDR is lowered into calls
37235   // inside MC, therefore without the two markers shrink-wrapping
37236   // may push the prologue/epilogue pass them.
37237   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37238   const DebugLoc &DL = MI.getDebugLoc();
37239   MachineFunction &MF = *BB->getParent();
37240
37241   // Emit CALLSEQ_START right before the instruction.
37242   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37243   MachineInstrBuilder CallseqStart =
37244     BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37245   BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37246
37247   // Emit CALLSEQ_END right after the instruction.
37248   // We don't call erase from parent because we want to keep the
37249   // original instruction around.
37250   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37251   MachineInstrBuilder CallseqEnd =
37252     BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
37253   BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37254
37255   return BB;
37256 }
37257
37258 MachineBasicBlock *
37259 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
37260                                       MachineBasicBlock *BB) const {
37261   // This is pretty easy.  We're taking the value that we received from
37262   // our load from the relocation, sticking it in either RDI (x86-64)
37263   // or EAX and doing an indirect call.  The return value will then
37264   // be in the normal return register.
37265   MachineFunction *F = BB->getParent();
37266   const X86InstrInfo *TII = Subtarget.getInstrInfo();
37267   const DebugLoc &DL = MI.getDebugLoc();
37268
37269   assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
37270   assert(MI.getOperand(3).isGlobal() && "This should be a global");
37271
37272   // Get a register mask for the lowered call.
37273   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
37274   // proper register mask.
37275   const uint32_t *RegMask =
37276       Subtarget.is64Bit() ?
37277       Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
37278       Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
37279   if (Subtarget.is64Bit()) {
37280     MachineInstrBuilder MIB =
37281         BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
37282             .addReg(X86::RIP)
37283             .addImm(0)
37284             .addReg(0)
37285             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
37286                               MI.getOperand(3).getTargetFlags())
37287             .addReg(0);
37288     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
37289     addDirectMem(MIB, X86::RDI);
37290     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
37291   } else if (!isPositionIndependent()) {
37292     MachineInstrBuilder MIB =
37293         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
37294             .addReg(0)
37295             .addImm(0)
37296             .addReg(0)
37297             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
37298                               MI.getOperand(3).getTargetFlags())
37299             .addReg(0);
37300     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
37301     addDirectMem(MIB, X86::EAX);
37302     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
37303   } else {
37304     MachineInstrBuilder MIB =
37305         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
37306             .addReg(TII->getGlobalBaseReg(F))
37307             .addImm(0)
37308             .addReg(0)
37309             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
37310                               MI.getOperand(3).getTargetFlags())
37311             .addReg(0);
37312     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
37313     addDirectMem(MIB, X86::EAX);
37314     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
37315   }
37316
37317   MI.eraseFromParent(); // The pseudo instruction is gone now.
37318   return BB;
37319 }
37320
37321 static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
37322   switch (RPOpc) {
37323   case X86::INDIRECT_THUNK_CALL32:
37324     return X86::CALLpcrel32;
37325   case X86::INDIRECT_THUNK_CALL64:
37326     return X86::CALL64pcrel32;
37327   case X86::INDIRECT_THUNK_TCRETURN32:
37328     return X86::TCRETURNdi;
37329   case X86::INDIRECT_THUNK_TCRETURN64:
37330     return X86::TCRETURNdi64;
37331   }
37332   llvm_unreachable("not indirect thunk opcode");
37333 }
37334
37335 static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
37336                                           unsigned Reg) {
37337   if (Subtarget.useRetpolineExternalThunk()) {
37338     // When using an external thunk for retpolines, we pick names that match the
37339     // names GCC happens to use as well. This helps simplify the implementation
37340     // of the thunks for kernels where they have no easy ability to create
37341     // aliases and are doing non-trivial configuration of the thunk's body. For
37342     // example, the Linux kernel will do boot-time hot patching of the thunk
37343     // bodies and cannot easily export aliases of these to loaded modules.
37344     //
37345     // Note that at any point in the future, we may need to change the semantics
37346     // of how we implement retpolines and at that time will likely change the
37347     // name of the called thunk. Essentially, there is no hard guarantee that
37348     // LLVM will generate calls to specific thunks, we merely make a best-effort
37349     // attempt to help out kernels and other systems where duplicating the
37350     // thunks is costly.
37351     switch (Reg) {
37352     case X86::EAX:
37353       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37354       return "__x86_indirect_thunk_eax";
37355     case X86::ECX:
37356       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37357       return "__x86_indirect_thunk_ecx";
37358     case X86::EDX:
37359       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37360       return "__x86_indirect_thunk_edx";
37361     case X86::EDI:
37362       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37363       return "__x86_indirect_thunk_edi";
37364     case X86::R11:
37365       assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37366       return "__x86_indirect_thunk_r11";
37367     }
37368     llvm_unreachable("unexpected reg for external indirect thunk");
37369   }
37370
37371   if (Subtarget.useRetpolineIndirectCalls() ||
37372       Subtarget.useRetpolineIndirectBranches()) {
37373     // When targeting an internal COMDAT thunk use an LLVM-specific name.
37374     switch (Reg) {
37375     case X86::EAX:
37376       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37377       return "__llvm_retpoline_eax";
37378     case X86::ECX:
37379       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37380       return "__llvm_retpoline_ecx";
37381     case X86::EDX:
37382       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37383       return "__llvm_retpoline_edx";
37384     case X86::EDI:
37385       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37386       return "__llvm_retpoline_edi";
37387     case X86::R11:
37388       assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37389       return "__llvm_retpoline_r11";
37390     }
37391     llvm_unreachable("unexpected reg for retpoline");
37392   }
37393
37394   if (Subtarget.useLVIControlFlowIntegrity()) {
37395     assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37396     return "__llvm_lvi_thunk_r11";
37397   }
37398   llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
37399 }
37400
37401 MachineBasicBlock *
37402 X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
37403                                             MachineBasicBlock *BB) const {
37404   // Copy the virtual register into the R11 physical register and
37405   // call the retpoline thunk.
37406   const DebugLoc &DL = MI.getDebugLoc();
37407   const X86InstrInfo *TII = Subtarget.getInstrInfo();
37408   Register CalleeVReg = MI.getOperand(0).getReg();
37409   unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
37410
37411   // Find an available scratch register to hold the callee. On 64-bit, we can
37412   // just use R11, but we scan for uses anyway to ensure we don't generate
37413   // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
37414   // already a register use operand to the call to hold the callee. If none
37415   // are available, use EDI instead. EDI is chosen because EBX is the PIC base
37416   // register and ESI is the base pointer to realigned stack frames with VLAs.
37417   SmallVector<unsigned, 3> AvailableRegs;
37418   if (Subtarget.is64Bit())
37419     AvailableRegs.push_back(X86::R11);
37420   else
37421     AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
37422
37423   // Zero out any registers that are already used.
37424   for (const auto &MO : MI.operands()) {
37425     if (MO.isReg() && MO.isUse())
37426       for (unsigned &Reg : AvailableRegs)
37427         if (Reg == MO.getReg())
37428           Reg = 0;
37429   }
37430
37431   // Choose the first remaining non-zero available register.
37432   unsigned AvailableReg = 0;
37433   for (unsigned MaybeReg : AvailableRegs) {
37434     if (MaybeReg) {
37435       AvailableReg = MaybeReg;
37436       break;
37437     }
37438   }
37439   if (!AvailableReg)
37440     report_fatal_error("calling convention incompatible with retpoline, no "
37441                        "available registers");
37442
37443   const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
37444
37445   BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
37446       .addReg(CalleeVReg);
37447   MI.getOperand(0).ChangeToES(Symbol);
37448   MI.setDesc(TII->get(Opc));
37449   MachineInstrBuilder(*BB->getParent(), &MI)
37450       .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
37451   return BB;
37452 }
37453
37454 /// SetJmp implies future control flow change upon calling the corresponding
37455 /// LongJmp.
37456 /// Instead of using the 'return' instruction, the long jump fixes the stack and
37457 /// performs an indirect branch. To do so it uses the registers that were stored
37458 /// in the jump buffer (when calling SetJmp).
37459 /// In case the shadow stack is enabled we need to fix it as well, because some
37460 /// return addresses will be skipped.
37461 /// The function will save the SSP for future fixing in the function
37462 /// emitLongJmpShadowStackFix.
37463 /// \sa emitLongJmpShadowStackFix
37464 /// \param [in] MI The temporary Machine Instruction for the builtin.
37465 /// \param [in] MBB The Machine Basic Block that will be modified.
37466 void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
37467                                                  MachineBasicBlock *MBB) const {
37468   const DebugLoc &DL = MI.getDebugLoc();
37469   MachineFunction *MF = MBB->getParent();
37470   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37471   MachineRegisterInfo &MRI = MF->getRegInfo();
37472   MachineInstrBuilder MIB;
37473
37474   // Memory Reference.
37475   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37476                                            MI.memoperands_end());
37477
37478   // Initialize a register with zero.
37479   MVT PVT = getPointerTy(MF->getDataLayout());
37480   const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37481   Register ZReg = MRI.createVirtualRegister(PtrRC);
37482   unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37483   BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
37484       .addDef(ZReg)
37485       .addReg(ZReg, RegState::Undef)
37486       .addReg(ZReg, RegState::Undef);
37487
37488   // Read the current SSP Register value to the zeroed register.
37489   Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37490   unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37491   BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37492
37493   // Write the SSP register value to offset 3 in input memory buffer.
37494   unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37495   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
37496   const int64_t SSPOffset = 3 * PVT.getStoreSize();
37497   const unsigned MemOpndSlot = 1;
37498   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37499     if (i == X86::AddrDisp)
37500       MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37501     else
37502       MIB.add(MI.getOperand(MemOpndSlot + i));
37503   }
37504   MIB.addReg(SSPCopyReg);
37505   MIB.setMemRefs(MMOs);
37506 }
37507
/// Expand the SjLj setjmp pseudo instruction.
///
/// Operand 0 of \p MI is the i32 result register; the operands that follow
/// form the memory reference of the jump buffer.  The expansion stores the
/// address of a new restore block into the buffer (one pointer size past its
/// start), optionally saves the shadow stack pointer, emits EH_SjLj_Setup and
/// splits the block so that the result is 0 on the direct path and 1 when
/// control arrives through the restore block.
///
/// \param [in] MI The setjmp pseudo instruction; it is erased.
/// \param [in] MBB The block containing \p MI.
/// \return The sink block that received the instructions following \p MI.
MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const BasicBlock *BB = MBB->getBasicBlock();
  // Insertion point for the new blocks: right after MBB.
  MachineFunction::iterator I = ++MBB->getIterator();

  // Memory Reference
  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
                                           MI.memoperands_end());

  unsigned DstReg;
  unsigned MemOpndSlot = 0;

  unsigned CurOp = 0;

  // Operand 0 is the destination; it must be able to hold an i32.
  DstReg = MI.getOperand(CurOp++).getReg();
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
  // TRI is only referenced by the assert above.
  (void)TRI;
  // One result value per incoming path of the PHI built in sinkMBB.
  Register mainDstReg = MRI.createVirtualRegister(RC);
  Register restoreDstReg = MRI.createVirtualRegister(RC);

  // The jump buffer address operands start right after the destination.
  MemOpndSlot = CurOp;

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");

  // For v = setjmp(buf), we generate
  //
  // thisMBB:
  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
  //  SjLjSetup restoreMBB
  //
  // mainMBB:
  //  v_main = 0
  //
  // sinkMBB:
  //  v = phi(main, restore)
  //
  // restoreMBB:
  //  if base pointer being used, load it from frame
  //  v_restore = 1

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
  // mainMBB and sinkMBB follow thisMBB; restoreMBB goes to the very end of
  // the function and is marked address-taken because its address is stored
  // into the jump buffer below.
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);
  MF->push_back(restoreMBB);
  restoreMBB->setMachineBlockAddressTaken();

  MachineInstrBuilder MIB;

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  unsigned PtrStoreOpc = 0;
  unsigned LabelReg = 0;
  // The resume address lives one pointer size past the start of the buffer.
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  // With the small code model and no PIC, the block address is stored as an
  // immediate; otherwise it is first computed into a register with LEA.
  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
                     !isPositionIndependent();

  // Prepare IP either in reg or imm.
  if (!UseImmLabel) {
    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
    LabelReg = MRI.createVirtualRegister(PtrRC);
    if (Subtarget.is64Bit()) {
      // 64-bit: RIP-relative address of restoreMBB.
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
              .addReg(X86::RIP)
              .addImm(0)
              .addReg(0)
              .addMBB(restoreMBB)
              .addReg(0);
    } else {
      // 32-bit: address of restoreMBB relative to the global base register.
      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
              .addReg(XII->getGlobalBaseReg(MF))
              .addImm(0)
              .addReg(0)
              .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
              .addReg(0);
    }
  } else
    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
  // Store IP
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
  // Reuse MI's buffer address, with the displacement advanced to the label
  // slot.
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
    else
      MIB.add(MI.getOperand(MemOpndSlot + i));
  }
  // The stored value is either the register computed above or the block
  // address itself.
  if (!UseImmLabel)
    MIB.addReg(LabelReg);
  else
    MIB.addMBB(restoreMBB);
  MIB.setMemRefs(MMOs);

  // If the module carries the "cf-protection-return" flag, also save the
  // shadow stack pointer into the buffer.
  if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
    emitSetJmpShadowStackFix(MI, thisMBB);
  }

  // Setup
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
          .addMBB(restoreMBB);

  // The setup instruction carries the "no preserved registers" mask.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  MIB.addRegMask(RegInfo->getNoPreservedMask());
  thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(restoreMBB);

  // mainMBB:
  //  mainDstReg = 0
  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  //  DstReg = phi(mainDstReg from mainMBB, restoreDstReg from restoreMBB)
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(X86::PHI), DstReg)
    .addReg(mainDstReg).addMBB(mainMBB)
    .addReg(restoreDstReg).addMBB(restoreMBB);

  // restoreMBB:
  if (RegInfo->hasBasePointer(*MF)) {
    // When a base pointer is in use, flag that on the function info and
    // reload the base register from the frame at the restore offset.
    const bool Uses64BitFramePtr =
        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
    X86FI->setRestoreBasePointer(MF);
    Register FramePtr = RegInfo->getFrameRegister(*MF);
    Register BasePtr = RegInfo->getBaseRegister();
    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
      .setMIFlag(MachineInstr::FrameSetup);
  }
  //  restoreDstReg = 1, then jump to sinkMBB.
  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
  restoreMBB->addSuccessor(sinkMBB);

  // The pseudo has been fully expanded.
  MI.eraseFromParent();
  return sinkMBB;
}
37662
/// Fix the shadow stack using the previously saved SSP pointer.
///
/// Emits a chain of new blocks after \p MBB that compare the current shadow
/// stack pointer with the value stored in slot 3 of the jump buffer and, when
/// the saved value is larger, advance the shadow stack with incssp until the
/// difference is consumed.  \p MI and everything after it are moved into the
/// returned sink block; \p MI itself is not erased here.
/// \sa emitSetJmpShadowStackFix
/// \param [in] MI The temporary Machine Instruction for the builtin.
/// \param [in] MBB The Machine Basic Block that will be modified.
/// \return The sink MBB that will perform the future indirect branch.
MachineBasicBlock *
X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
                                             MachineBasicBlock *MBB) const {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // Memory Reference
  SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
                                           MI.memoperands_end());

  MVT PVT = getPointerTy(MF->getDataLayout());
  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);

  // checkSspMBB:
  //         xor vreg1, vreg1
  //         rdssp vreg1
  //         test vreg1, vreg1
  //         je sinkMBB   # Jump if Shadow Stack is not supported
  // fallMBB:
  //         mov buf+24/12(%rip), vreg2
  //         sub vreg1, vreg2
  //         jbe sinkMBB  # No need to fix the Shadow Stack
  // fixShadowMBB:
  //         shr 3/2, vreg2
  //         incssp vreg2  # fix the SSP according to the lower 8 bits
  //         shr 8, vreg2
  //         je sinkMBB
  // fixShadowLoopPrepareMBB:
  //         shl vreg2
  //         mov 128, vreg3
  // fixShadowLoopMBB:
  //         incssp vreg3
  //         dec vreg2
  //         jne fixShadowLoopMBB # Iterate until you finish fixing
  //                              # the Shadow Stack
  // sinkMBB:

  // All new blocks are laid out, in order, right after MBB.
  MachineFunction::iterator I = ++MBB->getIterator();
  const BasicBlock *BB = MBB->getBasicBlock();

  MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, checkSspMBB);
  MF->insert(I, fallMBB);
  MF->insert(I, fixShadowMBB);
  MF->insert(I, fixShadowLoopPrepareMBB);
  MF->insert(I, fixShadowLoopMBB);
  MF->insert(I, sinkMBB);

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  // Note that the range starts at MI itself, so MI moves to sinkMBB too.
  sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
                  MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  MBB->addSuccessor(checkSspMBB);

  // Initialize a register with zero.
  Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
  BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);

  // For 64-bit pointers, widen the 32-bit zero to a pointer-sized register.
  if (PVT == MVT::i64) {
    Register TmpZReg = MRI.createVirtualRegister(PtrRC);
    BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
      .addImm(0)
      .addReg(ZReg)
      .addImm(X86::sub_32bit);
    ZReg = TmpZReg;
  }

  // Read the current SSP Register value to the zeroed register.
  Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
  unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
  BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);

  // Check whether the result of the SSP register is zero and jump directly
  // to the sink.
  unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
  BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
      .addReg(SSPCopyReg)
      .addReg(SSPCopyReg);
  BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
  checkSspMBB->addSuccessor(sinkMBB);
  checkSspMBB->addSuccessor(fallMBB);

  // Reload the previously saved SSP register value.
  Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
  // The saved SSP sits three pointer sizes past the start of the buffer.
  const int64_t SPPOffset = 3 * PVT.getStoreSize();
  MachineInstrBuilder MIB =
      BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
  // The buffer address is taken from MI's operands, starting at operand 0.
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if (i == X86::AddrDisp)
      MIB.addDisp(MO, SPPOffset);
    else if (MO.isReg()) // Don't add the whole operand, we don't want to
                         // preserve kill flags.
      MIB.addReg(MO.getReg());
    else
      MIB.add(MO);
  }
  MIB.setMemRefs(MMOs);

  // Subtract the current SSP from the previous SSP.
  Register SspSubReg = MRI.createVirtualRegister(PtrRC);
  unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
  BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
      .addReg(PrevSSPReg)
      .addReg(SSPCopyReg);

  // Jump to sink in case PrevSSPReg <= SSPCopyReg.
  BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
  fallMBB->addSuccessor(sinkMBB);
  fallMBB->addSuccessor(fixShadowMBB);

  // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
  unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
  unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
  Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
  BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
      .addReg(SspSubReg)
      .addImm(Offset);

  // Increase SSP when looking only on the lower 8 bits of the delta.
  unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
  BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);

  // Drop the lower 8 bits, which the incssp above has already consumed; what
  // remains is the delta in units of 256 slots.
  Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
  BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
      .addReg(SspFirstShrReg)
      .addImm(8);

  // Jump if the result of the shift is zero.
  BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
  fixShadowMBB->addSuccessor(sinkMBB);
  fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);

  // Do a single shift left: the loop below advances by 128 per iteration, so
  // it needs twice as many iterations as there are units of 256.
  unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
  Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
  BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
      .addReg(SspSecondShrReg)
      .addImm(1);

  // Save the value 128 to a register (will be used next with incssp).
  Register Value128InReg = MRI.createVirtualRegister(PtrRC);
  unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
  BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
      .addImm(128);
  fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);

  // Since incssp only looks at the lower 8 bits, we might need to do several
  // iterations of incssp until we finish fixing the shadow stack.
  // The loop counter is a PHI of the initial count (from the prepare block)
  // and the decremented count (from the loop's own back edge).
  Register DecReg = MRI.createVirtualRegister(PtrRC);
  Register CounterReg = MRI.createVirtualRegister(PtrRC);
  BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
      .addReg(SspAfterShlReg)
      .addMBB(fixShadowLoopPrepareMBB)
      .addReg(DecReg)
      .addMBB(fixShadowLoopMBB);

  // Every iteration we increase the SSP by 128.
  BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);

  // Every iteration we decrement the counter by 1.
  unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
  BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);

  // Jump if the counter is not zero yet.
  BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
  fixShadowLoopMBB->addSuccessor(sinkMBB);
  fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);

  return sinkMBB;
}
37849
37850 MachineBasicBlock *
37851 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37852                                      MachineBasicBlock *MBB) const {
37853   const DebugLoc &DL = MI.getDebugLoc();
37854   MachineFunction *MF = MBB->getParent();
37855   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37856   MachineRegisterInfo &MRI = MF->getRegInfo();
37857
37858   // Memory Reference
37859   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37860                                            MI.memoperands_end());
37861
37862   MVT PVT = getPointerTy(MF->getDataLayout());
37863   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37864          "Invalid Pointer Size!");
37865
37866   const TargetRegisterClass *RC =
37867     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37868   Register Tmp = MRI.createVirtualRegister(RC);
37869   // Since FP is only updated here but NOT referenced, it's treated as GPR.
37870   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37871   Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37872   Register SP = RegInfo->getStackRegister();
37873
37874   MachineInstrBuilder MIB;
37875
37876   const int64_t LabelOffset = 1 * PVT.getStoreSize();
37877   const int64_t SPOffset = 2 * PVT.getStoreSize();
37878
37879   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37880   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37881
37882   MachineBasicBlock *thisMBB = MBB;
37883
37884   // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
37885   if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37886     thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37887   }
37888
37889   // Reload FP
37890   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
37891   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37892     const MachineOperand &MO = MI.getOperand(i);
37893     if (MO.isReg()) // Don't add the whole operand, we don't want to
37894                     // preserve kill flags.
37895       MIB.addReg(MO.getReg());
37896     else
37897       MIB.add(MO);
37898   }
37899   MIB.setMemRefs(MMOs);
37900
37901   // Reload IP
37902   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
37903   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37904     const MachineOperand &MO = MI.getOperand(i);
37905     if (i == X86::AddrDisp)
37906       MIB.addDisp(MO, LabelOffset);
37907     else if (MO.isReg()) // Don't add the whole operand, we don't want to
37908                          // preserve kill flags.
37909       MIB.addReg(MO.getReg());
37910     else
37911       MIB.add(MO);
37912   }
37913   MIB.setMemRefs(MMOs);
37914
37915   // Reload SP
37916   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
37917   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37918     if (i == X86::AddrDisp)
37919       MIB.addDisp(MI.getOperand(i), SPOffset);
37920     else
37921       MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37922                                  // the last instruction of the expansion.
37923   }
37924   MIB.setMemRefs(MMOs);
37925
37926   // Jump
37927   BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
37928
37929   MI.eraseFromParent();
37930   return thisMBB;
37931 }
37932
37933 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37934                                                MachineBasicBlock *MBB,
37935                                                MachineBasicBlock *DispatchBB,
37936                                                int FI) const {
37937   const DebugLoc &DL = MI.getDebugLoc();
37938   MachineFunction *MF = MBB->getParent();
37939   MachineRegisterInfo *MRI = &MF->getRegInfo();
37940   const X86InstrInfo *TII = Subtarget.getInstrInfo();
37941
37942   MVT PVT = getPointerTy(MF->getDataLayout());
37943   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37944
37945   unsigned Op = 0;
37946   unsigned VR = 0;
37947
37948   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37949                      !isPositionIndependent();
37950
37951   if (UseImmLabel) {
37952     Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37953   } else {
37954     const TargetRegisterClass *TRC =
37955         (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37956     VR = MRI->createVirtualRegister(TRC);
37957     Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37958
37959     if (Subtarget.is64Bit())
37960       BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
37961           .addReg(X86::RIP)
37962           .addImm(1)
37963           .addReg(0)
37964           .addMBB(DispatchBB)
37965           .addReg(0);
37966     else
37967       BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
37968           .addReg(0) /* TII->getGlobalBaseReg(MF) */
37969           .addImm(1)
37970           .addReg(0)
37971           .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37972           .addReg(0);
37973   }
37974
37975   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
37976   addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37977   if (UseImmLabel)
37978     MIB.addMBB(DispatchBB);
37979   else
37980     MIB.addReg(VR);
37981 }
37982
37983 MachineBasicBlock *
37984 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37985                                          MachineBasicBlock *BB) const {
37986   const DebugLoc &DL = MI.getDebugLoc();
37987   MachineFunction *MF = BB->getParent();
37988   MachineRegisterInfo *MRI = &MF->getRegInfo();
37989   const X86InstrInfo *TII = Subtarget.getInstrInfo();
37990   int FI = MF->getFrameInfo().getFunctionContextIndex();
37991
37992   // Get a mapping of the call site numbers to all of the landing pads they're
37993   // associated with.
37994   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37995   unsigned MaxCSNum = 0;
37996   for (auto &MBB : *MF) {
37997     if (!MBB.isEHPad())
37998       continue;
37999
38000     MCSymbol *Sym = nullptr;
38001     for (const auto &MI : MBB) {
38002       if (MI.isDebugInstr())
38003         continue;
38004
38005       assert(MI.isEHLabel() && "expected EH_LABEL");
38006       Sym = MI.getOperand(0).getMCSymbol();
38007       break;
38008     }
38009
38010     if (!MF->hasCallSiteLandingPad(Sym))
38011       continue;
38012
38013     for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
38014       CallSiteNumToLPad[CSI].push_back(&MBB);
38015       MaxCSNum = std::max(MaxCSNum, CSI);
38016     }
38017   }
38018
38019   // Get an ordered list of the machine basic blocks for the jump table.
38020   std::vector<MachineBasicBlock *> LPadList;
38021   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
38022   LPadList.reserve(CallSiteNumToLPad.size());
38023
38024   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
38025     for (auto &LP : CallSiteNumToLPad[CSI]) {
38026       LPadList.push_back(LP);
38027       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
38028     }
38029   }
38030
38031   assert(!LPadList.empty() &&
38032          "No landing pad destinations for the dispatch jump table!");
38033
38034   // Create the MBBs for the dispatch code.
38035
38036   // Shove the dispatch's address into the return slot in the function context.
38037   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
38038   DispatchBB->setIsEHPad(true);
38039
38040   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
38041   BuildMI(TrapBB, DL, TII->get(X86::TRAP));
38042   DispatchBB->addSuccessor(TrapBB);
38043
38044   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
38045   DispatchBB->addSuccessor(DispContBB);
38046
38047   // Insert MBBs.
38048   MF->push_back(DispatchBB);
38049   MF->push_back(DispContBB);
38050   MF->push_back(TrapBB);
38051
38052   // Insert code into the entry block that creates and registers the function
38053   // context.
38054   SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
38055
38056   // Create the jump table and associated information
38057   unsigned JTE = getJumpTableEncoding();
38058   MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
38059   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
38060
38061   const X86RegisterInfo &RI = TII->getRegisterInfo();
38062   // Add a register mask with no preserved registers.  This results in all
38063   // registers being marked as clobbered.
38064   if (RI.hasBasePointer(*MF)) {
38065     const bool FPIs64Bit =
38066         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
38067     X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
38068     MFI->setRestoreBasePointer(MF);
38069
38070     Register FP = RI.getFrameRegister(*MF);
38071     Register BP = RI.getBaseRegister();
38072     unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
38073     addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
38074                  MFI->getRestoreBasePointerOffset())
38075         .addRegMask(RI.getNoPreservedMask());
38076   } else {
38077     BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
38078         .addRegMask(RI.getNoPreservedMask());
38079   }
38080
38081   // IReg is used as an index in a memory operand and therefore can't be SP
38082   Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
38083   addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
38084                     Subtarget.is64Bit() ? 8 : 4);
38085   BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
38086       .addReg(IReg)
38087       .addImm(LPadList.size());
38088   BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
38089
38090   if (Subtarget.is64Bit()) {
38091     Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
38092     Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
38093
38094     // leaq .LJTI0_0(%rip), BReg
38095     BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
38096         .addReg(X86::RIP)
38097         .addImm(1)
38098         .addReg(0)
38099         .addJumpTableIndex(MJTI)
38100         .addReg(0);
38101     // movzx IReg64, IReg
38102     BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
38103         .addImm(0)
38104         .addReg(IReg)
38105         .addImm(X86::sub_32bit);
38106
38107     switch (JTE) {
38108     case MachineJumpTableInfo::EK_BlockAddress:
38109       // jmpq *(BReg,IReg64,8)
38110       BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
38111           .addReg(BReg)
38112           .addImm(8)
38113           .addReg(IReg64)
38114           .addImm(0)
38115           .addReg(0);
38116       break;
38117     case MachineJumpTableInfo::EK_LabelDifference32: {
38118       Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
38119       Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
38120       Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
38121
38122       // movl (BReg,IReg64,4), OReg
38123       BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
38124           .addReg(BReg)
38125           .addImm(4)
38126           .addReg(IReg64)
38127           .addImm(0)
38128           .addReg(0);
38129       // movsx OReg64, OReg
38130       BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
38131       // addq BReg, OReg64, TReg
38132       BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
38133           .addReg(OReg64)
38134           .addReg(BReg);
38135       // jmpq *TReg
38136       BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
38137       break;
38138     }
38139     default:
38140       llvm_unreachable("Unexpected jump table encoding");
38141     }
38142   } else {
38143     // jmpl *.LJTI0_0(,IReg,4)
38144     BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
38145         .addReg(0)
38146         .addImm(4)
38147         .addReg(IReg)
38148         .addJumpTableIndex(MJTI)
38149         .addReg(0);
38150   }
38151
38152   // Add the jump table entries as successors to the MBB.
38153   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
38154   for (auto &LP : LPadList)
38155     if (SeenMBBs.insert(LP).second)
38156       DispContBB->addSuccessor(LP);
38157
38158   // N.B. the order the invoke BBs are processed in doesn't matter here.
38159   SmallVector<MachineBasicBlock *, 64> MBBLPads;
38160   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
38161   for (MachineBasicBlock *MBB : InvokeBBs) {
38162     // Remove the landing pad successor from the invoke block and replace it
38163     // with the new dispatch block.
38164     // Keep a copy of Successors since it's modified inside the loop.
38165     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
38166                                                    MBB->succ_rend());
38167     // FIXME: Avoid quadratic complexity.
38168     for (auto *MBBS : Successors) {
38169       if (MBBS->isEHPad()) {
38170         MBB->removeSuccessor(MBBS);
38171         MBBLPads.push_back(MBBS);
38172       }
38173     }
38174
38175     MBB->addSuccessor(DispatchBB);
38176
38177     // Find the invoke call and mark all of the callee-saved registers as
38178     // 'implicit defined' so that they're spilled.  This prevents code from
38179     // moving instructions to before the EH block, where they will never be
38180     // executed.
38181     for (auto &II : reverse(*MBB)) {
38182       if (!II.isCall())
38183         continue;
38184
38185       DenseMap<unsigned, bool> DefRegs;
38186       for (auto &MOp : II.operands())
38187         if (MOp.isReg())
38188           DefRegs[MOp.getReg()] = true;
38189
38190       MachineInstrBuilder MIB(*MF, &II);
38191       for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
38192         unsigned Reg = SavedRegs[RegIdx];
38193         if (!DefRegs[Reg])
38194           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
38195       }
38196
38197       break;
38198     }
38199   }
38200
38201   // Mark all former landing pads as non-landing pads.  The dispatch is the only
38202   // landing pad now.
38203   for (auto &LP : MBBLPads)
38204     LP->setIsEHPad(false);
38205
38206   // The instruction is gone now.
38207   MI.eraseFromParent();
38208   return BB;
38209 }
38210
38211 MachineBasicBlock *
38212 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
38213                                                MachineBasicBlock *BB) const {
38214   MachineFunction *MF = BB->getParent();
38215   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
38216   const DebugLoc &DL = MI.getDebugLoc();
38217
38218   auto TMMImmToTMMReg = [](unsigned Imm) {
38219     assert (Imm < 8 && "Illegal tmm index");
38220     return X86::TMM0 + Imm;
38221   };
38222   switch (MI.getOpcode()) {
38223   default: llvm_unreachable("Unexpected instr type to insert");
38224   case X86::TLS_addr32:
38225   case X86::TLS_addr64:
38226   case X86::TLS_addrX32:
38227   case X86::TLS_base_addr32:
38228   case X86::TLS_base_addr64:
38229   case X86::TLS_base_addrX32:
38230     return EmitLoweredTLSAddr(MI, BB);
38231   case X86::INDIRECT_THUNK_CALL32:
38232   case X86::INDIRECT_THUNK_CALL64:
38233   case X86::INDIRECT_THUNK_TCRETURN32:
38234   case X86::INDIRECT_THUNK_TCRETURN64:
38235     return EmitLoweredIndirectThunk(MI, BB);
38236   case X86::CATCHRET:
38237     return EmitLoweredCatchRet(MI, BB);
38238   case X86::SEG_ALLOCA_32:
38239   case X86::SEG_ALLOCA_64:
38240     return EmitLoweredSegAlloca(MI, BB);
38241   case X86::PROBED_ALLOCA_32:
38242   case X86::PROBED_ALLOCA_64:
38243     return EmitLoweredProbedAlloca(MI, BB);
38244   case X86::TLSCall_32:
38245   case X86::TLSCall_64:
38246     return EmitLoweredTLSCall(MI, BB);
38247   case X86::CMOV_FR16:
38248   case X86::CMOV_FR16X:
38249   case X86::CMOV_FR32:
38250   case X86::CMOV_FR32X:
38251   case X86::CMOV_FR64:
38252   case X86::CMOV_FR64X:
38253   case X86::CMOV_GR8:
38254   case X86::CMOV_GR16:
38255   case X86::CMOV_GR32:
38256   case X86::CMOV_RFP32:
38257   case X86::CMOV_RFP64:
38258   case X86::CMOV_RFP80:
38259   case X86::CMOV_VR64:
38260   case X86::CMOV_VR128:
38261   case X86::CMOV_VR128X:
38262   case X86::CMOV_VR256:
38263   case X86::CMOV_VR256X:
38264   case X86::CMOV_VR512:
38265   case X86::CMOV_VK1:
38266   case X86::CMOV_VK2:
38267   case X86::CMOV_VK4:
38268   case X86::CMOV_VK8:
38269   case X86::CMOV_VK16:
38270   case X86::CMOV_VK32:
38271   case X86::CMOV_VK64:
38272     return EmitLoweredSelect(MI, BB);
38273
38274   case X86::FP80_ADDr:
38275   case X86::FP80_ADDm32: {
38276     // Change the floating point control register to use double extended
38277     // precision when performing the addition.
38278     int OrigCWFrameIdx =
38279         MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38280     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)),
38281                       OrigCWFrameIdx);
38282
38283     // Load the old value of the control word...
38284     Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38285     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
38286                       OrigCWFrameIdx);
38287
38288     // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
38289     // precision.
38290     Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38291     BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
38292         .addReg(OldCW, RegState::Kill)
38293         .addImm(0x300);
38294
38295     // Extract to 16 bits.
38296     Register NewCW16 =
38297         MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
38298     BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
38299         .addReg(NewCW, RegState::Kill, X86::sub_16bit);
38300
38301     // Prepare memory for FLDCW.
38302     int NewCWFrameIdx =
38303         MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38304     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
38305                       NewCWFrameIdx)
38306         .addReg(NewCW16, RegState::Kill);
38307
38308     // Reload the modified control word now...
38309     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
38310                       NewCWFrameIdx);
38311
38312     // Do the addition.
38313     if (MI.getOpcode() == X86::FP80_ADDr) {
38314       BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80))
38315           .add(MI.getOperand(0))
38316           .add(MI.getOperand(1))
38317           .add(MI.getOperand(2));
38318     } else {
38319       BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32))
38320           .add(MI.getOperand(0))
38321           .add(MI.getOperand(1))
38322           .add(MI.getOperand(2))
38323           .add(MI.getOperand(3))
38324           .add(MI.getOperand(4))
38325           .add(MI.getOperand(5))
38326           .add(MI.getOperand(6));
38327     }
38328
38329     // Reload the original control word now.
38330     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
38331                       OrigCWFrameIdx);
38332
38333     MI.eraseFromParent(); // The pseudo instruction is gone now.
38334     return BB;
38335   }
38336
38337   case X86::FP32_TO_INT16_IN_MEM:
38338   case X86::FP32_TO_INT32_IN_MEM:
38339   case X86::FP32_TO_INT64_IN_MEM:
38340   case X86::FP64_TO_INT16_IN_MEM:
38341   case X86::FP64_TO_INT32_IN_MEM:
38342   case X86::FP64_TO_INT64_IN_MEM:
38343   case X86::FP80_TO_INT16_IN_MEM:
38344   case X86::FP80_TO_INT32_IN_MEM:
38345   case X86::FP80_TO_INT64_IN_MEM: {
38346     // Change the floating point control register to use "round towards zero"
38347     // mode when truncating to an integer value.
38348     int OrigCWFrameIdx =
38349         MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38350     addFrameReference(BuildMI(*BB, MI, DL,
38351                               TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
38352
38353     // Load the old value of the control word...
38354     Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38355     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
38356                       OrigCWFrameIdx);
38357
38358     // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
38359     Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38360     BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
38361       .addReg(OldCW, RegState::Kill).addImm(0xC00);
38362
38363     // Extract to 16 bits.
38364     Register NewCW16 =
38365         MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
38366     BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
38367       .addReg(NewCW, RegState::Kill, X86::sub_16bit);
38368
38369     // Prepare memory for FLDCW.
38370     int NewCWFrameIdx =
38371         MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38372     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
38373                       NewCWFrameIdx)
38374       .addReg(NewCW16, RegState::Kill);
38375
38376     // Reload the modified control word now...
38377     addFrameReference(BuildMI(*BB, MI, DL,
38378                               TII->get(X86::FLDCW16m)), NewCWFrameIdx);
38379
38380     // Get the X86 opcode to use.
38381     unsigned Opc;
38382     switch (MI.getOpcode()) {
38383     default: llvm_unreachable("illegal opcode!");
38384     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
38385     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
38386     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
38387     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
38388     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
38389     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
38390     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
38391     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
38392     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
38393     }
38394
38395     X86AddressMode AM = getAddressFromInstr(&MI, 0);
38396     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
38397         .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
38398
38399     // Reload the original control word now.
38400     addFrameReference(BuildMI(*BB, MI, DL,
38401                               TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
38402
38403     MI.eraseFromParent(); // The pseudo instruction is gone now.
38404     return BB;
38405   }
38406
38407   // xbegin
38408   case X86::XBEGIN:
38409     return emitXBegin(MI, BB, Subtarget.getInstrInfo());
38410
38411   case X86::VAARG_64:
38412   case X86::VAARG_X32:
38413     return EmitVAARGWithCustomInserter(MI, BB);
38414
38415   case X86::EH_SjLj_SetJmp32:
38416   case X86::EH_SjLj_SetJmp64:
38417     return emitEHSjLjSetJmp(MI, BB);
38418
38419   case X86::EH_SjLj_LongJmp32:
38420   case X86::EH_SjLj_LongJmp64:
38421     return emitEHSjLjLongJmp(MI, BB);
38422
38423   case X86::Int_eh_sjlj_setup_dispatch:
38424     return EmitSjLjDispatchBlock(MI, BB);
38425
38426   case TargetOpcode::STATEPOINT:
38427     // As an implementation detail, STATEPOINT shares the STACKMAP format at
38428     // this point in the process.  We diverge later.
38429     return emitPatchPoint(MI, BB);
38430
38431   case TargetOpcode::STACKMAP:
38432   case TargetOpcode::PATCHPOINT:
38433     return emitPatchPoint(MI, BB);
38434
38435   case TargetOpcode::PATCHABLE_EVENT_CALL:
38436   case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
38437     return BB;
38438
38439   case X86::LCMPXCHG8B: {
38440     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38441     // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
38442     // requires a memory operand. If it happens that current architecture is
38443     // i686 and for current function we need a base pointer
38444     // - which is ESI for i686 - register allocator would not be able to
38445     // allocate registers for an address in form of X(%reg, %reg, Y)
38446     // - there never would be enough unreserved registers during regalloc
38447     // (without the need for base ptr the only option would be X(%edi, %esi, Y).
38448     // We are giving a hand to register allocator by precomputing the address in
38449     // a new vreg using LEA.
38450
38451     // If it is not i686 or there is no base pointer - nothing to do here.
38452     if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38453       return BB;
38454
38455     // Even though this code does not necessarily needs the base pointer to
38456     // be ESI, we check for that. The reason: if this assert fails, there are
38457     // some changes happened in the compiler base pointer handling, which most
38458     // probably have to be addressed somehow here.
38459     assert(TRI->getBaseRegister() == X86::ESI &&
38460            "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38461            "base pointer in mind");
38462
38463     MachineRegisterInfo &MRI = MF->getRegInfo();
38464     MVT SPTy = getPointerTy(MF->getDataLayout());
38465     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38466     Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38467
38468     X86AddressMode AM = getAddressFromInstr(&MI, 0);
38469     // Regalloc does not need any help when the memory operand of CMPXCHG8B
38470     // does not use index register.
38471     if (AM.IndexReg == X86::NoRegister)
38472       return BB;
38473
38474     // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38475     // four operand definitions that are E[ABCD] registers. We skip them and
38476     // then insert the LEA.
38477     MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38478     while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
38479                                    RMBBI->definesRegister(X86::EBX) ||
38480                                    RMBBI->definesRegister(X86::ECX) ||
38481                                    RMBBI->definesRegister(X86::EDX))) {
38482       ++RMBBI;
38483     }
38484     MachineBasicBlock::iterator MBBI(RMBBI);
38485     addFullAddress(
38486         BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
38487
38488     setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38489
38490     return BB;
38491   }
38492   case X86::LCMPXCHG16B_NO_RBX: {
38493     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38494     Register BasePtr = TRI->getBaseRegister();
38495     if (TRI->hasBasePointer(*MF) &&
38496         (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38497       if (!BB->isLiveIn(BasePtr))
38498         BB->addLiveIn(BasePtr);
38499       // Save RBX into a virtual register.
38500       Register SaveRBX =
38501           MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38502       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38503           .addReg(X86::RBX);
38504       Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38505       MachineInstrBuilder MIB =
38506           BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38507       for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38508         MIB.add(MI.getOperand(Idx));
38509       MIB.add(MI.getOperand(X86::AddrNumOperands));
38510       MIB.addReg(SaveRBX);
38511     } else {
38512       // Simple case, just copy the virtual register to RBX.
38513       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
38514           .add(MI.getOperand(X86::AddrNumOperands));
38515       MachineInstrBuilder MIB =
38516           BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
38517       for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38518         MIB.add(MI.getOperand(Idx));
38519     }
38520     MI.eraseFromParent();
38521     return BB;
38522   }
38523   case X86::MWAITX: {
38524     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38525     Register BasePtr = TRI->getBaseRegister();
38526     bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38527     // If no need to save the base pointer, we generate MWAITXrrr,
38528     // else we generate pseudo MWAITX_SAVE_RBX.
38529     if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38530       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38531           .addReg(MI.getOperand(0).getReg());
38532       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38533           .addReg(MI.getOperand(1).getReg());
38534       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
38535           .addReg(MI.getOperand(2).getReg());
38536       BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
38537       MI.eraseFromParent();
38538     } else {
38539       if (!BB->isLiveIn(BasePtr)) {
38540         BB->addLiveIn(BasePtr);
38541       }
38542       // Parameters can be copied into ECX and EAX but not EBX yet.
38543       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38544           .addReg(MI.getOperand(0).getReg());
38545       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38546           .addReg(MI.getOperand(1).getReg());
38547       assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38548       // Save RBX into a virtual register.
38549       Register SaveRBX =
38550           MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38551       BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38552           .addReg(X86::RBX);
38553       // Generate mwaitx pseudo.
38554       Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38555       BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
38556           .addDef(Dst) // Destination tied in with SaveRBX.
38557           .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38558           .addUse(SaveRBX);                  // Save of base pointer.
38559       MI.eraseFromParent();
38560     }
38561     return BB;
38562   }
38563   case TargetOpcode::PREALLOCATED_SETUP: {
38564     assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38565     auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38566     MFI->setHasPreallocatedCall(true);
38567     int64_t PreallocatedId = MI.getOperand(0).getImm();
38568     size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38569     assert(StackAdjustment != 0 && "0 stack adjustment");
38570     LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38571                       << StackAdjustment << "\n");
38572     BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
38573         .addReg(X86::ESP)
38574         .addImm(StackAdjustment);
38575     MI.eraseFromParent();
38576     return BB;
38577   }
38578   case TargetOpcode::PREALLOCATED_ARG: {
38579     assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38580     int64_t PreallocatedId = MI.getOperand(1).getImm();
38581     int64_t ArgIdx = MI.getOperand(2).getImm();
38582     auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38583     size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38584     LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38585                       << ", arg offset " << ArgOffset << "\n");
38586     // stack pointer + offset
38587     addRegOffset(
38588         BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
38589         X86::ESP, false, ArgOffset);
38590     MI.eraseFromParent();
38591     return BB;
38592   }
38593   case X86::PTDPBSSD:
38594   case X86::PTDPBSUD:
38595   case X86::PTDPBUSD:
38596   case X86::PTDPBUUD:
38597   case X86::PTDPBF16PS:
38598   case X86::PTDPFP16PS: {
38599     unsigned Opc;
38600     switch (MI.getOpcode()) {
38601     default: llvm_unreachable("illegal opcode!");
38602     case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38603     case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38604     case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38605     case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38606     case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38607     case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38608     }
38609
38610     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38611     MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38612     MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38613     MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38614     MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38615
38616     MI.eraseFromParent(); // The pseudo is gone now.
38617     return BB;
38618   }
38619   case X86::PTILEZERO: {
38620     unsigned Imm = MI.getOperand(0).getImm();
38621     BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38622     MI.eraseFromParent(); // The pseudo is gone now.
38623     return BB;
38624   }
38625   case X86::PTILELOADD:
38626   case X86::PTILELOADDT1:
38627   case X86::PTILESTORED: {
38628     unsigned Opc;
38629     switch (MI.getOpcode()) {
38630     default: llvm_unreachable("illegal opcode!");
38631     case X86::PTILELOADD:   Opc = X86::TILELOADD;   break;
38632     case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
38633     case X86::PTILESTORED:  Opc = X86::TILESTORED;  break;
38634     }
38635
38636     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38637     unsigned CurOp = 0;
38638     if (Opc != X86::TILESTORED)
38639       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38640                  RegState::Define);
38641
38642     MIB.add(MI.getOperand(CurOp++)); // base
38643     MIB.add(MI.getOperand(CurOp++)); // scale
38644     MIB.add(MI.getOperand(CurOp++)); // index -- stride
38645     MIB.add(MI.getOperand(CurOp++)); // displacement
38646     MIB.add(MI.getOperand(CurOp++)); // segment
38647
38648     if (Opc == X86::TILESTORED)
38649       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38650                  RegState::Undef);
38651
38652     MI.eraseFromParent(); // The pseudo is gone now.
38653     return BB;
38654   }
38655   case X86::PTCMMIMFP16PS:
38656   case X86::PTCMMRLFP16PS: {
38657     const DebugLoc &DL = MI.getDebugLoc();
38658     unsigned Opc;
38659     switch (MI.getOpcode()) {
38660     default: llvm_unreachable("Unexpected instruction!");
38661     case X86::PTCMMIMFP16PS:     Opc = X86::TCMMIMFP16PS;     break;
38662     case X86::PTCMMRLFP16PS:     Opc = X86::TCMMRLFP16PS;     break;
38663     }
38664     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38665     MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38666     MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38667     MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38668     MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38669     MI.eraseFromParent(); // The pseudo is gone now.
38670     return BB;
38671   }
38672   }
38673 }
38674
38675 //===----------------------------------------------------------------------===//
38676 //                           X86 Optimization Hooks
38677 //===----------------------------------------------------------------------===//
38678
38679 bool
38680 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38681                                                 const APInt &DemandedBits,
38682                                                 const APInt &DemandedElts,
38683                                                 TargetLoweringOpt &TLO) const {
38684   EVT VT = Op.getValueType();
38685   unsigned Opcode = Op.getOpcode();
38686   unsigned EltSize = VT.getScalarSizeInBits();
38687
38688   if (VT.isVector()) {
38689     // If the constant is only all signbits in the active bits, then we should
38690     // extend it to the entire constant to allow it act as a boolean constant
38691     // vector.
38692     auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38693       if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38694         return false;
38695       for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38696         if (!DemandedElts[i] || V.getOperand(i).isUndef())
38697           continue;
38698         const APInt &Val = V.getConstantOperandAPInt(i);
38699         if (Val.getBitWidth() > Val.getNumSignBits() &&
38700             Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38701           return true;
38702       }
38703       return false;
38704     };
38705     // For vectors - if we have a constant, then try to sign extend.
38706     // TODO: Handle AND cases.
38707     unsigned ActiveBits = DemandedBits.getActiveBits();
38708     if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38709         (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38710         NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38711       EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38712       EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38713                                    VT.getVectorNumElements());
38714       SDValue NewC =
38715           TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38716                           Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38717       SDValue NewOp =
38718           TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38719       return TLO.CombineTo(Op, NewOp);
38720     }
38721     return false;
38722   }
38723
38724   // Only optimize Ands to prevent shrinking a constant that could be
38725   // matched by movzx.
38726   if (Opcode != ISD::AND)
38727     return false;
38728
38729   // Make sure the RHS really is a constant.
38730   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38731   if (!C)
38732     return false;
38733
38734   const APInt &Mask = C->getAPIntValue();
38735
38736   // Clear all non-demanded bits initially.
38737   APInt ShrunkMask = Mask & DemandedBits;
38738
38739   // Find the width of the shrunk mask.
38740   unsigned Width = ShrunkMask.getActiveBits();
38741
38742   // If the mask is all 0s there's nothing to do here.
38743   if (Width == 0)
38744     return false;
38745
38746   // Find the next power of 2 width, rounding up to a byte.
38747   Width = llvm::bit_ceil(std::max(Width, 8U));
38748   // Truncate the width to size to handle illegal types.
38749   Width = std::min(Width, EltSize);
38750
38751   // Calculate a possible zero extend mask for this constant.
38752   APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
38753
38754   // If we aren't changing the mask, just return true to keep it and prevent
38755   // the caller from optimizing.
38756   if (ZeroExtendMask == Mask)
38757     return true;
38758
38759   // Make sure the new mask can be represented by a combination of mask bits
38760   // and non-demanded bits.
38761   if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38762     return false;
38763
38764   // Replace the constant with the zero extend mask.
38765   SDLoc DL(Op);
38766   SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38767   SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38768   return TLO.CombineTo(Op, NewOp);
38769 }
38770
38771 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38772                                                       KnownBits &Known,
38773                                                       const APInt &DemandedElts,
38774                                                       const SelectionDAG &DAG,
38775                                                       unsigned Depth) const {
38776   unsigned BitWidth = Known.getBitWidth();
38777   unsigned NumElts = DemandedElts.getBitWidth();
38778   unsigned Opc = Op.getOpcode();
38779   EVT VT = Op.getValueType();
38780   assert((Opc >= ISD::BUILTIN_OP_END ||
38781           Opc == ISD::INTRINSIC_WO_CHAIN ||
38782           Opc == ISD::INTRINSIC_W_CHAIN ||
38783           Opc == ISD::INTRINSIC_VOID) &&
38784          "Should use MaskedValueIsZero if you don't know whether Op"
38785          " is a target node!");
38786
38787   Known.resetAll();
38788   switch (Opc) {
38789   default: break;
38790   case X86ISD::MUL_IMM: {
38791     KnownBits Known2;
38792     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38793     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38794     Known = KnownBits::mul(Known, Known2);
38795     break;
38796   }
38797   case X86ISD::SETCC:
38798     Known.Zero.setBitsFrom(1);
38799     break;
38800   case X86ISD::MOVMSK: {
38801     unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38802     Known.Zero.setBitsFrom(NumLoBits);
38803     break;
38804   }
38805   case X86ISD::PEXTRB:
38806   case X86ISD::PEXTRW: {
38807     SDValue Src = Op.getOperand(0);
38808     EVT SrcVT = Src.getValueType();
38809     APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38810                                             Op.getConstantOperandVal(1));
38811     Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38812     Known = Known.anyextOrTrunc(BitWidth);
38813     Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38814     break;
38815   }
38816   case X86ISD::VSRAI:
38817   case X86ISD::VSHLI:
38818   case X86ISD::VSRLI: {
38819     unsigned ShAmt = Op.getConstantOperandVal(1);
38820     if (ShAmt >= VT.getScalarSizeInBits()) {
38821       // Out of range logical bit shifts are guaranteed to be zero.
38822       // Out of range arithmetic bit shifts splat the sign bit.
38823       if (Opc != X86ISD::VSRAI) {
38824         Known.setAllZero();
38825         break;
38826       }
38827
38828       ShAmt = VT.getScalarSizeInBits() - 1;
38829     }
38830
38831     Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38832     if (Opc == X86ISD::VSHLI) {
38833       Known.Zero <<= ShAmt;
38834       Known.One <<= ShAmt;
38835       // Low bits are known zero.
38836       Known.Zero.setLowBits(ShAmt);
38837     } else if (Opc == X86ISD::VSRLI) {
38838       Known.Zero.lshrInPlace(ShAmt);
38839       Known.One.lshrInPlace(ShAmt);
38840       // High bits are known zero.
38841       Known.Zero.setHighBits(ShAmt);
38842     } else {
38843       Known.Zero.ashrInPlace(ShAmt);
38844       Known.One.ashrInPlace(ShAmt);
38845     }
38846     break;
38847   }
38848   case X86ISD::PACKUS: {
38849     // PACKUS is just a truncation if the upper half is zero.
38850     APInt DemandedLHS, DemandedRHS;
38851     getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38852
38853     Known.One = APInt::getAllOnes(BitWidth * 2);
38854     Known.Zero = APInt::getAllOnes(BitWidth * 2);
38855
38856     KnownBits Known2;
38857     if (!!DemandedLHS) {
38858       Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38859       Known = Known.intersectWith(Known2);
38860     }
38861     if (!!DemandedRHS) {
38862       Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38863       Known = Known.intersectWith(Known2);
38864     }
38865
38866     if (Known.countMinLeadingZeros() < BitWidth)
38867       Known.resetAll();
38868     Known = Known.trunc(BitWidth);
38869     break;
38870   }
38871   case X86ISD::VBROADCAST: {
38872     SDValue Src = Op.getOperand(0);
38873     if (!Src.getSimpleValueType().isVector()) {
38874       Known = DAG.computeKnownBits(Src, Depth + 1);
38875       return;
38876     }
38877     break;
38878   }
38879   case X86ISD::AND: {
38880     if (Op.getResNo() == 0) {
38881       KnownBits Known2;
38882       Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38883       Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38884       Known &= Known2;
38885     }
38886     break;
38887   }
38888   case X86ISD::ANDNP: {
38889     KnownBits Known2;
38890     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38891     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38892
38893     // ANDNP = (~X & Y);
38894     Known.One &= Known2.Zero;
38895     Known.Zero |= Known2.One;
38896     break;
38897   }
38898   case X86ISD::FOR: {
38899     KnownBits Known2;
38900     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38901     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38902
38903     Known |= Known2;
38904     break;
38905   }
38906   case X86ISD::PSADBW: {
38907     assert(VT.getScalarType() == MVT::i64 &&
38908            Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
38909            "Unexpected PSADBW types");
38910
38911     // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
38912     Known.Zero.setBitsFrom(16);
38913     break;
38914   }
38915   case X86ISD::PCMPGT:
38916   case X86ISD::PCMPEQ: {
38917     KnownBits KnownLhs =
38918         DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38919     KnownBits KnownRhs =
38920         DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38921     std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38922                                   ? KnownBits::eq(KnownLhs, KnownRhs)
38923                                   : KnownBits::sgt(KnownLhs, KnownRhs);
38924     if (Res) {
38925       if (*Res)
38926         Known.setAllOnes();
38927       else
38928         Known.setAllZero();
38929     }
38930     break;
38931   }
38932   case X86ISD::PMULUDQ: {
38933     KnownBits Known2;
38934     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38935     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38936
38937     Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38938     Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38939     Known = KnownBits::mul(Known, Known2);
38940     break;
38941   }
38942   case X86ISD::CMOV: {
38943     Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38944     // If we don't know any bits, early out.
38945     if (Known.isUnknown())
38946       break;
38947     KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38948
38949     // Only known if known in both the LHS and RHS.
38950     Known = Known.intersectWith(Known2);
38951     break;
38952   }
38953   case X86ISD::BEXTR:
38954   case X86ISD::BEXTRI: {
38955     SDValue Op0 = Op.getOperand(0);
38956     SDValue Op1 = Op.getOperand(1);
38957
38958     if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38959       unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38960       unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38961
38962       // If the length is 0, the result is 0.
38963       if (Length == 0) {
38964         Known.setAllZero();
38965         break;
38966       }
38967
38968       if ((Shift + Length) <= BitWidth) {
38969         Known = DAG.computeKnownBits(Op0, Depth + 1);
38970         Known = Known.extractBits(Length, Shift);
38971         Known = Known.zextOrTrunc(BitWidth);
38972       }
38973     }
38974     break;
38975   }
38976   case X86ISD::PDEP: {
38977     KnownBits Known2;
38978     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38979     Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38980     // Zeros are retained from the mask operand. But not ones.
38981     Known.One.clearAllBits();
38982     // The result will have at least as many trailing zeros as the non-mask
38983     // operand since bits can only map to the same or higher bit position.
38984     Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38985     break;
38986   }
38987   case X86ISD::PEXT: {
38988     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38989     // The result has as many leading zeros as the number of zeroes in the mask.
38990     unsigned Count = Known.Zero.popcount();
38991     Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38992     Known.One.clearAllBits();
38993     break;
38994   }
38995   case X86ISD::VTRUNC:
38996   case X86ISD::VTRUNCS:
38997   case X86ISD::VTRUNCUS:
38998   case X86ISD::CVTSI2P:
38999   case X86ISD::CVTUI2P:
39000   case X86ISD::CVTP2SI:
39001   case X86ISD::CVTP2UI:
39002   case X86ISD::MCVTP2SI:
39003   case X86ISD::MCVTP2UI:
39004   case X86ISD::CVTTP2SI:
39005   case X86ISD::CVTTP2UI:
39006   case X86ISD::MCVTTP2SI:
39007   case X86ISD::MCVTTP2UI:
39008   case X86ISD::MCVTSI2P:
39009   case X86ISD::MCVTUI2P:
39010   case X86ISD::VFPROUND:
39011   case X86ISD::VMFPROUND:
39012   case X86ISD::CVTPS2PH:
39013   case X86ISD::MCVTPS2PH: {
39014     // Truncations/Conversions - upper elements are known zero.
39015     EVT SrcVT = Op.getOperand(0).getValueType();
39016     if (SrcVT.isVector()) {
39017       unsigned NumSrcElts = SrcVT.getVectorNumElements();
39018       if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
39019         Known.setAllZero();
39020     }
39021     break;
39022   }
39023   case X86ISD::STRICT_CVTTP2SI:
39024   case X86ISD::STRICT_CVTTP2UI:
39025   case X86ISD::STRICT_CVTSI2P:
39026   case X86ISD::STRICT_CVTUI2P:
39027   case X86ISD::STRICT_VFPROUND:
39028   case X86ISD::STRICT_CVTPS2PH: {
39029     // Strict Conversions - upper elements are known zero.
39030     EVT SrcVT = Op.getOperand(1).getValueType();
39031     if (SrcVT.isVector()) {
39032       unsigned NumSrcElts = SrcVT.getVectorNumElements();
39033       if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
39034         Known.setAllZero();
39035     }
39036     break;
39037   }
39038   case X86ISD::MOVQ2DQ: {
39039     // Move from MMX to XMM. Upper half of XMM should be 0.
39040     if (DemandedElts.countr_zero() >= (NumElts / 2))
39041       Known.setAllZero();
39042     break;
39043   }
39044   case X86ISD::VBROADCAST_LOAD: {
39045     APInt UndefElts;
39046     SmallVector<APInt, 16> EltBits;
39047     if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
39048                                       /*AllowWholeUndefs*/ false,
39049                                       /*AllowPartialUndefs*/ false)) {
39050       Known.Zero.setAllBits();
39051       Known.One.setAllBits();
39052       for (unsigned I = 0; I != NumElts; ++I) {
39053         if (!DemandedElts[I])
39054           continue;
39055         if (UndefElts[I]) {
39056           Known.resetAll();
39057           break;
39058         }
39059         KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
39060         Known = Known.intersectWith(Known2);
39061       }
39062       return;
39063     }
39064     break;
39065   }
39066   }
39067
39068   // Handle target shuffles.
39069   // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39070   if (isTargetShuffle(Opc)) {
39071     SmallVector<int, 64> Mask;
39072     SmallVector<SDValue, 2> Ops;
39073     if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
39074       unsigned NumOps = Ops.size();
39075       unsigned NumElts = VT.getVectorNumElements();
39076       if (Mask.size() == NumElts) {
39077         SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39078         Known.Zero.setAllBits(); Known.One.setAllBits();
39079         for (unsigned i = 0; i != NumElts; ++i) {
39080           if (!DemandedElts[i])
39081             continue;
39082           int M = Mask[i];
39083           if (M == SM_SentinelUndef) {
39084             // For UNDEF elements, we don't know anything about the common state
39085             // of the shuffle result.
39086             Known.resetAll();
39087             break;
39088           }
39089           if (M == SM_SentinelZero) {
39090             Known.One.clearAllBits();
39091             continue;
39092           }
39093           assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39094                  "Shuffle index out of range");
39095
39096           unsigned OpIdx = (unsigned)M / NumElts;
39097           unsigned EltIdx = (unsigned)M % NumElts;
39098           if (Ops[OpIdx].getValueType() != VT) {
39099             // TODO - handle target shuffle ops with different value types.
39100             Known.resetAll();
39101             break;
39102           }
39103           DemandedOps[OpIdx].setBit(EltIdx);
39104         }
39105         // Known bits are the values that are shared by every demanded element.
39106         for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
39107           if (!DemandedOps[i])
39108             continue;
39109           KnownBits Known2 =
39110               DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
39111           Known = Known.intersectWith(Known2);
39112         }
39113       }
39114     }
39115   }
39116 }
39117
39118 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
39119     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
39120     unsigned Depth) const {
39121   EVT VT = Op.getValueType();
39122   unsigned VTBits = VT.getScalarSizeInBits();
39123   unsigned Opcode = Op.getOpcode();
39124   switch (Opcode) {
39125   case X86ISD::SETCC_CARRY:
39126     // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
39127     return VTBits;
39128
39129   case X86ISD::VTRUNC: {
39130     SDValue Src = Op.getOperand(0);
39131     MVT SrcVT = Src.getSimpleValueType();
39132     unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
39133     assert(VTBits < NumSrcBits && "Illegal truncation input type");
39134     APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39135     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
39136     if (Tmp > (NumSrcBits - VTBits))
39137       return Tmp - (NumSrcBits - VTBits);
39138     return 1;
39139   }
39140
39141   case X86ISD::PACKSS: {
39142     // PACKSS is just a truncation if the sign bits extend to the packed size.
39143     APInt DemandedLHS, DemandedRHS;
39144     getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39145                         DemandedRHS);
39146
39147     // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
39148     // patterns often used to compact vXi64 allsignbit patterns.
39149     auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
39150       SDValue BC = peekThroughBitcasts(V);
39151       if (BC.getOpcode() == X86ISD::PACKSS &&
39152           BC.getScalarValueSizeInBits() == 16 &&
39153           V.getScalarValueSizeInBits() == 32) {
39154         SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
39155         SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
39156         if (BC0.getScalarValueSizeInBits() == 64 &&
39157             BC1.getScalarValueSizeInBits() == 64 &&
39158             DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
39159             DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
39160           return 32;
39161       }
39162       return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
39163     };
39164
39165     unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39166     unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39167     if (!!DemandedLHS)
39168       Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
39169     if (!!DemandedRHS)
39170       Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
39171     unsigned Tmp = std::min(Tmp0, Tmp1);
39172     if (Tmp > (SrcBits - VTBits))
39173       return Tmp - (SrcBits - VTBits);
39174     return 1;
39175   }
39176
39177   case X86ISD::VBROADCAST: {
39178     SDValue Src = Op.getOperand(0);
39179     if (!Src.getSimpleValueType().isVector())
39180       return DAG.ComputeNumSignBits(Src, Depth + 1);
39181     break;
39182   }
39183
39184   case X86ISD::VSHLI: {
39185     SDValue Src = Op.getOperand(0);
39186     const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39187     if (ShiftVal.uge(VTBits))
39188       return VTBits; // Shifted all bits out --> zero.
39189     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39190     if (ShiftVal.uge(Tmp))
39191       return 1; // Shifted all sign bits out --> unknown.
39192     return Tmp - ShiftVal.getZExtValue();
39193   }
39194
39195   case X86ISD::VSRAI: {
39196     SDValue Src = Op.getOperand(0);
39197     APInt ShiftVal = Op.getConstantOperandAPInt(1);
39198     if (ShiftVal.uge(VTBits - 1))
39199       return VTBits; // Sign splat.
39200     unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39201     ShiftVal += Tmp;
39202     return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39203   }
39204
39205   case X86ISD::FSETCC:
39206     // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39207     if (VT == MVT::f32 || VT == MVT::f64 ||
39208         ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39209       return VTBits;
39210     break;
39211
39212   case X86ISD::PCMPGT:
39213   case X86ISD::PCMPEQ:
39214   case X86ISD::CMPP:
39215   case X86ISD::VPCOM:
39216   case X86ISD::VPCOMU:
39217     // Vector compares return zero/all-bits result values.
39218     return VTBits;
39219
39220   case X86ISD::ANDNP: {
39221     unsigned Tmp0 =
39222         DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39223     if (Tmp0 == 1) return 1; // Early out.
39224     unsigned Tmp1 =
39225         DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39226     return std::min(Tmp0, Tmp1);
39227   }
39228
39229   case X86ISD::CMOV: {
39230     unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39231     if (Tmp0 == 1) return 1;  // Early out.
39232     unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39233     return std::min(Tmp0, Tmp1);
39234   }
39235   }
39236
39237   // Handle target shuffles.
39238   // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39239   if (isTargetShuffle(Opcode)) {
39240     SmallVector<int, 64> Mask;
39241     SmallVector<SDValue, 2> Ops;
39242     if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
39243       unsigned NumOps = Ops.size();
39244       unsigned NumElts = VT.getVectorNumElements();
39245       if (Mask.size() == NumElts) {
39246         SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39247         for (unsigned i = 0; i != NumElts; ++i) {
39248           if (!DemandedElts[i])
39249             continue;
39250           int M = Mask[i];
39251           if (M == SM_SentinelUndef) {
39252             // For UNDEF elements, we don't know anything about the common state
39253             // of the shuffle result.
39254             return 1;
39255           } else if (M == SM_SentinelZero) {
39256             // Zero = all sign bits.
39257             continue;
39258           }
39259           assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39260                  "Shuffle index out of range");
39261
39262           unsigned OpIdx = (unsigned)M / NumElts;
39263           unsigned EltIdx = (unsigned)M % NumElts;
39264           if (Ops[OpIdx].getValueType() != VT) {
39265             // TODO - handle target shuffle ops with different value types.
39266             return 1;
39267           }
39268           DemandedOps[OpIdx].setBit(EltIdx);
39269         }
39270         unsigned Tmp0 = VTBits;
39271         for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39272           if (!DemandedOps[i])
39273             continue;
39274           unsigned Tmp1 =
39275               DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39276           Tmp0 = std::min(Tmp0, Tmp1);
39277         }
39278         return Tmp0;
39279       }
39280     }
39281   }
39282
39283   // Fallback case.
39284   return 1;
39285 }
39286
39287 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
39288   if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39289     return N->getOperand(0);
39290   return N;
39291 }
39292
39293 // Helper to look for a normal load that can be narrowed into a vzload with the
39294 // specified VT and memory VT. Returns SDValue() on failure.
39295 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39296                                   SelectionDAG &DAG) {
39297   // Can't if the load is volatile or atomic.
39298   if (!LN->isSimple())
39299     return SDValue();
39300
39301   SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39302   SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39303   return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39304                                  LN->getPointerInfo(), LN->getOriginalAlign(),
39305                                  LN->getMemOperand()->getFlags());
39306 }
39307
39308 // Attempt to match a combined shuffle mask against supported unary shuffle
39309 // instructions.
39310 // TODO: Investigate sharing more of this with shuffle lowering.
39311 static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39312                               bool AllowFloatDomain, bool AllowIntDomain,
39313                               SDValue V1, const SelectionDAG &DAG,
39314                               const X86Subtarget &Subtarget, unsigned &Shuffle,
39315                               MVT &SrcVT, MVT &DstVT) {
39316   unsigned NumMaskElts = Mask.size();
39317   unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39318
39319   // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
39320   if (Mask[0] == 0 &&
39321       (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39322     if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39323         (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39324          isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39325       Shuffle = X86ISD::VZEXT_MOVL;
39326       if (MaskEltSize == 16)
39327         SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39328       else
39329         SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39330       return true;
39331     }
39332   }
39333
39334   // Match against a ANY/ZERO_EXTEND_VECTOR_INREG instruction.
39335   // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
39336   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39337                          (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
39338     unsigned MaxScale = 64 / MaskEltSize;
39339     for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39340       bool MatchAny = true;
39341       bool MatchZero = true;
39342       unsigned NumDstElts = NumMaskElts / Scale;
39343       for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
39344         if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39345           MatchAny = MatchZero = false;
39346           break;
39347         }
39348         MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
39349         MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
39350       }
39351       if (MatchAny || MatchZero) {
39352         assert(MatchZero && "Failed to match zext but matched aext?");
39353         unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39354         MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
39355                                             MVT::getIntegerVT(MaskEltSize);
39356         SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39357
39358         Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
39359         if (SrcVT.getVectorNumElements() != NumDstElts)
39360           Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39361
39362         DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39363         DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39364         return true;
39365       }
39366     }
39367   }
39368
39369   // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
39370   if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39371        (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39372       isUndefOrEqual(Mask[0], 0) &&
39373       isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39374     Shuffle = X86ISD::VZEXT_MOVL;
39375     if (MaskEltSize == 16)
39376       SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39377     else
39378       SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39379     return true;
39380   }
39381
39382   // Check if we have SSE3 which will let us use MOVDDUP etc. The
39383   // instructions are no slower than UNPCKLPD but has the option to
39384   // fold the input operand into even an unaligned memory load.
39385   if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39386     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39387       Shuffle = X86ISD::MOVDDUP;
39388       SrcVT = DstVT = MVT::v2f64;
39389       return true;
39390     }
39391     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39392       Shuffle = X86ISD::MOVSLDUP;
39393       SrcVT = DstVT = MVT::v4f32;
39394       return true;
39395     }
39396     if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39397       Shuffle = X86ISD::MOVSHDUP;
39398       SrcVT = DstVT = MVT::v4f32;
39399       return true;
39400     }
39401   }
39402
39403   if (MaskVT.is256BitVector() && AllowFloatDomain) {
39404     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39405     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39406       Shuffle = X86ISD::MOVDDUP;
39407       SrcVT = DstVT = MVT::v4f64;
39408       return true;
39409     }
39410     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39411                                   V1)) {
39412       Shuffle = X86ISD::MOVSLDUP;
39413       SrcVT = DstVT = MVT::v8f32;
39414       return true;
39415     }
39416     if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39417                                   V1)) {
39418       Shuffle = X86ISD::MOVSHDUP;
39419       SrcVT = DstVT = MVT::v8f32;
39420       return true;
39421     }
39422   }
39423
39424   if (MaskVT.is512BitVector() && AllowFloatDomain) {
39425     assert(Subtarget.hasAVX512() &&
39426            "AVX512 required for 512-bit vector shuffles");
39427     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39428                                   V1)) {
39429       Shuffle = X86ISD::MOVDDUP;
39430       SrcVT = DstVT = MVT::v8f64;
39431       return true;
39432     }
39433     if (isTargetShuffleEquivalent(
39434             MaskVT, Mask,
39435             {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39436       Shuffle = X86ISD::MOVSLDUP;
39437       SrcVT = DstVT = MVT::v16f32;
39438       return true;
39439     }
39440     if (isTargetShuffleEquivalent(
39441             MaskVT, Mask,
39442             {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39443       Shuffle = X86ISD::MOVSHDUP;
39444       SrcVT = DstVT = MVT::v16f32;
39445       return true;
39446     }
39447   }
39448
39449   return false;
39450 }
39451
39452 // Attempt to match a combined shuffle mask against supported unary immediate
39453 // permute instructions.
      //
      // Patterns are tried in a fixed priority order: 64-bit element permutes
      // (VPERMI / VPERMILPI) first, then a two-pass loop over 32/16-bit element
      // permutes (PSHUFD / VPERMILPI / PSHUFLW / PSHUFHW), bit rotates (VROTLI)
      // and byte/bit shifts. Reordering the checks changes which instruction wins.
      //
      // MaskVT supplies the total vector width; the per-element width is derived
      // from that width divided by Mask.size(). Zeroable is only forwarded to
      // matchShuffleAsShift, and DAG is not referenced in this body.
      //
      // Returns true on a match, with the opcode in Shuffle, the vector type to
      // perform it in in ShuffleVT and the immediate operand in PermuteImm.
      // Returns false if no pattern matched.
      // NOTE(review): Shuffle/ShuffleVT are handed to helper matchers that appear
      // to take them by reference, so they may have been modified even when false
      // is returned - confirm before reading them on failure.
39454 // TODO: Investigate sharing more of this with shuffle lowering.
39455 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39456                                      const APInt &Zeroable,
39457                                      bool AllowFloatDomain, bool AllowIntDomain,
39458                                      const SelectionDAG &DAG,
39459                                      const X86Subtarget &Subtarget,
39460                                      unsigned &Shuffle, MVT &ShuffleVT,
39461                                      unsigned &PermuteImm) {
39462   unsigned NumMaskElts = Mask.size();
39463   unsigned InputSizeInBits = MaskVT.getSizeInBits();
        // Width of one mask element; it need not equal MaskVT's own scalar size.
39464   unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39465   MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
        // Every pattern below except the shift matching is guarded by
        // !ContainsZeros.
39466   bool ContainsZeros = isAnyZero(Mask);
39467
39468   // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39469   if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39470     // Check for lane crossing permutes.
39471     if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39472       // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39473       if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39474         Shuffle = X86ISD::VPERMI;
39475         ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39476         PermuteImm = getV4X86ShuffleImm(Mask);
39477         return true;
39478       }
            // 512-bit case: the immediate is built from the 4-element RepeatedMask,
            // so this only matches when is256BitLaneRepeatedShuffleMask succeeds.
39479       if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39480         SmallVector<int, 4> RepeatedMask;
39481         if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39482           Shuffle = X86ISD::VPERMI;
39483           ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39484           PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39485           return true;
39486         }
39487       }
            // A lane-crossing mask that matched neither case falls through to the
            // loop below.
39488     } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39489       // VPERMILPD can permute with a non-repeating shuffle.
            // One immediate bit per element: bit i is the low bit of Mask[i], i.e.
            // which element of its own 128-bit pair is read (the assert checks the
            // element stays within that pair). Undef elements leave their bit as 0.
39490       Shuffle = X86ISD::VPERMILPI;
39491       ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39492       PermuteImm = 0;
39493       for (int i = 0, e = Mask.size(); i != e; ++i) {
39494         int M = Mask[i];
39495         if (M == SM_SentinelUndef)
39496           continue;
39497         assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39498         PermuteImm |= (M & 1) << i;
39499       }
39500       return true;
39501     }
39502   }
39503
39504   // We are checking for shuffle match or shift match. Loop twice so we can
39505   // order which we try and match first depending on target preference.
        // Per pass: either the permute patterns or the bit-rotate pattern is tried
        // (permutes on pass 0 unless the target prefers shifts, in which case they
        // move to pass 1), followed by the shift matching on both passes.
39506   for (unsigned Order = 0; Order < 2; ++Order) {
39507     if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39508       // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39509       // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39510       // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39511       if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39512           !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39513         SmallVector<int, 4> RepeatedMask;
39514         if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39515           // Narrow the repeated mask to create 32-bit element permutes.
39516           SmallVector<int, 4> WordMask = RepeatedMask;
39517           if (MaskScalarSizeInBits == 64)
39518             narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39519
                // The integer form is preferred whenever the integer domain is
                // allowed; the result type always has 32-bit elements.
39520           Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39521           ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39522           ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39523           PermuteImm = getV4X86ShuffleImm(WordMask);
39524           return true;
39525         }
39526       }
39527
39528       // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39529       if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39530           ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39531            (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39532            (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39533         SmallVector<int, 4> RepeatedMask;
39534         if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
                // Split the repeated lane mask into its first and second group of
                // four elements.
39535           ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39536           ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39537
39538           // PSHUFLW: permute lower 4 elements only.
39539           if (isUndefOrInRange(LoMask, 0, 4) &&
39540               isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39541             Shuffle = X86ISD::PSHUFLW;
39542             ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39543             PermuteImm = getV4X86ShuffleImm(LoMask);
39544             return true;
39545           }
39546
39547           // PSHUFHW: permute upper 4 elements only.
39548           if (isUndefOrInRange(HiMask, 4, 8) &&
39549               isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39550             // Offset the HiMask so that we can create the shuffle immediate.
                  // Negative (sentinel) entries are kept as they are.
39551             int OffsetHiMask[4];
39552             for (int i = 0; i != 4; ++i)
39553               OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39554
39555             Shuffle = X86ISD::PSHUFHW;
39556             ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39557             PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39558             return true;
39559           }
39560         }
39561       }
39562     } else {
39563       // Attempt to match against bit rotates.
            // ShuffleVT is passed to the helper before being assigned in this
            // function, so it is presumably filled in by matchShuffleAsBitRotate
            // - confirm against its definition.
39564       if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39565           ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39566            Subtarget.hasAVX512())) {
39567         int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39568                                                 Subtarget, Mask);
39569         if (0 < RotateAmt) {
39570           Shuffle = X86ISD::VROTLI;
39571           PermuteImm = (unsigned)RotateAmt;
39572           return true;
39573         }
39574       }
39575     }
39576     // Attempt to match against byte/bit shifts.
          // This runs on both passes and is the only pattern that is attempted
          // when the mask contains zero elements. Shuffle and ShuffleVT are
          // presumably written by matchShuffleAsShift (Shuffle is compared
          // against the byte-shift opcodes right after the call).
39577     if (AllowIntDomain &&
39578         ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39579          (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39580          (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39581       int ShiftAmt =
39582           matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39583                               Zeroable, Subtarget);
            // A 512-bit shift type is only accepted with BWI or when its elements
            // are at least 32 bits wide.
39584       if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39585                            32 <= ShuffleVT.getScalarSizeInBits())) {
39586         // Byte shifts can be slower so only match them on second attempt.
39587         if (Order == 0 &&
39588             (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39589           continue;
39590
39591         PermuteImm = (unsigned)ShiftAmt;
39592         return true;
39593       }
39594
39595     }
39596   }
39597
39598   return false;
39599 }
39600
39601 // Attempt to match a combined unary shuffle mask against supported binary
39602 // shuffle instructions.
      //
      // V1 and V2 are in/out: a successful match may replace them with the operands
      // the chosen node should use (e.g. V2 = V1, or the two inputs swapped). On
      // success the opcode is returned in Shuffle and the operand/result types in
      // SrcVT/DstVT. IsUnary is only forwarded to matchShuffleWithUNPCK.
      //
      // The checks run in a fixed priority order: 128-bit special cases
      // (UNPCKL/MOVLHPS, UNPCKH/MOVHLPS, MOVSD, MOVSS, MOVSH), PACKSS/PACKUS,
      // UNPCKL/UNPCKH, and finally a blend whose unused elements are known zero
      // (or whose used elements are known all-ones), which is emitted as ISD::OR.
      // Returns false if none of them matched.
39603 // TODO: Investigate sharing more of this with shuffle lowering.
39604 static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39605                                bool AllowFloatDomain, bool AllowIntDomain,
39606                                SDValue &V1, SDValue &V2, const SDLoc &DL,
39607                                SelectionDAG &DAG, const X86Subtarget &Subtarget,
39608                                unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39609                                bool IsUnary) {
39610   unsigned NumMaskElts = Mask.size();
39611   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39612   unsigned SizeInBits = MaskVT.getSizeInBits();
39613
39614   if (MaskVT.is128BitVector()) {
          // {0, 0}: both result elements read element 0 of V1, so V1 is used for
          // the second operand too. If Mask[0] is undef the first operand can be
          // undef. Without SSE2 the v4f32 MOVLHPS form is used instead.
39615     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39616         AllowFloatDomain) {
39617       V2 = V1;
39618       V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39619       Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39620       SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39621       return true;
39622     }
          // {1, 1}: same idea for element 1 (UNPCKH, or MOVHLPS without SSE2).
39623     if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39624         AllowFloatDomain) {
39625       V2 = V1;
39626       Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39627       SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39628       return true;
39629     }
          // {0, 3}: element 0 from V1 and element 1 from V2. The inputs are swapped
          // before emitting MOVSD.
          // NOTE(review): presumably because X86ISD::MOVSD takes its low element
          // from the second operand - confirm against the node definition.
39630     if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39631         Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39632       std::swap(V1, V2);
39633       Shuffle = X86ISD::MOVSD;
39634       SrcVT = DstVT = MVT::v2f64;
39635       return true;
39636     }
          // {4, 1, 2, 3}: element 0 from V2, the remaining elements from V1.
39637     if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39638         (AllowFloatDomain || !Subtarget.hasSSE41())) {
39639       Shuffle = X86ISD::MOVSS;
39640       SrcVT = DstVT = MVT::v4f32;
39641       return true;
39642     }
          // {8, 1, ..., 7}: the 8 x 16-bit equivalent of the case above; needs FP16.
39643     if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39644                                   DAG) &&
39645         Subtarget.hasFP16()) {
39646       Shuffle = X86ISD::MOVSH;
39647       SrcVT = DstVT = MVT::v8f16;
39648       return true;
39649     }
39650   }
39651
39652   // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
        // SrcVT is passed to the helper and not assigned here, so it is presumably
        // filled in by matchShuffleWithPACK; only DstVT is set in this function.
39653   if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39654       ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39655       ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39656     if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39657                              Subtarget)) {
39658       DstVT = MaskVT;
39659       return true;
39660     }
39661   }
39662
39663   // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39664   if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39665       (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39666       (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39667       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39668       (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
39669     if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39670                               Subtarget)) {
39671       SrcVT = DstVT = MaskVT;
            // 256-bit vectors without AVX2 are performed in a floating point type.
39672       if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39673         SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39674       return true;
39675     }
39676   }
39677
39678   // Attempt to match against an OR if we're performing a blend shuffle and the
39679   // non-blended source element is zero in each case.
39680   // TODO: Handle cases where V1/V2 sizes don't match SizeInBits.
39681   if (SizeInBits == V1.getValueSizeInBits() &&
39682       SizeInBits == V2.getValueSizeInBits() &&
39683       (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39684       (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39685     bool IsBlend = true;
39686     unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39687     unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
          // Number of V1/V2 elements covered by one mask element; V1 and V2 may
          // have narrower elements than the mask (see the guard above).
39688     unsigned Scale1 = NumV1Elts / NumMaskElts;
39689     unsigned Scale2 = NumV2Elts / NumMaskElts;
          // Elements of each input that have to be zero for OR to give the blend.
39690     APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39691     APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39692     for (unsigned i = 0; i != NumMaskElts; ++i) {
39693       int M = Mask[i];
39694       if (M == SM_SentinelUndef)
39695         continue;
            // A zero result element needs both inputs to be zero in that position.
39696       if (M == SM_SentinelZero) {
39697         DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39698         DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39699         continue;
39700       }
            // Element i taken from V1: V2 must be zero in that position.
39701       if (M == (int)i) {
39702         DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39703         continue;
39704       }
            // Element i taken from V2: V1 must be zero in that position.
39705       if (M == (int)(i + NumMaskElts)) {
39706         DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39707         continue;
39708       }
            // Any other index moves an element to a different position, so this is
            // not a blend.
39709       IsBlend = false;
39710       break;
39711     }
39712     if (IsBlend) {
39713       if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39714           DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39715         Shuffle = ISD::OR;
39716         SrcVT = DstVT = MaskVT.changeTypeToInteger();
39717         return true;
39718       }
            // Second chance, only when V1, V2 and the mask all have the same element
            // count: OR is also correct for a position where the selected element
            // is known to be all-ones, whatever the other input holds.
39719       if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39720         // FIXME: handle mismatched sizes?
39721         // TODO: investigate if `ISD::OR` handling in
39722         // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
              // Builds a KnownBits with one bit per vector element rather than per
              // value bit: Zero[i] / One[i] is set when computeKnownBits reports
              // element i as zero / all-ones.
39723         auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39724           unsigned NumElts = V.getValueType().getVectorNumElements();
39725           KnownBits Known(NumElts);
39726           for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39727             APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39728             KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39729             if (PeepholeKnown.isZero())
39730               Known.Zero.setBit(EltIdx);
39731             if (PeepholeKnown.isAllOnes())
39732               Known.One.setBit(EltIdx);
39733           }
39734           return Known;
39735         };
39736
39737         KnownBits V1Known = computeKnownBitsElementWise(V1);
39738         KnownBits V2Known = computeKnownBitsElementWise(V2);
39739
              // IsBlend is true on entry; the loop stops at the first position for
              // which OR would not reproduce the blend.
39740         for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39741           int M = Mask[i];
39742           if (M == SM_SentinelUndef)
39743             continue;
39744           if (M == SM_SentinelZero) {
39745             IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39746             continue;
39747           }
39748           if (M == (int)i) {
39749             IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39750             continue;
39751           }
39752           if (M == (int)(i + NumMaskElts)) {
39753             IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39754             continue;
39755           }
                // The first loop already rejected every other mask value.
39756           llvm_unreachable("will not get here.");
39757         }
39758         if (IsBlend) {
39759           Shuffle = ISD::OR;
39760           SrcVT = DstVT = MaskVT.changeTypeToInteger();
39761           return true;
39762         }
39763       }
39764     }
39765   }
39766
39767   return false;
39768 }
39769
      // Attempt to match a combined shuffle mask against two-input shuffle
      // instructions that take an immediate operand.
      //
      // Tried in this order: VALIGN, PALIGNR, BLENDI, INSERTPS (only when the mask
      // contains zero elements), SHUFP on f64 elements, SHUFP on f32 elements, and
      // finally INSERTPS again without the zero-element requirement.
      //
      // V1 and V2 are in/out: on a match they may be replaced by the operands the
      // node should use (e.g. a zero vector, or the per-half sources for SHUFPS).
      // On success the opcode is returned in Shuffle, the type to perform it in in
      // ShuffleVT and the immediate in PermuteImm. Returns false if nothing matched.
39770 static bool matchBinaryPermuteShuffle(
39771     MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39772     bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39773     const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39774     unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39775   unsigned NumMaskElts = Mask.size();
39776   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39777
39778   // Attempt to match against VALIGND/VALIGNQ rotate.
        // Integer domain only, 32/64-bit elements, and not for masks with zero
        // elements. 128/256-bit vectors need VLX, 512-bit vectors need AVX512.
39779   if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39780       ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39781        (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39782        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39783     if (!isAnyZero(Mask)) {
39784       int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39785       if (0 < Rotation) {
39786         Shuffle = X86ISD::VALIGN;
39787         if (EltSizeInBits == 64)
39788           ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
39789         else
39790           ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
39791         PermuteImm = Rotation;
39792         return true;
39793       }
39794     }
39795   }
39796
39797   // Attempt to match against PALIGNR byte rotate.
        // The node is always built on a byte vector of the same total width.
39798   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39799                          (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39800                          (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39801     int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39802     if (0 < ByteRotation) {
39803       Shuffle = X86ISD::PALIGNR;
39804       ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39805       PermuteImm = ByteRotation;
39806       return true;
39807     }
39808   }
39809
39810   // Attempt to combine to X86ISD::BLENDI.
        // Allowed for masks of up to 8 elements (128-bit with SSE4.1, 256-bit with
        // AVX), plus v16i16 with AVX2.
39811   if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39812                             (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39813       (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39814     uint64_t BlendMask = 0;
39815     bool ForceV1Zero = false, ForceV2Zero = false;
          // A mutable copy of Mask is handed to the matcher.
          // NOTE(review): presumably matchShuffleAsBlend may rewrite it - confirm.
39816     SmallVector<int, 8> TargetMask(Mask);
39817     if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39818                             ForceV2Zero, BlendMask)) {
39819       if (MaskVT == MVT::v16i16) {
39820         // We can only use v16i16 PBLENDW if the lanes are repeated.
              // The immediate then has one bit per element of the repeated 8-element
              // lane mask, set when that element's index is 8 or more. If the lanes
              // do not repeat, control falls through to the later patterns.
39821         SmallVector<int, 8> RepeatedMask;
39822         if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39823                                         RepeatedMask)) {
39824           assert(RepeatedMask.size() == 8 &&
39825                  "Repeated mask size doesn't match!");
39826           PermuteImm = 0;
39827           for (int i = 0; i < 8; ++i)
39828             if (RepeatedMask[i] >= 8)
39829               PermuteImm |= 1 << i;
39830           V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39831           V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39832           Shuffle = X86ISD::BLENDI;
39833           ShuffleVT = MaskVT;
39834           return true;
39835         }
39836       } else {
              // All other types use the blend mask computed by the matcher directly.
39837         V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39838         V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39839         PermuteImm = (unsigned)BlendMask;
39840         Shuffle = X86ISD::BLENDI;
39841         ShuffleVT = MaskVT;
39842         return true;
39843       }
39844     }
39845   }
39846
39847   // Attempt to combine to INSERTPS, but only if it has elements that need to
39848   // be set to zero.
39849   if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39850       MaskVT.is128BitVector() && isAnyZero(Mask) &&
39851       matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39852     Shuffle = X86ISD::INSERTPS;
39853     ShuffleVT = MVT::v4f32;
39854     return true;
39855   }
39856
39857   // Attempt to combine to SHUFPD.
        // PermuteImm is passed to (and presumably written by) the matcher; an input
        // is replaced by a zero vector when the matcher sets the Force flag for it.
39858   if (AllowFloatDomain && EltSizeInBits == 64 &&
39859       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39860        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39861        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39862     bool ForceV1Zero = false, ForceV2Zero = false;
39863     if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39864                                PermuteImm, Mask, Zeroable)) {
39865       V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39866       V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39867       Shuffle = X86ISD::SHUFP;
39868       ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39869       return true;
39870     }
39871   }
39872
39873   // Attempt to combine to SHUFPS.
39874   if (AllowFloatDomain && EltSizeInBits == 32 &&
39875       ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39876        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39877        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39878     SmallVector<int, 4> RepeatedMask;
39879     if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39880       // Match each half of the repeated mask, to determine if its just
39881       // referencing one of the vectors, is zeroable or entirely undef.
            // Returns the value to use as the source for that half and stores the
            // two element selectors in S0/S1 (left untouched for an all-undef half).
            // An empty SDValue means the half mixes sources and cannot be matched.
39882       auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39883         int M0 = RepeatedMask[Offset];
39884         int M1 = RepeatedMask[Offset + 1];
39885
39886         if (isUndefInRange(RepeatedMask, Offset, 2)) {
39887           return DAG.getUNDEF(MaskVT);
39888         } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
                // Read from a zero vector; which of its elements is picked does not
                // matter, so elements 0 and 1 are used.
39889           S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39890           S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39891           return getZeroVector(MaskVT, Subtarget, DAG, DL);
39892         } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
                // Both elements come from V1 (indices 0..3).
39893           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39894           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39895           return V1;
39896         } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
                // Both elements come from V2 (indices 4..7); "& 3" drops the offset.
39897           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39898           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39899           return V2;
39900         }
39901
39902         return SDValue();
39903       };
39904
39905       int ShufMask[4] = {-1, -1, -1, -1};
39906       SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39907       SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39908
            // The source of the first half becomes the first operand and the source
            // of the second half becomes the second operand.
39909       if (Lo && Hi) {
39910         V1 = Lo;
39911         V2 = Hi;
39912         Shuffle = X86ISD::SHUFP;
39913         ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39914         PermuteImm = getV4X86ShuffleImm(ShufMask);
39915         return true;
39916       }
39917     }
39918   }
39919
39920   // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39921   if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39922       MaskVT.is128BitVector() &&
39923       matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39924     Shuffle = X86ISD::INSERTPS;
39925     ShuffleVT = MVT::v4f32;
39926     return true;
39927   }
39928
39929   return false;
39930 }
39931
      // Forward declaration only; the definition is not part of this section of
      // the file. The parameter list is the same as that of combineX86ShuffleChain,
      // which is defined directly below.
39932 static SDValue combineX86ShuffleChainWithExtract(
39933     ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39934     bool HasVariableMask, bool AllowVariableCrossLaneMask,
39935     bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39936     const X86Subtarget &Subtarget);
39937
39938 /// Combine an arbitrary chain of shuffles into a single instruction if
39939 /// possible.
39940 ///
39941 /// This is the leaf of the recursive combine below. When we have found some
39942 /// chain of single-use x86 shuffle instructions and accumulated the combined
39943 /// shuffle mask represented by them, this will try to pattern match that mask
39944 /// into either a single instruction if there is a special purpose instruction
39945 /// for this operation, or into a PSHUFB instruction which is a fully general
39946 /// instruction but should only be used to replace chains over a certain depth.
39947 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
39948                                       ArrayRef<int> BaseMask, int Depth,
39949                                       bool HasVariableMask,
39950                                       bool AllowVariableCrossLaneMask,
39951                                       bool AllowVariablePerLaneMask,
39952                                       SelectionDAG &DAG,
39953                                       const X86Subtarget &Subtarget) {
39954   assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39955   assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39956          "Unexpected number of shuffle inputs!");
39957
39958   SDLoc DL(Root);
39959   MVT RootVT = Root.getSimpleValueType();
39960   unsigned RootSizeInBits = RootVT.getSizeInBits();
39961   unsigned NumRootElts = RootVT.getVectorNumElements();
39962
39963   // Canonicalize shuffle input op to the requested type.
39964   auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39965     if (VT.getSizeInBits() > Op.getValueSizeInBits())
39966       Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39967     else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39968       Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39969     return DAG.getBitcast(VT, Op);
39970   };
39971
39972   // Find the inputs that enter the chain. Note that multiple uses are OK
39973   // here, we're not going to remove the operands we find.
39974   bool UnaryShuffle = (Inputs.size() == 1);
39975   SDValue V1 = peekThroughBitcasts(Inputs[0]);
39976   SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39977                              : peekThroughBitcasts(Inputs[1]));
39978
39979   MVT VT1 = V1.getSimpleValueType();
39980   MVT VT2 = V2.getSimpleValueType();
39981   assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39982          (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39983
39984   SDValue Res;
39985
39986   unsigned NumBaseMaskElts = BaseMask.size();
39987   if (NumBaseMaskElts == 1) {
39988     assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39989     return CanonicalizeShuffleInput(RootVT, V1);
39990   }
39991
39992   bool OptForSize = DAG.shouldOptForSize();
39993   unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39994   bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
39995                      (RootVT.isFloatingPoint() && Depth >= 1) ||
39996                      (RootVT.is256BitVector() && !Subtarget.hasAVX2());
39997
39998   // Don't combine if we are a AVX512/EVEX target and the mask element size
39999   // is different from the root element size - this would prevent writemasks
40000   // from being reused.
40001   bool IsMaskedShuffle = false;
40002   if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
40003     if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
40004         Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
40005       IsMaskedShuffle = true;
40006     }
40007   }
40008
40009   // If we are shuffling a splat (and not introducing zeros) then we can just
40010   // use it directly. This works for smaller elements as well as they already
40011   // repeat across each mask element.
40012   if (UnaryShuffle && !isAnyZero(BaseMask) &&
40013       V1.getValueSizeInBits() >= RootSizeInBits &&
40014       (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
40015       DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
40016     return CanonicalizeShuffleInput(RootVT, V1);
40017   }
40018
40019   SmallVector<int, 64> Mask(BaseMask);
40020
40021   // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
40022   // etc. can be simplified.
40023   if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
40024     SmallVector<int> ScaledMask, IdentityMask;
40025     unsigned NumElts = VT1.getVectorNumElements();
40026     if (Mask.size() <= NumElts &&
40027         scaleShuffleElements(Mask, NumElts, ScaledMask)) {
40028       for (unsigned i = 0; i != NumElts; ++i)
40029         IdentityMask.push_back(i);
40030       if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
40031                                     V2))
40032         return CanonicalizeShuffleInput(RootVT, V1);
40033     }
40034   }
40035
40036   // Handle 128/256-bit lane shuffles of 512-bit vectors.
40037   if (RootVT.is512BitVector() &&
40038       (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
40039     // If the upper subvectors are zeroable, then an extract+insert is more
40040     // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
40041     // to zero the upper subvectors.
40042     if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
40043       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
40044         return SDValue(); // Nothing to do!
40045       assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
40046              "Unexpected lane shuffle");
40047       Res = CanonicalizeShuffleInput(RootVT, V1);
40048       unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
40049       bool UseZero = isAnyZero(Mask);
40050       Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
40051       return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
40052     }
40053
40054     // Narrow shuffle mask to v4x128.
40055     SmallVector<int, 4> ScaledMask;
40056     assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
40057     narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
40058
40059     // Try to lower to vshuf64x2/vshuf32x4.
40060     auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
40061                             ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
40062                             SelectionDAG &DAG) {
40063       unsigned PermMask = 0;
40064       // Ensure elements came from the same Op.
40065       SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
40066       for (int i = 0; i < 4; ++i) {
40067         assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
40068         if (ScaledMask[i] < 0)
40069           continue;
40070
40071         SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
40072         unsigned OpIndex = i / 2;
40073         if (Ops[OpIndex].isUndef())
40074           Ops[OpIndex] = Op;
40075         else if (Ops[OpIndex] != Op)
40076           return SDValue();
40077
40078         // Convert the 128-bit shuffle mask selection values into 128-bit
40079         // selection bits defined by a vshuf64x2 instruction's immediate control
40080         // byte.
40081         PermMask |= (ScaledMask[i] % 4) << (i * 2);
40082       }
40083
40084       return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
40085                          CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
40086                          CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
40087                          DAG.getTargetConstant(PermMask, DL, MVT::i8));
40088     };
40089
40090     // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
40091     // doesn't work because our mask is for 128 bits and we don't have an MVT
40092     // to match that.
40093     bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
40094                        isUndefOrInRange(ScaledMask[1], 0, 2) &&
40095                        isUndefOrInRange(ScaledMask[2], 2, 4) &&
40096                        isUndefOrInRange(ScaledMask[3], 2, 4) &&
40097                        (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
40098                         ScaledMask[0] == (ScaledMask[2] % 2)) &&
40099                        (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
40100                         ScaledMask[1] == (ScaledMask[3] % 2));
40101
40102     if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
40103       if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
40104         return SDValue(); // Nothing to do!
40105       MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
40106       if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
40107         return DAG.getBitcast(RootVT, V);
40108     }
40109   }
40110
40111   // Handle 128-bit lane shuffles of 256-bit vectors.
40112   if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
40113     // If the upper half is zeroable, then an extract+insert is more optimal
40114     // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
40115     // zero the upper half.
40116     if (isUndefOrZero(Mask[1])) {
40117       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
40118         return SDValue(); // Nothing to do!
40119       assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
40120       Res = CanonicalizeShuffleInput(RootVT, V1);
40121       Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
40122       return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
40123                             256);
40124     }
40125
40126     // If we're inserting the low subvector, an insert-subvector 'concat'
40127     // pattern is quicker than VPERM2X128.
40128     // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
40129     if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
40130         !Subtarget.hasAVX2()) {
40131       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
40132         return SDValue(); // Nothing to do!
40133       SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
40134       SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
40135       Hi = extractSubVector(Hi, 0, DAG, DL, 128);
40136       return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
40137     }
40138
40139     if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
40140       return SDValue(); // Nothing to do!
40141
40142     // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
40143     // we need to use the zeroing feature.
40144     // Prefer blends for sequential shuffles unless we are optimizing for size.
40145     if (UnaryShuffle &&
40146         !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
40147         (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
40148       unsigned PermMask = 0;
40149       PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
40150       PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
40151       return DAG.getNode(
40152           X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
40153           DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
40154     }
40155
40156     if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
40157       return SDValue(); // Nothing to do!
40158
40159     // TODO - handle AVX512VL cases with X86ISD::SHUF128.
40160     if (!UnaryShuffle && !IsMaskedShuffle) {
40161       assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40162              "Unexpected shuffle sentinel value");
40163       // Prefer blends to X86ISD::VPERM2X128.
40164       if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40165         unsigned PermMask = 0;
40166         PermMask |= ((Mask[0] & 3) << 0);
40167         PermMask |= ((Mask[1] & 3) << 4);
40168         SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40169         SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40170         return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40171                           CanonicalizeShuffleInput(RootVT, LHS),
40172                           CanonicalizeShuffleInput(RootVT, RHS),
40173                           DAG.getTargetConstant(PermMask, DL, MVT::i8));
40174       }
40175     }
40176   }
40177
40178   // For masks that have been widened to 128-bit elements or more,
40179   // narrow back down to 64-bit elements.
40180   if (BaseMaskEltSizeInBits > 64) {
40181     assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40182     int MaskScale = BaseMaskEltSizeInBits / 64;
40183     SmallVector<int, 64> ScaledMask;
40184     narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40185     Mask = std::move(ScaledMask);
40186   }
40187
40188   // For masked shuffles, we're trying to match the root width for better
40189   // writemask folding, attempt to scale the mask.
40190   // TODO - variable shuffles might need this to be widened again.
40191   if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40192     assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40193     int MaskScale = NumRootElts / Mask.size();
40194     SmallVector<int, 64> ScaledMask;
40195     narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40196     Mask = std::move(ScaledMask);
40197   }
40198
40199   unsigned NumMaskElts = Mask.size();
40200   unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40201
40202   // Determine the effective mask value type.
40203   FloatDomain &= (32 <= MaskEltSizeInBits);
40204   MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40205                            : MVT::getIntegerVT(MaskEltSizeInBits);
40206   MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40207
40208   // Only allow legal mask types.
40209   if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
40210     return SDValue();
40211
40212   // Attempt to match the mask against known shuffle patterns.
40213   MVT ShuffleSrcVT, ShuffleVT;
40214   unsigned Shuffle, PermuteImm;
40215
40216   // Which shuffle domains are permitted?
40217   // Permit domain crossing at higher combine depths.
40218   // TODO: Should we indicate which domain is preferred if both are allowed?
40219   bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40220   bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40221                         (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40222
40223   // Determine zeroable mask elements.
40224   APInt KnownUndef, KnownZero;
40225   resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40226   APInt Zeroable = KnownUndef | KnownZero;
40227
40228   if (UnaryShuffle) {
40229     // Attempt to match against broadcast-from-vector.
40230     // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40231     if ((Subtarget.hasAVX2() ||
40232          (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40233         (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40234       if (isUndefOrEqual(Mask, 0)) {
40235         if (V1.getValueType() == MaskVT &&
40236             V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40237             X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40238           if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
40239             return SDValue(); // Nothing to do!
40240           Res = V1.getOperand(0);
40241           Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40242           return DAG.getBitcast(RootVT, Res);
40243         }
40244         if (Subtarget.hasAVX2()) {
40245           if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
40246             return SDValue(); // Nothing to do!
40247           Res = CanonicalizeShuffleInput(MaskVT, V1);
40248           Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40249           return DAG.getBitcast(RootVT, Res);
40250         }
40251       }
40252     }
40253
40254     if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40255                           DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40256         (!IsMaskedShuffle ||
40257          (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40258       if (Depth == 0 && Root.getOpcode() == Shuffle)
40259         return SDValue(); // Nothing to do!
40260       Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40261       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40262       return DAG.getBitcast(RootVT, Res);
40263     }
40264
40265     if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40266                                  AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40267                                  PermuteImm) &&
40268         (!IsMaskedShuffle ||
40269          (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40270       if (Depth == 0 && Root.getOpcode() == Shuffle)
40271         return SDValue(); // Nothing to do!
40272       Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40273       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40274                         DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40275       return DAG.getBitcast(RootVT, Res);
40276     }
40277   }
40278
40279   // Attempt to combine to INSERTPS, but only if the inserted element has come
40280   // from a scalar.
40281   // TODO: Handle other insertions here as well?
40282   if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40283       Subtarget.hasSSE41() &&
40284       !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40285     if (MaskEltSizeInBits == 32) {
40286       SDValue SrcV1 = V1, SrcV2 = V2;
40287       if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40288                                  DAG) &&
40289           SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40290         if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
40291           return SDValue(); // Nothing to do!
40292         Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40293                           CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40294                           CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40295                           DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40296         return DAG.getBitcast(RootVT, Res);
40297       }
40298     }
40299     if (MaskEltSizeInBits == 64 &&
40300         isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40301         V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40302         V2.getScalarValueSizeInBits() <= 32) {
40303       if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
40304         return SDValue(); // Nothing to do!
40305       PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40306       Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40307                         CanonicalizeShuffleInput(MVT::v4f32, V1),
40308                         CanonicalizeShuffleInput(MVT::v4f32, V2),
40309                         DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40310       return DAG.getBitcast(RootVT, Res);
40311     }
40312   }
40313
40314   SDValue NewV1 = V1; // Save operands in case early exit happens.
40315   SDValue NewV2 = V2;
40316   if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40317                          NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40318                          ShuffleVT, UnaryShuffle) &&
40319       (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40320     if (Depth == 0 && Root.getOpcode() == Shuffle)
40321       return SDValue(); // Nothing to do!
40322     NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40323     NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40324     Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40325     return DAG.getBitcast(RootVT, Res);
40326   }
40327
40328   NewV1 = V1; // Save operands in case early exit happens.
40329   NewV2 = V2;
40330   if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40331                                 AllowIntDomain, NewV1, NewV2, DL, DAG,
40332                                 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40333       (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40334     if (Depth == 0 && Root.getOpcode() == Shuffle)
40335       return SDValue(); // Nothing to do!
40336     NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40337     NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40338     Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40339                       DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40340     return DAG.getBitcast(RootVT, Res);
40341   }
40342
40343   // Typically from here on, we need an integer version of MaskVT.
40344   MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40345   IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40346
40347   // Annoyingly, SSE4A instructions don't map into the above match helpers.
40348   if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40349     uint64_t BitLen, BitIdx;
40350     if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40351                             Zeroable)) {
40352       if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
40353         return SDValue(); // Nothing to do!
40354       V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40355       Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40356                         DAG.getTargetConstant(BitLen, DL, MVT::i8),
40357                         DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40358       return DAG.getBitcast(RootVT, Res);
40359     }
40360
40361     if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40362       if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
40363         return SDValue(); // Nothing to do!
40364       V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40365       V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40366       Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40367                         DAG.getTargetConstant(BitLen, DL, MVT::i8),
40368                         DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40369       return DAG.getBitcast(RootVT, Res);
40370     }
40371   }
40372
40373   // Match shuffle against TRUNCATE patterns.
40374   if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40375     // Match against a VTRUNC instruction, accounting for src/dst sizes.
40376     if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40377                              Subtarget)) {
40378       bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40379                         ShuffleSrcVT.getVectorNumElements();
40380       unsigned Opc =
40381           IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40382       if (Depth == 0 && Root.getOpcode() == Opc)
40383         return SDValue(); // Nothing to do!
40384       V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40385       Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40386       if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40387         Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40388       return DAG.getBitcast(RootVT, Res);
40389     }
40390
40391     // Do we need a more general binary truncation pattern?
40392     if (RootSizeInBits < 512 &&
40393         ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40394          (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40395         (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40396         isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40397       // Bail if this was already a truncation or PACK node.
40398       // We sometimes fail to match PACK if we demand known undef elements.
40399       if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
40400                          Root.getOpcode() == X86ISD::PACKSS ||
40401                          Root.getOpcode() == X86ISD::PACKUS))
40402         return SDValue(); // Nothing to do!
40403       ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40404       ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40405       V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40406       V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40407       ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40408       ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40409       Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40410       Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40411       return DAG.getBitcast(RootVT, Res);
40412     }
40413   }
40414
40415   // Don't try to re-form single instruction chains under any circumstances now
40416   // that we've done encoding canonicalization for them.
40417   if (Depth < 1)
40418     return SDValue();
40419
40420   // Depth threshold above which we can efficiently use variable mask shuffles.
40421   int VariableCrossLaneShuffleDepth =
40422       Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40423   int VariablePerLaneShuffleDepth =
40424       Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40425   AllowVariableCrossLaneMask &=
40426       (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
40427   AllowVariablePerLaneMask &=
40428       (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
40429   // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40430   // higher depth before combining them.
40431   bool AllowBWIVPERMV3 =
40432       (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
40433
40434   bool MaskContainsZeros = isAnyZero(Mask);
40435
40436   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40437     // If we have a single input lane-crossing shuffle then lower to VPERMV.
40438     if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40439       if (Subtarget.hasAVX2() &&
40440           (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40441         SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40442         Res = CanonicalizeShuffleInput(MaskVT, V1);
40443         Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40444         return DAG.getBitcast(RootVT, Res);
40445       }
40446       // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40447       if ((Subtarget.hasAVX512() &&
40448            (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40449             MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40450           (Subtarget.hasBWI() &&
40451            (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40452           (Subtarget.hasVBMI() &&
40453            (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40454         V1 = CanonicalizeShuffleInput(MaskVT, V1);
40455         V2 = DAG.getUNDEF(MaskVT);
40456         Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40457         return DAG.getBitcast(RootVT, Res);
40458       }
40459     }
40460
40461     // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40462     // vector as the second source (non-VLX will pad to 512-bit shuffles).
40463     if (UnaryShuffle && AllowVariableCrossLaneMask &&
40464         ((Subtarget.hasAVX512() &&
40465           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40466            MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40467            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40468            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40469          (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40470           (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40471          (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40472           (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40473       // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40474       for (unsigned i = 0; i != NumMaskElts; ++i)
40475         if (Mask[i] == SM_SentinelZero)
40476           Mask[i] = NumMaskElts + i;
40477       V1 = CanonicalizeShuffleInput(MaskVT, V1);
40478       V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40479       Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40480       return DAG.getBitcast(RootVT, Res);
40481     }
40482
40483     // If that failed and either input is extracted then try to combine as a
40484     // shuffle with the larger type.
40485     if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40486             Inputs, Root, BaseMask, Depth, HasVariableMask,
40487             AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
40488             Subtarget))
40489       return WideShuffle;
40490
40491     // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40492     // (non-VLX will pad to 512-bit shuffles).
40493     if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40494         ((Subtarget.hasAVX512() &&
40495           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40496            MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40497            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40498            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40499          (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40500           (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40501          (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40502           (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40503       V1 = CanonicalizeShuffleInput(MaskVT, V1);
40504       V2 = CanonicalizeShuffleInput(MaskVT, V2);
40505       Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40506       return DAG.getBitcast(RootVT, Res);
40507     }
40508     return SDValue();
40509   }
40510
40511   // See if we can combine a single input shuffle with zeros to a bit-mask,
40512   // which is much simpler than any shuffle.
40513   if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40514       isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40515       DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
40516     APInt Zero = APInt::getZero(MaskEltSizeInBits);
40517     APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40518     APInt UndefElts(NumMaskElts, 0);
40519     SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40520     for (unsigned i = 0; i != NumMaskElts; ++i) {
40521       int M = Mask[i];
40522       if (M == SM_SentinelUndef) {
40523         UndefElts.setBit(i);
40524         continue;
40525       }
40526       if (M == SM_SentinelZero)
40527         continue;
40528       EltBits[i] = AllOnes;
40529     }
40530     SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40531     Res = CanonicalizeShuffleInput(MaskVT, V1);
40532     unsigned AndOpcode =
40533         MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40534     Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40535     return DAG.getBitcast(RootVT, Res);
40536   }
40537
40538   // If we have a single input shuffle with different shuffle patterns in the
40539   // 128-bit lanes use the variable mask to VPERMILPS.
40540   // TODO Combine other mask types at higher depths.
40541   if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40542       ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40543        (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40544     SmallVector<SDValue, 16> VPermIdx;
40545     for (int M : Mask) {
40546       SDValue Idx =
40547           M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40548       VPermIdx.push_back(Idx);
40549     }
40550     SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40551     Res = CanonicalizeShuffleInput(MaskVT, V1);
40552     Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40553     return DAG.getBitcast(RootVT, Res);
40554   }
40555
40556   // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40557   // to VPERMIL2PD/VPERMIL2PS.
40558   if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40559       (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40560        MaskVT == MVT::v8f32)) {
40561     // VPERMIL2 Operation.
40562     // Bits[3] - Match Bit.
40563     // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40564     // Bits[2:0] - (Per Lane) PS Shuffle Mask.
40565     unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40566     unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40567     SmallVector<int, 8> VPerm2Idx;
40568     unsigned M2ZImm = 0;
40569     for (int M : Mask) {
40570       if (M == SM_SentinelUndef) {
40571         VPerm2Idx.push_back(-1);
40572         continue;
40573       }
40574       if (M == SM_SentinelZero) {
40575         M2ZImm = 2;
40576         VPerm2Idx.push_back(8);
40577         continue;
40578       }
40579       int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40580       Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40581       VPerm2Idx.push_back(Index);
40582     }
40583     V1 = CanonicalizeShuffleInput(MaskVT, V1);
40584     V2 = CanonicalizeShuffleInput(MaskVT, V2);
40585     SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40586     Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40587                       DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40588     return DAG.getBitcast(RootVT, Res);
40589   }
40590
40591   // If we have 3 or more shuffle instructions or a chain involving a variable
40592   // mask, we can replace them with a single PSHUFB instruction profitably.
40593   // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40594   // instructions, but in practice PSHUFB tends to be *very* fast so we're
40595   // more aggressive.
40596   if (UnaryShuffle && AllowVariablePerLaneMask &&
40597       ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40598        (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40599        (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40600     SmallVector<SDValue, 16> PSHUFBMask;
40601     int NumBytes = RootVT.getSizeInBits() / 8;
40602     int Ratio = NumBytes / NumMaskElts;
40603     for (int i = 0; i < NumBytes; ++i) {
40604       int M = Mask[i / Ratio];
40605       if (M == SM_SentinelUndef) {
40606         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40607         continue;
40608       }
40609       if (M == SM_SentinelZero) {
40610         PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40611         continue;
40612       }
40613       M = Ratio * M + i % Ratio;
40614       assert((M / 16) == (i / 16) && "Lane crossing detected");
40615       PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40616     }
40617     MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40618     Res = CanonicalizeShuffleInput(ByteVT, V1);
40619     SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40620     Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40621     return DAG.getBitcast(RootVT, Res);
40622   }
40623
40624   // With XOP, if we have a 128-bit binary input shuffle we can always combine
40625   // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40626   // slower than PSHUFB on targets that support both.
40627   if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40628       Subtarget.hasXOP()) {
40629     // VPPERM Mask Operation
40630     // Bits[4:0] - Byte Index (0 - 31)
40631     // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
40632     SmallVector<SDValue, 16> VPPERMMask;
40633     int NumBytes = 16;
40634     int Ratio = NumBytes / NumMaskElts;
40635     for (int i = 0; i < NumBytes; ++i) {
40636       int M = Mask[i / Ratio];
40637       if (M == SM_SentinelUndef) {
40638         VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40639         continue;
40640       }
40641       if (M == SM_SentinelZero) {
40642         VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40643         continue;
40644       }
40645       M = Ratio * M + i % Ratio;
40646       VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40647     }
40648     MVT ByteVT = MVT::v16i8;
40649     V1 = CanonicalizeShuffleInput(ByteVT, V1);
40650     V2 = CanonicalizeShuffleInput(ByteVT, V2);
40651     SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40652     Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40653     return DAG.getBitcast(RootVT, Res);
40654   }
40655
40656   // If that failed and either input is extracted then try to combine as a
40657   // shuffle with the larger type.
40658   if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40659           Inputs, Root, BaseMask, Depth, HasVariableMask,
40660           AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
40661     return WideShuffle;
40662
40663   // If we have a dual input shuffle then lower to VPERMV3,
40664   // (non-VLX will pad to 512-bit shuffles)
40665   if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40666       ((Subtarget.hasAVX512() &&
40667         (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40668          MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40669          MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40670          MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40671          MaskVT == MVT::v16i32)) ||
40672        (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40673         (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40674          MaskVT == MVT::v32i16)) ||
40675        (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40676         (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40677          MaskVT == MVT::v64i8)))) {
40678     V1 = CanonicalizeShuffleInput(MaskVT, V1);
40679     V2 = CanonicalizeShuffleInput(MaskVT, V2);
40680     Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40681     return DAG.getBitcast(RootVT, Res);
40682   }
40683
40684   // Failed to find any combines.
40685   return SDValue();
40686 }
40687
40688 // Combine an arbitrary chain of shuffles + extract_subvectors into a single
40689 // instruction if possible.
40690 //
40691 // Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40692 // type size to attempt to combine:
40693 // shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40694 // -->
40695 // extract_subvector(shuffle(x,y,m2),0)
      //
      // BaseMask indexes the concatenation of Inputs, with NumMaskElts indices per
      // input. On success the low Root-sized subvector of the wide shuffle is
      // returned, bitcast to Root's type; otherwise an empty SDValue is returned.
40696 static SDValue combineX86ShuffleChainWithExtract(
40697     ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
40698     bool HasVariableMask, bool AllowVariableCrossLaneMask,
40699     bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40700     const X86Subtarget &Subtarget) {
40701   unsigned NumMaskElts = BaseMask.size();
40702   unsigned NumInputs = Inputs.size();
40703   if (NumInputs == 0)
40704     return SDValue();
40705
40706   EVT RootVT = Root.getValueType();
40707   unsigned RootSizeInBits = RootVT.getSizeInBits();
        // Width in bits of a single BaseMask element.
40708   unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40709   assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40710
40711   // Peek through extract_subvector to find widest legal vector.
40712   // TODO: Handle ISD::TRUNCATE
40713   unsigned WideSizeInBits = RootSizeInBits;
40714   for (unsigned I = 0; I != NumInputs; ++I) {
40715     SDValue Input = peekThroughBitcasts(Inputs[I]);
40716     while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
40717       Input = peekThroughBitcasts(Input.getOperand(0));
40718     if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40719         WideSizeInBits < Input.getValueSizeInBits())
40720       WideSizeInBits = Input.getValueSizeInBits();
40721   }
40722
40723   // Bail if we fail to find a source larger than the existing root.
        // The wide size must also be an exact multiple of the root size.
40724   unsigned Scale = WideSizeInBits / RootSizeInBits;
40725   if (WideSizeInBits <= RootSizeInBits ||
40726       (WideSizeInBits % RootSizeInBits) != 0)
40727     return SDValue();
40728
40729   // Create new mask for larger type.
        // Each input now spans Scale * NumMaskElts indices instead of NumMaskElts,
        // so rebase every index onto its input's new start while keeping the
        // element offset within that input.
40730   SmallVector<int, 64> WideMask(BaseMask);
40731   for (int &M : WideMask) {
40732     if (M < 0)
40733       continue;
40734     M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
40735   }
        // Pad the mask out to Scale * NumMaskElts entries with undef.
40736   WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
40737
40738   // Attempt to peek through inputs and adjust mask when we extract from an
40739   // upper subvector.
        // AdjustedMasks counts the non-zero-index extractions that were peeked
        // through.
40740   int AdjustedMasks = 0;
40741   SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
40742   for (unsigned I = 0; I != NumInputs; ++I) {
40743     SDValue &Input = WideInputs[I];
40744     Input = peekThroughBitcasts(Input);
          // Only peek through extractions whose source is no wider than the chosen
          // wide size.
40745     while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40746            Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40747       uint64_t Idx = Input.getConstantOperandVal(1);
40748       if (Idx != 0) {
40749         ++AdjustedMasks;
              // Rescale the extraction index from units of the extracted vector's
              // elements to units of root mask elements.
40750         unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40751         Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40752
              // Offset only the mask entries that reference input I.
40753         int lo = I * WideMask.size();
40754         int hi = (I + 1) * WideMask.size();
40755         for (int &M : WideMask)
40756           if (lo <= M && M < hi)
40757             M += Idx;
40758       }
40759       Input = peekThroughBitcasts(Input.getOperand(0));
40760     }
40761   }
40762
40763   // Remove unused/repeated shuffle source ops.
40764   resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40765   assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40766
40767   // Bail if we're always extracting from the lowest subvectors,
40768   // combineX86ShuffleChain should match this for the current width, or the
40769   // shuffle still references too many inputs.
40770   if (AdjustedMasks == 0 || WideInputs.size() > 2)
40771     return SDValue();
40772
40773   // Minor canonicalization of the accumulated shuffle mask to make it easier
40774   // to match below. All this does is detect masks with sequential pairs of
40775   // elements, and shrink them to the half-width mask. It does this in a loop
40776   // so it will reduce the size of the mask to the minimal width mask which
40777   // performs an equivalent shuffle.
40778   while (WideMask.size() > 1) {
40779     SmallVector<int, 64> WidenedMask;
40780     if (!canWidenShuffleElements(WideMask, WidenedMask))
40781       break;
40782     WideMask = std::move(WidenedMask);
40783   }
40784
40785   // Canonicalization of binary shuffle masks to improve pattern matching by
40786   // commuting the inputs.
40787   if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40788     ShuffleVectorSDNode::commuteMask(WideMask);
40789     std::swap(WideInputs[0], WideInputs[1]);
40790   }
40791
40792   // Increase depth for every upper subvector we've peeked through.
40793   Depth += AdjustedMasks;
40794
40795   // Attempt to combine wider chain.
40796   // TODO: Can we use a better Root?
        // Use whichever of the first/last wide inputs is larger as the root (the
        // last one on a tie); the assert checks it has the wide size.
40797   SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40798                              WideInputs.back().getValueSizeInBits()
40799                          ? WideInputs.front()
40800                          : WideInputs.back();
40801   assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40802          "WideRootSize mismatch");
40803
40804   if (SDValue WideShuffle =
40805           combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
40806                                  HasVariableMask, AllowVariableCrossLaneMask,
40807                                  AllowVariablePerLaneMask, DAG, Subtarget)) {
          // Take the subvector at index 0 with the root's size and cast it back
          // to the root type.
40808     WideShuffle =
40809         extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
40810     return DAG.getBitcast(RootVT, WideShuffle);
40811   }
40812
40813   return SDValue();
40814 }
40815
40816 // Canonicalize the combined shuffle mask chain with horizontal ops.
40817 // NOTE: This may update the Ops and Mask.
      //
      // Only applies when every op, after peeking through bitcasts, is the same
      // FHADD/HADD/FHSUB/HSUB/PACKSS/PACKUS opcode with the same type, and that
      // type is RootSizeInBits wide. Returns the replacement value, or an empty
      // SDValue when nothing matched (Ops and Mask may still have been commuted or
      // canonicalized in place by then).
40818 static SDValue canonicalizeShuffleMaskWithHorizOp(
40819     MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40820     unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40821     const X86Subtarget &Subtarget) {
40822   if (Mask.empty() || Ops.empty())
40823     return SDValue();
40824
        // BC[i] is Ops[i] with any bitcasts peeked through.
40825   SmallVector<SDValue> BC;
40826   for (SDValue Op : Ops)
40827     BC.push_back(peekThroughBitcasts(Op));
40828
40829   // All ops must be the same horizop + type.
40830   SDValue BC0 = BC[0];
40831   EVT VT0 = BC0.getValueType();
40832   unsigned Opcode0 = BC0.getOpcode();
40833   if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40834         return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40835       }))
40836     return SDValue();
40837
40838   bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40839                   Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40840   bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40841   if (!isHoriz && !isPack)
40842     return SDValue();
40843
40844   // Do all ops have a single use?
        // Each op must also peek through to the same node whether or not the
        // bitcasts in between are required to be one-use.
40845   bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40846     return Op.hasOneUse() &&
40847            peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
40848   });
40849
40850   int NumElts = VT0.getVectorNumElements();
40851   int NumLanes = VT0.getSizeInBits() / 128;
40852   int NumEltsPerLane = NumElts / NumLanes;
40853   int NumHalfEltsPerLane = NumEltsPerLane / 2;
        // SrcVT is the operand type of the horizontal/pack op; EltSizeInBits is
        // the width of one Mask element.
40854   MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40855   unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40856
40857   if (NumEltsPerLane >= 4 &&
40858       (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
          // The folds below need a mask that repeats per 128-bit lane and can be
          // rescaled to the 4 entries that are read from ScaledMask.
40859     SmallVector<int> LaneMask, ScaledMask;
40860     if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40861         scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40862       // See if we can remove the shuffle by resorting the HOP chain so that
40863       // the HOP args are pre-shuffled.
40864       // TODO: Generalize to any sized/depth chain.
40865       // TODO: Add support for PACKSS/PACKUS.
40866       if (isHoriz) {
40867         // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
              // For a scaled mask entry M: M / 4 picks the outer op, (M % 4) >= 2
              // picks which of its two operands, and M % 2 picks the operand of
              // that inner HOP. Returns an empty SDValue unless the inner node
              // has the same opcode and passes the isOnlyUserOf check.
40868         auto GetHOpSrc = [&](int M) {
40869           if (M == SM_SentinelUndef)
40870             return DAG.getUNDEF(VT0);
40871           if (M == SM_SentinelZero)
40872             return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40873           SDValue Src0 = BC[M / 4];
40874           SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40875           if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40876             return Src1.getOperand(M % 2);
40877           return SDValue();
40878         };
40879         SDValue M0 = GetHOpSrc(ScaledMask[0]);
40880         SDValue M1 = GetHOpSrc(ScaledMask[1]);
40881         SDValue M2 = GetHOpSrc(ScaledMask[2]);
40882         SDValue M3 = GetHOpSrc(ScaledMask[3]);
40883         if (M0 && M1 && M2 && M3) {
40884           SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40885           SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40886           return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40887         }
40888       }
40889       // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40890       if (Ops.size() >= 2) {
40891         SDValue LHS, RHS;
              // Assigns at most two distinct HOP operands to LHS/RHS in the order
              // they are first seen. OutM receives the index used in the
              // post-permute mask: 0/1 for LHS, 2/3 for RHS. Fails on a zero
              // sentinel or a third distinct operand.
40892         auto GetHOpSrc = [&](int M, int &OutM) {
40893           // TODO: Support SM_SentinelZero
40894           if (M < 0)
40895             return M == SM_SentinelUndef;
40896           SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40897           if (!LHS || LHS == Src) {
40898             LHS = Src;
40899             OutM = (M % 2);
40900             return true;
40901           }
40902           if (!RHS || RHS == Src) {
40903             RHS = Src;
40904             OutM = (M % 2) + 2;
40905             return true;
40906           }
40907           return false;
40908         };
40909         int PostMask[4] = {-1, -1, -1, -1};
40910         if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40911             GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40912             GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40913             GetHOpSrc(ScaledMask[3], PostMask[3])) {
40914           LHS = DAG.getBitcast(SrcVT, LHS);
                // If only one source was found, use it for both operands.
40915           RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40916           SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40917           // Use SHUFPS for the permute so this will work on SSE3 targets,
40918           // shuffle combining and domain handling will simplify this later on.
40919           MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40920           Res = DAG.getBitcast(ShuffleVT, Res);
40921           return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40922                              getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40923         }
40924       }
40925     }
40926   }
40927
        // The remaining folds only handle unary and binary shuffles.
40928   if (2 < Ops.size())
40929     return SDValue();
40930
        // BC1 is the last op; for a unary shuffle it is the same node as BC0.
40931   SDValue BC1 = BC[BC.size() - 1];
40932   if (Mask.size() == VT0.getVectorNumElements()) {
40933     // Canonicalize binary shuffles of horizontal ops that use the
40934     // same sources to an unary shuffle.
40935     // TODO: Try to perform this fold even if the shuffle remains.
40936     if (Ops.size() == 2) {
40937       auto ContainsOps = [](SDValue HOp, SDValue Op) {
40938         return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40939       };
40940       // Commute if all BC0's ops are contained in BC1.
40941       if (ContainsOps(BC1, BC0.getOperand(0)) &&
40942           ContainsOps(BC1, BC0.getOperand(1))) {
40943         ShuffleVectorSDNode::commuteMask(Mask);
40944         std::swap(Ops[0], Ops[1]);
40945         std::swap(BC0, BC1);
40946       }
40947
40948       // If BC1 can be represented by BC0, then convert to unary shuffle.
40949       if (ContainsOps(BC0, BC1.getOperand(0)) &&
40950           ContainsOps(BC0, BC1.getOperand(1))) {
40951         for (int &M : Mask) {
40952           if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40953             continue;
                // SubLane is 1 when M falls in the upper half of its 128-bit
                // lane, i.e. it reads BC1's second operand. Rebase M onto the
                // half of BC0's lane that is produced from the same operand.
40954           int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40955           M -= NumElts + (SubLane * NumHalfEltsPerLane);
40956           if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40957             M += NumHalfEltsPerLane;
40958         }
40959       }
40960     }
40961
40962     // Canonicalize unary horizontal ops to only refer to lower halves.
          // When an op has two identical operands, upper-half lane indices are
          // moved down by NumHalfEltsPerLane.
40963     for (int i = 0; i != NumElts; ++i) {
40964       int &M = Mask[i];
40965       if (isUndefOrZero(M))
40966         continue;
40967       if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40968           (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40969         M -= NumHalfEltsPerLane;
40970       if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40971           (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40972         M -= NumHalfEltsPerLane;
40973     }
40974   }
40975
40976   // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
40977   // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
40978   // represents the LHS/RHS inputs for the lower/upper halves.
40979   SmallVector<int, 16> TargetMask128, WideMask128;
40980   if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
40981       scaleShuffleElements(TargetMask128, 2, WideMask128)) {
40982     assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
40983     bool SingleOp = (Ops.size() == 1);
40984     if (isPack || OneUseOps ||
40985         shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
            // WideMask128 values 0-1 select BC0 and any other value selects BC1;
            // the low bit then picks which operand of that op to use.
40986       SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
40987       SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
40988       Lo = Lo.getOperand(WideMask128[0] & 1);
40989       Hi = Hi.getOperand(WideMask128[1] & 1);
40990       if (SingleOp) {
              // For a unary shuffle, zero/undef sentinels replace the selected
              // operand with a zero or undef vector of the source type.
40991         SDValue Undef = DAG.getUNDEF(SrcVT);
40992         SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
40993         Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
40994         Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
40995         Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
40996         Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
40997       }
40998       return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
40999     }
41000   }
41001
41002   return SDValue();
41003 }
41004
41005 // Attempt to constant fold all of the constant source ops.
41006 // Returns the folded constant, bitcast to Root's type, if the entire shuffle
      // is folded to a constant; otherwise returns an empty SDValue.
41007 // TODO: Extend this to merge multiple constant Ops and update the mask.
41008 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
41009                                            ArrayRef<int> Mask, SDValue Root,
41010                                            bool HasVariableMask,
41011                                            SelectionDAG &DAG,
41012                                            const X86Subtarget &Subtarget) {
41013   MVT VT = Root.getSimpleValueType();
41014
41015   unsigned SizeInBits = VT.getSizeInBits();
41016   unsigned NumMaskElts = Mask.size();
        // Width in bits of one mask element.
41017   unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
41018   unsigned NumOps = Ops.size();
41019
41020   // Extract constant bits from each source op.
        // Bail out as soon as any op fails constant-bit extraction.
41021   SmallVector<APInt, 16> UndefEltsOps(NumOps);
41022   SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
41023   for (unsigned I = 0; I != NumOps; ++I)
41024     if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
41025                                        RawBitsOps[I]))
41026       return SDValue();
41027
41028   // If we're optimizing for size, only fold if at least one of the constants is
41029   // only used once or the combined shuffle has included a variable mask
41030   // shuffle, this is to avoid constant pool bloat.
41031   bool IsOptimizingSize = DAG.shouldOptForSize();
41032   if (IsOptimizingSize && !HasVariableMask &&
41033       llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
41034     return SDValue();
41035
41036   // Shuffle the constant bits according to the mask.
        // Every result element is classified as exactly one of undef, zero or
        // constant (checked by the assert after the loop).
41037   SDLoc DL(Root);
41038   APInt UndefElts(NumMaskElts, 0);
41039   APInt ZeroElts(NumMaskElts, 0);
41040   APInt ConstantElts(NumMaskElts, 0);
41041   SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
41042                                         APInt::getZero(MaskSizeInBits));
41043   for (unsigned i = 0; i != NumMaskElts; ++i) {
41044     int M = Mask[i];
41045     if (M == SM_SentinelUndef) {
41046       UndefElts.setBit(i);
41047       continue;
41048     } else if (M == SM_SentinelZero) {
41049       ZeroElts.setBit(i);
41050       continue;
41051     }
41052     assert(0 <= M && M < (int)(NumMaskElts * NumOps));
41053
          // Split M into the source op and the element within that op.
41054     unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
41055     unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
41056
41057     auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
41058     if (SrcUndefElts[SrcMaskIdx]) {
41059       UndefElts.setBit(i);
41060       continue;
41061     }
41062
41063     auto &SrcEltBits = RawBitsOps[SrcOpIdx];
41064     APInt &Bits = SrcEltBits[SrcMaskIdx];
          // A source element whose bits are all zero is recorded as a zero
          // element rather than as constant data.
41065     if (!Bits) {
41066       ZeroElts.setBit(i);
41067       continue;
41068     }
41069
41070     ConstantElts.setBit(i);
41071     ConstantBitData[i] = Bits;
41072   }
41073   assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
41074
41075   // Attempt to create a zero vector.
41076   if ((UndefElts | ZeroElts).isAllOnes())
41077     return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
41078
41079   // Create the constant data.
        // Use a floating-point element type only when the root type is floating
        // point and the mask element is 32 or 64 bits wide; otherwise use an
        // integer element type of the mask element width.
41080   MVT MaskSVT;
41081   if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
41082     MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
41083   else
41084     MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
41085
        // Give up if the vector type needed for the constant is not legal.
41086   MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
41087   if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
41088     return SDValue();
41089
41090   SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
41091   return DAG.getBitcast(VT, CstOp);
41092 }
41093
      // Depth limit constant for the x86 shuffle combiner.
      // NOTE(review): presumably supplied as the MaxDepth argument of
      // combineX86ShufflesRecursively - confirm against its callers.
41094 namespace llvm {
41095   namespace X86 {
41096     enum {
41097       MaxShuffleCombineDepth = 8
41098     };
41099   }
41100 } // namespace llvm
41101
41102 /// Fully generic combining of x86 shuffle instructions.
41103 ///
41104 /// This should be the last combine run over the x86 shuffle instructions. Once
41105 /// they have been fully optimized, this will recursively consider all chains
41106 /// of single-use shuffle instructions, build a generic model of the cumulative
41107 /// shuffle operation, and check for simpler instructions which implement this
41108 /// operation. We use this primarily for two purposes:
41109 ///
41110 /// 1) Collapse generic shuffles to specialized single instructions when
41111 ///    equivalent. In most cases, this is just an encoding size win, but
41112 ///    sometimes we will collapse multiple generic shuffles into a single
41113 ///    special-purpose shuffle.
41114 /// 2) Look for sequences of shuffle instructions with 3 or more total
41115 ///    instructions, and replace them with the slightly more expensive SSSE3
41116 ///    PSHUFB instruction if available. We do this as the last combining step
41117 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
41118 ///    a suitable short sequence of other instructions. The PSHUFB will either
41119 ///    use a register or have to read from memory and so is slightly (but only
41120 ///    slightly) more expensive than the other shuffle instructions.
41121 ///
41122 /// Because this is inherently a quadratic operation (for each shuffle in
41123 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
41124 /// This should never be an issue in practice as the shuffle lowering doesn't
41125 /// produce sequences of more than 8 instructions.
41126 ///
41127 /// FIXME: We will currently miss some cases where the redundant shuffling
41128 /// would simplify under the threshold for PSHUFB formation because of
41129 /// combine-ordering. To fix this, we should do the redundant instruction
41130 /// combining in this recursive walk.
41131 static SDValue combineX86ShufflesRecursively(
41132     ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
41133     ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
41134     unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
41135     bool AllowVariablePerLaneMask, SelectionDAG &DAG,
41136     const X86Subtarget &Subtarget) {
41137   assert(!RootMask.empty() &&
41138          (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
41139          "Illegal shuffle root mask");
41140   MVT RootVT = Root.getSimpleValueType();
41141   assert(RootVT.isVector() && "Shuffles operate on vector types!");
41142   unsigned RootSizeInBits = RootVT.getSizeInBits();
41143
41144   // Bound the depth of our recursive combine because this is ultimately
41145   // quadratic in nature.
41146   if (Depth >= MaxDepth)
41147     return SDValue();
41148
41149   // Directly rip through bitcasts to find the underlying operand.
41150   SDValue Op = SrcOps[SrcOpIndex];
41151   Op = peekThroughOneUseBitcasts(Op);
41152
41153   EVT VT = Op.getValueType();
41154   if (!VT.isVector() || !VT.isSimple())
41155     return SDValue(); // Bail if we hit a non-simple non-vector.
41156
41157   // FIXME: Just bail on f16 for now.
41158   if (VT.getVectorElementType() == MVT::f16)
41159     return SDValue();
41160
41161   assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41162          "Can only combine shuffles upto size of the root op.");
41163
41164   // Create a demanded elts mask from the referenced elements of Op.
41165   APInt OpDemandedElts = APInt::getZero(RootMask.size());
41166   for (int M : RootMask) {
41167     int BaseIdx = RootMask.size() * SrcOpIndex;
41168     if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41169       OpDemandedElts.setBit(M - BaseIdx);
41170   }
41171   if (RootSizeInBits != VT.getSizeInBits()) {
41172     // Op is smaller than Root - extract the demanded elts for the subvector.
41173     unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41174     unsigned NumOpMaskElts = RootMask.size() / Scale;
41175     assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41176     assert(OpDemandedElts
41177                .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41178                .isZero() &&
41179            "Out of range elements referenced in root mask");
41180     OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41181   }
41182   OpDemandedElts =
41183       APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
41184
41185   // Extract target shuffle mask and resolve sentinels and inputs.
41186   SmallVector<int, 64> OpMask;
41187   SmallVector<SDValue, 2> OpInputs;
41188   APInt OpUndef, OpZero;
41189   bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
41190   if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41191                              OpZero, DAG, Depth, false)) {
41192     // Shuffle inputs must not be larger than the shuffle result.
41193     // TODO: Relax this for single input faux shuffles (e.g. trunc).
41194     if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41195           return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41196         }))
41197       return SDValue();
41198   } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41199              (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41200              !isNullConstant(Op.getOperand(1))) {
41201     SDValue SrcVec = Op.getOperand(0);
41202     int ExtractIdx = Op.getConstantOperandVal(1);
41203     unsigned NumElts = VT.getVectorNumElements();
41204     OpInputs.assign({SrcVec});
41205     OpMask.assign(NumElts, SM_SentinelUndef);
41206     std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41207     OpZero = OpUndef = APInt::getZero(NumElts);
41208   } else {
41209     return SDValue();
41210   }
41211
41212   // If the shuffle result was smaller than the root, we need to adjust the
41213   // mask indices and pad the mask with undefs.
41214   if (RootSizeInBits > VT.getSizeInBits()) {
41215     unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41216     unsigned OpMaskSize = OpMask.size();
41217     if (OpInputs.size() > 1) {
41218       unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41219       for (int &M : OpMask) {
41220         if (M < 0)
41221           continue;
41222         int EltIdx = M % OpMaskSize;
41223         int OpIdx = M / OpMaskSize;
41224         M = (PaddedMaskSize * OpIdx) + EltIdx;
41225       }
41226     }
41227     OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41228     OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41229     OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41230   }
41231
41232   SmallVector<int, 64> Mask;
41233   SmallVector<SDValue, 16> Ops;
41234
41235   // We don't need to merge masks if the root is empty.
41236   bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41237   if (EmptyRoot) {
41238     // Only resolve zeros if it will remove an input, otherwise we might end
41239     // up in an infinite loop.
41240     bool ResolveKnownZeros = true;
41241     if (!OpZero.isZero()) {
41242       APInt UsedInputs = APInt::getZero(OpInputs.size());
41243       for (int i = 0, e = OpMask.size(); i != e; ++i) {
41244         int M = OpMask[i];
41245         if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41246           continue;
41247         UsedInputs.setBit(M / OpMask.size());
41248         if (UsedInputs.isAllOnes()) {
41249           ResolveKnownZeros = false;
41250           break;
41251         }
41252       }
41253     }
41254     resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41255                                       ResolveKnownZeros);
41256
41257     Mask = OpMask;
41258     Ops.append(OpInputs.begin(), OpInputs.end());
41259   } else {
41260     resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41261
41262     // Add the inputs to the Ops list, avoiding duplicates.
41263     Ops.append(SrcOps.begin(), SrcOps.end());
41264
41265     auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41266       // Attempt to find an existing match.
41267       SDValue InputBC = peekThroughBitcasts(Input);
41268       for (int i = 0, e = Ops.size(); i < e; ++i)
41269         if (InputBC == peekThroughBitcasts(Ops[i]))
41270           return i;
41271       // Match failed - should we replace an existing Op?
41272       if (InsertionPoint >= 0) {
41273         Ops[InsertionPoint] = Input;
41274         return InsertionPoint;
41275       }
41276       // Add to the end of the Ops list.
41277       Ops.push_back(Input);
41278       return Ops.size() - 1;
41279     };
41280
41281     SmallVector<int, 2> OpInputIdx;
41282     for (SDValue OpInput : OpInputs)
41283       OpInputIdx.push_back(
41284           AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41285
41286     assert(((RootMask.size() > OpMask.size() &&
41287              RootMask.size() % OpMask.size() == 0) ||
41288             (OpMask.size() > RootMask.size() &&
41289              OpMask.size() % RootMask.size() == 0) ||
41290             OpMask.size() == RootMask.size()) &&
41291            "The smaller number of elements must divide the larger.");
41292
41293     // This function can be performance-critical, so we rely on the power-of-2
41294     // knowledge that we have about the mask sizes to replace div/rem ops with
41295     // bit-masks and shifts.
41296     assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
41297            "Non-power-of-2 shuffle mask sizes");
41298     assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
41299            "Non-power-of-2 shuffle mask sizes");
41300     unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41301     unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41302
41303     unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41304     unsigned RootRatio =
41305         std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41306     unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41307     assert((RootRatio == 1 || OpRatio == 1) &&
41308            "Must not have a ratio for both incoming and op masks!");
41309
41310     assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41311     assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41312     assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41313     unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41314     unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41315
41316     Mask.resize(MaskWidth, SM_SentinelUndef);
41317
41318     // Merge this shuffle operation's mask into our accumulated mask. Note that
41319     // this shuffle's mask will be the first applied to the input, followed by
41320     // the root mask to get us all the way to the root value arrangement. The
41321     // reason for this order is that we are recursing up the operation chain.
41322     for (unsigned i = 0; i < MaskWidth; ++i) {
41323       unsigned RootIdx = i >> RootRatioLog2;
41324       if (RootMask[RootIdx] < 0) {
41325         // This is a zero or undef lane, we're done.
41326         Mask[i] = RootMask[RootIdx];
41327         continue;
41328       }
41329
41330       unsigned RootMaskedIdx =
41331           RootRatio == 1
41332               ? RootMask[RootIdx]
41333               : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41334
41335       // Just insert the scaled root mask value if it references an input other
41336       // than the SrcOp we're currently inserting.
41337       if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41338           (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41339         Mask[i] = RootMaskedIdx;
41340         continue;
41341       }
41342
41343       RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41344       unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41345       if (OpMask[OpIdx] < 0) {
41346         // The incoming lanes are zero or undef, it doesn't matter which ones we
41347         // are using.
41348         Mask[i] = OpMask[OpIdx];
41349         continue;
41350       }
41351
41352       // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41353       unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41354                                           : (OpMask[OpIdx] << OpRatioLog2) +
41355                                                 (RootMaskedIdx & (OpRatio - 1));
41356
41357       OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41358       int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41359       assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41360       OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41361
41362       Mask[i] = OpMaskedIdx;
41363     }
41364   }
41365
41366   // Peek through vector widenings and set out of bounds mask indices to undef.
41367   // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41368   for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
41369     SDValue &Op = Ops[I];
41370     if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
41371         isNullConstant(Op.getOperand(2))) {
41372       Op = Op.getOperand(1);
41373       unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41374       int Lo = I * Mask.size();
41375       int Hi = (I + 1) * Mask.size();
41376       int NewHi = Lo + (Mask.size() / Scale);
41377       for (int &M : Mask) {
41378         if (Lo <= M && NewHi <= M && M < Hi)
41379           M = SM_SentinelUndef;
41380       }
41381     }
41382   }
41383
41384   // Peek through any free extract_subvector nodes back to root size.
41385   for (SDValue &Op : Ops)
41386     while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41387            (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41388            isNullConstant(Op.getOperand(1)))
41389       Op = Op.getOperand(0);
41390
41391   // Remove unused/repeated shuffle source ops.
41392   resolveTargetShuffleInputsAndMask(Ops, Mask);
41393
41394   // Handle the all undef/zero/ones cases early.
41395   if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41396     return DAG.getUNDEF(RootVT);
41397   if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41398     return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
41399   if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41400       !llvm::is_contained(Mask, SM_SentinelZero))
41401     return getOnesVector(RootVT, DAG, SDLoc(Root));
41402
41403   assert(!Ops.empty() && "Shuffle with no inputs detected");
41404   HasVariableMask |= IsOpVariableMask;
41405
41406   // Update the list of shuffle nodes that have been combined so far.
41407   SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
41408                                                 SrcNodes.end());
41409   CombinedNodes.push_back(Op.getNode());
41410
41411   // See if we can recurse into each shuffle source op (if it's a target
41412   // shuffle). The source op should only be generally combined if it either has
41413   // a single use (i.e. current Op) or all its users have already been combined,
41414   // if not then we can still combine but should prevent generation of variable
41415   // shuffles to avoid constant pool bloat.
41416   // Don't recurse if we already have more source ops than we can combine in
41417   // the remaining recursion depth.
41418   if (Ops.size() < (MaxDepth - Depth)) {
41419     for (int i = 0, e = Ops.size(); i < e; ++i) {
41420       // For empty roots, we need to resolve zeroable elements before combining
41421       // them with other shuffles.
41422       SmallVector<int, 64> ResolvedMask = Mask;
41423       if (EmptyRoot)
41424         resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41425       bool AllowCrossLaneVar = false;
41426       bool AllowPerLaneVar = false;
41427       if (Ops[i].getNode()->hasOneUse() ||
41428           SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41429         AllowCrossLaneVar = AllowVariableCrossLaneMask;
41430         AllowPerLaneVar = AllowVariablePerLaneMask;
41431       }
41432       if (SDValue Res = combineX86ShufflesRecursively(
41433               Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
41434               HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
41435               Subtarget))
41436         return Res;
41437     }
41438   }
41439
41440   // Attempt to constant fold all of the constant source ops.
41441   if (SDValue Cst = combineX86ShufflesConstants(
41442           Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
41443     return Cst;
41444
41445   // If constant fold failed and we only have constants - then we have
41446   // multiple uses by a single non-variable shuffle - just bail.
41447   if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41448         APInt UndefElts;
41449         SmallVector<APInt> RawBits;
41450         unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41451         return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41452                                              RawBits);
41453       })) {
41454     return SDValue();
41455   }
41456
41457   // Canonicalize the combined shuffle mask chain with horizontal ops.
41458   // NOTE: This will update the Ops and Mask.
41459   if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41460           Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
41461     return DAG.getBitcast(RootVT, HOp);
41462
41463   // Try to refine our inputs given our knowledge of target shuffle mask.
41464   for (auto I : enumerate(Ops)) {
41465     int OpIdx = I.index();
41466     SDValue &Op = I.value();
41467
41468     // What range of shuffle mask element values results in picking from Op?
41469     int Lo = OpIdx * Mask.size();
41470     int Hi = Lo + Mask.size();
41471
41472     // Which elements of Op do we demand, given the mask's granularity?
41473     APInt OpDemandedElts(Mask.size(), 0);
41474     for (int MaskElt : Mask) {
41475       if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41476         int OpEltIdx = MaskElt - Lo;
41477         OpDemandedElts.setBit(OpEltIdx);
41478       }
41479     }
41480
41481     // Is the shuffle result smaller than the root?
41482     if (Op.getValueSizeInBits() < RootSizeInBits) {
41483       // We padded the mask with undefs. But we now need to undo that.
41484       unsigned NumExpectedVectorElts = Mask.size();
41485       unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41486       unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41487       assert(!OpDemandedElts.extractBits(
41488                  NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41489              "Demanding the virtual undef widening padding?");
41490       OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41491     }
41492
41493     // The Op itself may be of different VT, so we need to scale the mask.
41494     unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41495     APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41496
41497     // Can this operand be simplified any further, given it's demanded elements?
41498     if (SDValue NewOp =
41499             DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
41500                 Op, OpScaledDemandedElts, DAG))
41501       Op = NewOp;
41502   }
41503   // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41504
41505   // Widen any subvector shuffle inputs we've collected.
41506   // TODO: Remove this to avoid generating temporary nodes, we should only
41507   // widen once combineX86ShuffleChain has found a match.
41508   if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41509         return Op.getValueSizeInBits() < RootSizeInBits;
41510       })) {
41511     for (SDValue &Op : Ops)
41512       if (Op.getValueSizeInBits() < RootSizeInBits)
41513         Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41514                             RootSizeInBits);
41515     // Reresolve - we might have repeated subvector sources.
41516     resolveTargetShuffleInputsAndMask(Ops, Mask);
41517   }
41518
41519   // We can only combine unary and binary shuffle mask cases.
41520   if (Ops.size() <= 2) {
41521     // Minor canonicalization of the accumulated shuffle mask to make it easier
41522     // to match below. All this does is detect masks with sequential pairs of
41523     // elements, and shrink them to the half-width mask. It does this in a loop
41524     // so it will reduce the size of the mask to the minimal width mask which
41525     // performs an equivalent shuffle.
41526     while (Mask.size() > 1) {
41527       SmallVector<int, 64> WidenedMask;
41528       if (!canWidenShuffleElements(Mask, WidenedMask))
41529         break;
41530       Mask = std::move(WidenedMask);
41531     }
41532
41533     // Canonicalization of binary shuffle masks to improve pattern matching by
41534     // commuting the inputs.
41535     if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41536       ShuffleVectorSDNode::commuteMask(Mask);
41537       std::swap(Ops[0], Ops[1]);
41538     }
41539
41540     // Try to combine into a single shuffle instruction.
41541     if (SDValue Shuffle = combineX86ShuffleChain(
41542             Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41543             AllowVariablePerLaneMask, DAG, Subtarget))
41544       return Shuffle;
41545
41546     // If all the operands come from the same larger vector, fallthrough and try
41547     // to use combineX86ShuffleChainWithExtract.
41548     SDValue LHS = peekThroughBitcasts(Ops.front());
41549     SDValue RHS = peekThroughBitcasts(Ops.back());
41550     if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41551         (RootSizeInBits / Mask.size()) != 64 ||
41552         LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41553         RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41554         LHS.getOperand(0) != RHS.getOperand(0))
41555       return SDValue();
41556   }
41557
41558   // If that failed and any input is extracted then try to combine as a
41559   // shuffle with the larger type.
41560   return combineX86ShuffleChainWithExtract(
41561       Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41562       AllowVariablePerLaneMask, DAG, Subtarget);
41563 }
41564
41565 /// Helper entry wrapper to combineX86ShufflesRecursively.
41566 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41567                                              const X86Subtarget &Subtarget) {
41568   return combineX86ShufflesRecursively(
41569       {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
41570       /*HasVarMask*/ false,
41571       /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
41572       Subtarget);
41573 }
41574
41575 /// Get the PSHUF-style mask from PSHUF node.
41576 ///
41577 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
41578 /// PSHUF-style masks that can be reused with such instructions.
41579 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41580   MVT VT = N.getSimpleValueType();
41581   SmallVector<int, 4> Mask;
41582   SmallVector<SDValue, 2> Ops;
41583   bool HaveMask =
41584       getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
41585   (void)HaveMask;
41586   assert(HaveMask);
41587
41588   // If we have more than 128-bits, only the low 128-bits of shuffle mask
41589   // matter. Check that the upper masks are repeats and remove them.
41590   if (VT.getSizeInBits() > 128) {
41591     int LaneElts = 128 / VT.getScalarSizeInBits();
41592 #ifndef NDEBUG
41593     for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41594       for (int j = 0; j < LaneElts; ++j)
41595         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41596                "Mask doesn't repeat in high 128-bit lanes!");
41597 #endif
41598     Mask.resize(LaneElts);
41599   }
41600
41601   switch (N.getOpcode()) {
41602   case X86ISD::PSHUFD:
41603     return Mask;
41604   case X86ISD::PSHUFLW:
41605     Mask.resize(4);
41606     return Mask;
41607   case X86ISD::PSHUFHW:
41608     Mask.erase(Mask.begin(), Mask.begin() + 4);
41609     for (int &M : Mask)
41610       M -= 4;
41611     return Mask;
41612   default:
41613     llvm_unreachable("No valid shuffle instruction found!");
41614   }
41615 }
41616
41617 /// Search for a combinable shuffle across a chain ending in pshufd.
41618 ///
41619 /// We walk up the chain and look for a combinable shuffle, skipping over
41620 /// shuffles that we could hoist this shuffle's transformation past without
41621 /// altering anything.
41622 static SDValue
41623 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
41624                              SelectionDAG &DAG) {
41625   assert(N.getOpcode() == X86ISD::PSHUFD &&
41626          "Called with something other than an x86 128-bit half shuffle!");
41627   SDLoc DL(N);
41628
41629   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41630   // of the shuffles in the chain so that we can form a fresh chain to replace
41631   // this one.
41632   SmallVector<SDValue, 8> Chain;
41633   SDValue V = N.getOperand(0);
41634   for (; V.hasOneUse(); V = V.getOperand(0)) {
41635     switch (V.getOpcode()) {
41636     default:
41637       return SDValue(); // Nothing combined!
41638
41639     case ISD::BITCAST:
41640       // Skip bitcasts as we always know the type for the target specific
41641       // instructions.
41642       continue;
41643
41644     case X86ISD::PSHUFD:
41645       // Found another dword shuffle.
41646       break;
41647
41648     case X86ISD::PSHUFLW:
41649       // Check that the low words (being shuffled) are the identity in the
41650       // dword shuffle, and the high words are self-contained.
41651       if (Mask[0] != 0 || Mask[1] != 1 ||
41652           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41653         return SDValue();
41654
41655       Chain.push_back(V);
41656       continue;
41657
41658     case X86ISD::PSHUFHW:
41659       // Check that the high words (being shuffled) are the identity in the
41660       // dword shuffle, and the low words are self-contained.
41661       if (Mask[2] != 2 || Mask[3] != 3 ||
41662           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41663         return SDValue();
41664
41665       Chain.push_back(V);
41666       continue;
41667
41668     case X86ISD::UNPCKL:
41669     case X86ISD::UNPCKH:
41670       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41671       // shuffle into a preceding word shuffle.
41672       if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41673           V.getSimpleValueType().getVectorElementType() != MVT::i16)
41674         return SDValue();
41675
41676       // Search for a half-shuffle which we can combine with.
41677       unsigned CombineOp =
41678           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41679       if (V.getOperand(0) != V.getOperand(1) ||
41680           !V->isOnlyUserOf(V.getOperand(0).getNode()))
41681         return SDValue();
41682       Chain.push_back(V);
41683       V = V.getOperand(0);
41684       do {
41685         switch (V.getOpcode()) {
41686         default:
41687           return SDValue(); // Nothing to combine.
41688
41689         case X86ISD::PSHUFLW:
41690         case X86ISD::PSHUFHW:
41691           if (V.getOpcode() == CombineOp)
41692             break;
41693
41694           Chain.push_back(V);
41695
41696           [[fallthrough]];
41697         case ISD::BITCAST:
41698           V = V.getOperand(0);
41699           continue;
41700         }
41701         break;
41702       } while (V.hasOneUse());
41703       break;
41704     }
41705     // Break out of the loop if we break out of the switch.
41706     break;
41707   }
41708
41709   if (!V.hasOneUse())
41710     // We fell out of the loop without finding a viable combining instruction.
41711     return SDValue();
41712
41713   // Merge this node's mask and our incoming mask.
41714   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41715   for (int &M : Mask)
41716     M = VMask[M];
41717   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41718                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41719
41720   // Rebuild the chain around this new shuffle.
41721   while (!Chain.empty()) {
41722     SDValue W = Chain.pop_back_val();
41723
41724     if (V.getValueType() != W.getOperand(0).getValueType())
41725       V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41726
41727     switch (W.getOpcode()) {
41728     default:
41729       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41730
41731     case X86ISD::UNPCKL:
41732     case X86ISD::UNPCKH:
41733       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41734       break;
41735
41736     case X86ISD::PSHUFD:
41737     case X86ISD::PSHUFLW:
41738     case X86ISD::PSHUFHW:
41739       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41740       break;
41741     }
41742   }
41743   if (V.getValueType() != N.getValueType())
41744     V = DAG.getBitcast(N.getValueType(), V);
41745
41746   // Return the new chain to replace N.
41747   return V;
41748 }
41749
41750 // Attempt to commute shufps LHS loads:
41751 // permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41752 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41753                                       SelectionDAG &DAG) {
41754   // TODO: Add vXf64 support.
41755   if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41756     return SDValue();
41757
41758   // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41759   auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41760     if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41761       return SDValue();
41762     SDValue N0 = V.getOperand(0);
41763     SDValue N1 = V.getOperand(1);
41764     unsigned Imm = V.getConstantOperandVal(2);
41765     const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41766     if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41767         X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41768       return SDValue();
41769     Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41770     return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41771                        DAG.getTargetConstant(Imm, DL, MVT::i8));
41772   };
41773
41774   switch (N.getOpcode()) {
41775   case X86ISD::VPERMILPI:
41776     if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41777       unsigned Imm = N.getConstantOperandVal(1);
41778       return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41779                          DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41780     }
41781     break;
41782   case X86ISD::SHUFP: {
41783     SDValue N0 = N.getOperand(0);
41784     SDValue N1 = N.getOperand(1);
41785     unsigned Imm = N.getConstantOperandVal(2);
41786     if (N0 == N1) {
41787       if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41788         return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41789                            DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41790     } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41791       return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41792                          DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41793     } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41794       return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41795                          DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41796     }
41797     break;
41798   }
41799   }
41800
41801   return SDValue();
41802 }
41803
41804 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41805 static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
41806                                              const SDLoc &DL) {
41807   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41808   EVT ShuffleVT = N.getValueType();
41809
41810   auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
41811     // AllZeros/AllOnes constants are freely shuffled and will peek through
41812     // bitcasts. Other constant build vectors do not peek through bitcasts. Only
41813     // merge with target shuffles if it has one use so shuffle combining is
41814     // likely to kick in. Shuffles of splats are expected to be removed.
41815     return ISD::isBuildVectorAllOnes(Op.getNode()) ||
41816            ISD::isBuildVectorAllZeros(Op.getNode()) ||
41817            ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
41818            ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
41819            (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
41820            (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
41821            (FoldLoad && isShuffleFoldableLoad(Op)) ||
41822            DAG.isSplatValue(Op, /*AllowUndefs*/ false);
41823   };
41824   auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
41825     // Ensure we only shuffle whole vector src elements, unless its a logical
41826     // binops where we can more aggressively move shuffles from dst to src.
41827     return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
41828            BinOp == X86ISD::ANDNP ||
41829            (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
41830   };
41831
41832   unsigned Opc = N.getOpcode();
41833   switch (Opc) {
41834   // Unary and Unary+Permute Shuffles.
41835   case X86ISD::PSHUFB: {
41836     // Don't merge PSHUFB if it contains zero'd elements.
41837     SmallVector<int> Mask;
41838     SmallVector<SDValue> Ops;
41839     if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
41840                               Mask))
41841       break;
41842     [[fallthrough]];
41843   }
41844   case X86ISD::VBROADCAST:
41845   case X86ISD::MOVDDUP:
41846   case X86ISD::PSHUFD:
41847   case X86ISD::PSHUFHW:
41848   case X86ISD::PSHUFLW:
41849   case X86ISD::VPERMI:
41850   case X86ISD::VPERMILPI: {
41851     if (N.getOperand(0).getValueType() == ShuffleVT &&
41852         N->isOnlyUserOf(N.getOperand(0).getNode())) {
41853       SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41854       unsigned SrcOpcode = N0.getOpcode();
41855       if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
41856         SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41857         SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41858         if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
41859             IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
41860           SDValue LHS, RHS;
41861           Op00 = DAG.getBitcast(ShuffleVT, Op00);
41862           Op01 = DAG.getBitcast(ShuffleVT, Op01);
41863           if (N.getNumOperands() == 2) {
41864             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
41865             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
41866           } else {
41867             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
41868             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
41869           }
41870           EVT OpVT = N0.getValueType();
41871           return DAG.getBitcast(ShuffleVT,
41872                                 DAG.getNode(SrcOpcode, DL, OpVT,
41873                                             DAG.getBitcast(OpVT, LHS),
41874                                             DAG.getBitcast(OpVT, RHS)));
41875         }
41876       }
41877     }
41878     break;
41879   }
41880   // Binary and Binary+Permute Shuffles.
41881   case X86ISD::INSERTPS: {
41882     // Don't merge INSERTPS if it contains zero'd elements.
41883     unsigned InsertPSMask = N.getConstantOperandVal(2);
41884     unsigned ZeroMask = InsertPSMask & 0xF;
41885     if (ZeroMask != 0)
41886       break;
41887     [[fallthrough]];
41888   }
41889   case X86ISD::MOVSD:
41890   case X86ISD::MOVSS:
41891   case X86ISD::BLENDI:
41892   case X86ISD::SHUFP:
41893   case X86ISD::UNPCKH:
41894   case X86ISD::UNPCKL: {
41895     if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
41896         N->isOnlyUserOf(N.getOperand(1).getNode())) {
41897       SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41898       SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
41899       unsigned SrcOpcode = N0.getOpcode();
41900       if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
41901           N0.getValueType() == N1.getValueType() &&
41902           IsSafeToMoveShuffle(N0, SrcOpcode) &&
41903           IsSafeToMoveShuffle(N1, SrcOpcode)) {
41904         SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41905         SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
41906         SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41907         SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
41908         // Ensure the total number of shuffles doesn't increase by folding this
41909         // shuffle through to the source ops.
41910         if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
41911              (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
41912             ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
41913              (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
41914           SDValue LHS, RHS;
41915           Op00 = DAG.getBitcast(ShuffleVT, Op00);
41916           Op10 = DAG.getBitcast(ShuffleVT, Op10);
41917           Op01 = DAG.getBitcast(ShuffleVT, Op01);
41918           Op11 = DAG.getBitcast(ShuffleVT, Op11);
41919           if (N.getNumOperands() == 3) {
41920             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
41921             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
41922           } else {
41923             LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
41924             RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
41925           }
41926           EVT OpVT = N0.getValueType();
41927           return DAG.getBitcast(ShuffleVT,
41928                                 DAG.getNode(SrcOpcode, DL, OpVT,
41929                                             DAG.getBitcast(OpVT, LHS),
41930                                             DAG.getBitcast(OpVT, RHS)));
41931         }
41932       }
41933     }
41934     break;
41935   }
41936   }
41937   return SDValue();
41938 }
41939
41940 /// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
41941 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
41942                                                       SelectionDAG &DAG,
41943                                                       const SDLoc &DL) {
41944   assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
41945
41946   MVT VT = V.getSimpleValueType();
41947   SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
41948   SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
41949   unsigned SrcOpc0 = Src0.getOpcode();
41950   unsigned SrcOpc1 = Src1.getOpcode();
41951   EVT SrcVT0 = Src0.getValueType();
41952   EVT SrcVT1 = Src1.getValueType();
41953
41954   if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
41955     return SDValue();
41956
41957   switch (SrcOpc0) {
41958   case X86ISD::MOVDDUP: {
41959     SDValue LHS = Src0.getOperand(0);
41960     SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41961     SDValue Res =
41962         DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
41963     Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
41964     return DAG.getBitcast(VT, Res);
41965   }
41966   case X86ISD::VPERMILPI:
41967     // TODO: Handle v4f64 permutes with different low/high lane masks.
41968     if (SrcVT0 == MVT::v4f64) {
41969       uint64_t Mask = Src0.getConstantOperandVal(1);
41970       if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
41971         break;
41972     }
41973     [[fallthrough]];
41974   case X86ISD::VSHLI:
41975   case X86ISD::VSRLI:
41976   case X86ISD::VSRAI:
41977   case X86ISD::PSHUFD:
41978     if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
41979       SDValue LHS = Src0.getOperand(0);
41980       SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41981       SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
41982                                 V.getOperand(2));
41983       Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
41984       return DAG.getBitcast(VT, Res);
41985     }
41986     break;
41987   }
41988
41989   return SDValue();
41990 }
41991
41992 /// Try to combine x86 target specific shuffles.
41993 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
41994                                     TargetLowering::DAGCombinerInfo &DCI,
41995                                     const X86Subtarget &Subtarget) {
41996   SDLoc DL(N);
41997   MVT VT = N.getSimpleValueType();
41998   SmallVector<int, 4> Mask;
41999   unsigned Opcode = N.getOpcode();
42000
42001   if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
42002     return R;
42003
42004   // Handle specific target shuffles.
42005   switch (Opcode) {
42006   case X86ISD::MOVDDUP: {
42007     SDValue Src = N.getOperand(0);
42008     // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
42009     if (VT == MVT::v2f64 && Src.hasOneUse() &&
42010         ISD::isNormalLoad(Src.getNode())) {
42011       LoadSDNode *LN = cast<LoadSDNode>(Src);
42012       if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
42013         SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
42014         DCI.CombineTo(N.getNode(), Movddup);
42015         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42016         DCI.recursivelyDeleteUnusedNodes(LN);
42017         return N; // Return N so it doesn't get rechecked!
42018       }
42019     }
42020
42021     return SDValue();
42022   }
42023   case X86ISD::VBROADCAST: {
42024     SDValue Src = N.getOperand(0);
42025     SDValue BC = peekThroughBitcasts(Src);
42026     EVT SrcVT = Src.getValueType();
42027     EVT BCVT = BC.getValueType();
42028
42029     // If broadcasting from another shuffle, attempt to simplify it.
42030     // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
42031     if (isTargetShuffle(BC.getOpcode()) &&
42032         VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
42033       unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
42034       SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
42035                                         SM_SentinelUndef);
42036       for (unsigned i = 0; i != Scale; ++i)
42037         DemandedMask[i] = i;
42038       if (SDValue Res = combineX86ShufflesRecursively(
42039               {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
42040               X86::MaxShuffleCombineDepth,
42041               /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
42042               /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
42043         return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42044                            DAG.getBitcast(SrcVT, Res));
42045     }
42046
42047     // broadcast(bitcast(src)) -> bitcast(broadcast(src))
42048     // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
42049     if (Src.getOpcode() == ISD::BITCAST &&
42050         SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
42051         DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
42052         FixedVectorType::isValidElementType(
42053             BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
42054       EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
42055                                    VT.getVectorNumElements());
42056       return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42057     }
42058
42059     // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
42060     // If we're re-broadcasting a smaller type then broadcast with that type and
42061     // bitcast.
42062     // TODO: Do this for any splat?
42063     if (Src.getOpcode() == ISD::BITCAST &&
42064         (BC.getOpcode() == X86ISD::VBROADCAST ||
42065          BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
42066         (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
42067         (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
42068       MVT NewVT =
42069           MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
42070                            VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42071       return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42072     }
42073
42074     // Reduce broadcast source vector to lowest 128-bits.
42075     if (SrcVT.getSizeInBits() > 128)
42076       return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42077                          extract128BitVector(Src, 0, DAG, DL));
42078
42079     // broadcast(scalar_to_vector(x)) -> broadcast(x).
42080     if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
42081       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42082
42083     // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
42084     if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42085         isNullConstant(Src.getOperand(1)) &&
42086         DAG.getTargetLoweringInfo().isTypeLegal(
42087             Src.getOperand(0).getValueType()))
42088       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42089
42090     // Share broadcast with the longest vector and extract low subvector (free).
42091     // Ensure the same SDValue from the SDNode use is being used.
42092     for (SDNode *User : Src->uses())
42093       if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
42094           Src == User->getOperand(0) &&
42095           User->getValueSizeInBits(0).getFixedValue() >
42096               VT.getFixedSizeInBits()) {
42097         return extractSubVector(SDValue(User, 0), 0, DAG, DL,
42098                                 VT.getSizeInBits());
42099       }
42100
42101     // vbroadcast(scalarload X) -> vbroadcast_load X
42102     // For float loads, extract other uses of the scalar from the broadcast.
42103     if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
42104         ISD::isNormalLoad(Src.getNode())) {
42105       LoadSDNode *LN = cast<LoadSDNode>(Src);
42106       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42107       SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42108       SDValue BcastLd =
42109           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42110                                   LN->getMemoryVT(), LN->getMemOperand());
42111       // If the load value is used only by N, replace it via CombineTo N.
42112       bool NoReplaceExtract = Src.hasOneUse();
42113       DCI.CombineTo(N.getNode(), BcastLd);
42114       if (NoReplaceExtract) {
42115         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42116         DCI.recursivelyDeleteUnusedNodes(LN);
42117       } else {
42118         SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
42119                                   DAG.getIntPtrConstant(0, DL));
42120         DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
42121       }
42122       return N; // Return N so it doesn't get rechecked!
42123     }
42124
42125     // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
42126     // i16. So shrink it ourselves if we can make a broadcast_load.
42127     if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
42128         Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
42129       assert(Subtarget.hasAVX2() && "Expected AVX2");
42130       SDValue TruncIn = Src.getOperand(0);
42131
42132       // If this is a truncate of a non extending load we can just narrow it to
42133       // use a broadcast_load.
42134       if (ISD::isNormalLoad(TruncIn.getNode())) {
42135         LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
42136         // Unless its volatile or atomic.
42137         if (LN->isSimple()) {
42138           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42139           SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42140           SDValue BcastLd = DAG.getMemIntrinsicNode(
42141               X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42142               LN->getPointerInfo(), LN->getOriginalAlign(),
42143               LN->getMemOperand()->getFlags());
42144           DCI.CombineTo(N.getNode(), BcastLd);
42145           DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42146           DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42147           return N; // Return N so it doesn't get rechecked!
42148         }
42149       }
42150
42151       // If this is a truncate of an i16 extload, we can directly replace it.
42152       if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
42153           ISD::isEXTLoad(Src.getOperand(0).getNode())) {
42154         LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
42155         if (LN->getMemoryVT().getSizeInBits() == 16) {
42156           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42157           SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42158           SDValue BcastLd =
42159               DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42160                                       LN->getMemoryVT(), LN->getMemOperand());
42161           DCI.CombineTo(N.getNode(), BcastLd);
42162           DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42163           DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42164           return N; // Return N so it doesn't get rechecked!
42165         }
42166       }
42167
42168       // If this is a truncate of load that has been shifted right, we can
42169       // offset the pointer and use a narrower load.
42170       if (TruncIn.getOpcode() == ISD::SRL &&
42171           TruncIn.getOperand(0).hasOneUse() &&
42172           isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42173           ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42174         LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42175         unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42176         // Make sure the shift amount and the load size are divisible by 16.
42177         // Don't do this if the load is volatile or atomic.
42178         if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42179             LN->isSimple()) {
42180           unsigned Offset = ShiftAmt / 8;
42181           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42182           SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
42183                                                  TypeSize::Fixed(Offset), DL);
42184           SDValue Ops[] = { LN->getChain(), Ptr };
42185           SDValue BcastLd = DAG.getMemIntrinsicNode(
42186               X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42187               LN->getPointerInfo().getWithOffset(Offset),
42188               LN->getOriginalAlign(),
42189               LN->getMemOperand()->getFlags());
42190           DCI.CombineTo(N.getNode(), BcastLd);
42191           DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42192           DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42193           return N; // Return N so it doesn't get rechecked!
42194         }
42195       }
42196     }
42197
42198     // vbroadcast(vzload X) -> vbroadcast_load X
42199     if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42200       MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
42201       if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42202         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42203         SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42204         SDValue BcastLd =
42205             DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42206                                     LN->getMemoryVT(), LN->getMemOperand());
42207         DCI.CombineTo(N.getNode(), BcastLd);
42208         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42209         DCI.recursivelyDeleteUnusedNodes(LN);
42210         return N; // Return N so it doesn't get rechecked!
42211       }
42212     }
42213
42214     // vbroadcast(vector load X) -> vbroadcast_load
42215     if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
42216          SrcVT == MVT::v4i32) &&
42217         Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42218       LoadSDNode *LN = cast<LoadSDNode>(Src);
42219       // Unless the load is volatile or atomic.
42220       if (LN->isSimple()) {
42221         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42222         SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42223         SDValue BcastLd = DAG.getMemIntrinsicNode(
42224             X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
42225             LN->getPointerInfo(), LN->getOriginalAlign(),
42226             LN->getMemOperand()->getFlags());
42227         DCI.CombineTo(N.getNode(), BcastLd);
42228         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42229         DCI.recursivelyDeleteUnusedNodes(LN);
42230         return N; // Return N so it doesn't get rechecked!
42231       }
42232     }
42233
42234     return SDValue();
42235   }
42236   case X86ISD::VZEXT_MOVL: {
42237     SDValue N0 = N.getOperand(0);
42238
42239     // If this a vzmovl of a full vector load, replace it with a vzload, unless
42240     // the load is volatile.
42241     if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42242       auto *LN = cast<LoadSDNode>(N0);
42243       if (SDValue VZLoad =
42244               narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42245         DCI.CombineTo(N.getNode(), VZLoad);
42246         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42247         DCI.recursivelyDeleteUnusedNodes(LN);
42248         return N;
42249       }
42250     }
42251
42252     // If this a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42253     // and can just use a VZEXT_LOAD.
42254     // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42255     if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42256       auto *LN = cast<MemSDNode>(N0);
42257       if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42258         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42259         SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42260         SDValue VZLoad =
42261             DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42262                                     LN->getMemoryVT(), LN->getMemOperand());
42263         DCI.CombineTo(N.getNode(), VZLoad);
42264         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42265         DCI.recursivelyDeleteUnusedNodes(LN);
42266         return N;
42267       }
42268     }
42269
42270     // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42271     // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42272     // if the upper bits of the i64 are zero.
42273     if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42274         N0.getOperand(0).hasOneUse() &&
42275         N0.getOperand(0).getValueType() == MVT::i64) {
42276       SDValue In = N0.getOperand(0);
42277       APInt Mask = APInt::getHighBitsSet(64, 32);
42278       if (DAG.MaskedValueIsZero(In, Mask)) {
42279         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42280         MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
42281         SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42282         SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42283         return DAG.getBitcast(VT, Movl);
42284       }
42285     }
42286
42287     // Load a scalar integer constant directly to XMM instead of transferring an
42288     // immediate value from GPR.
42289     // vzext_movl (scalar_to_vector C) --> load [C,0...]
42290     if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42291       if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42292         // Create a vector constant - scalar constant followed by zeros.
42293         EVT ScalarVT = N0.getOperand(0).getValueType();
42294         Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42295         unsigned NumElts = VT.getVectorNumElements();
42296         Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42297         SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42298         ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42299
42300         // Load the vector constant from constant pool.
42301         MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
42302         SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42303         MachinePointerInfo MPI =
42304             MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42305         Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42306         return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42307                            MachineMemOperand::MOLoad);
42308       }
42309     }
42310
42311     // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42312     // insert into a zero vector. This helps get VZEXT_MOVL closer to
42313     // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42314     // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42315     if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42316       SDValue V = peekThroughOneUseBitcasts(N0);
42317
42318       if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42319           isNullConstant(V.getOperand(2))) {
42320         SDValue In = V.getOperand(1);
42321         MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42322                                      In.getValueSizeInBits() /
42323                                          VT.getScalarSizeInBits());
42324         In = DAG.getBitcast(SubVT, In);
42325         SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42326         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42327                            getZeroVector(VT, Subtarget, DAG, DL), Movl,
42328                            V.getOperand(2));
42329       }
42330     }
42331
42332     return SDValue();
42333   }
42334   case X86ISD::BLENDI: {
42335     SDValue N0 = N.getOperand(0);
42336     SDValue N1 = N.getOperand(1);
42337
42338     // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42339     // TODO: Handle MVT::v16i16 repeated blend mask.
42340     if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
42341         N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42342       MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42343       if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
42344           SrcVT.getScalarSizeInBits() >= 32) {
42345         unsigned BlendMask = N.getConstantOperandVal(2);
42346         unsigned Size = VT.getVectorNumElements();
42347         unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
42348         BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
42349         return DAG.getBitcast(
42350             VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42351                             N1.getOperand(0),
42352                             DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
42353       }
42354     }
42355     return SDValue();
42356   }
42357   case X86ISD::SHUFP: {
42358     // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42359     // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42360     // TODO: Support types other than v4f32.
42361     if (VT == MVT::v4f32) {
42362       bool Updated = false;
42363       SmallVector<int> Mask;
42364       SmallVector<SDValue> Ops;
42365       if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
42366           Ops.size() == 2) {
42367         for (int i = 0; i != 2; ++i) {
42368           SmallVector<SDValue> SubOps;
42369           SmallVector<int> SubMask, SubScaledMask;
42370           SDValue Sub = peekThroughBitcasts(Ops[i]);
42371           // TODO: Scaling might be easier if we specify the demanded elts.
42372           if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42373               scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42374               SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42375             int Ofs = i * 2;
42376             Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42377             Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42378             Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42379             Updated = true;
42380           }
42381         }
42382       }
42383       if (Updated) {
42384         for (int &M : Mask)
42385           M %= 4;
42386         Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42387         return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42388       }
42389     }
42390     return SDValue();
42391   }
42392   case X86ISD::VPERMI: {
42393     // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42394     // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42395     SDValue N0 = N.getOperand(0);
42396     SDValue N1 = N.getOperand(1);
42397     unsigned EltSizeInBits = VT.getScalarSizeInBits();
42398     if (N0.getOpcode() == ISD::BITCAST &&
42399         N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42400       SDValue Src = N0.getOperand(0);
42401       EVT SrcVT = Src.getValueType();
42402       SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42403       return DAG.getBitcast(VT, Res);
42404     }
42405     return SDValue();
42406   }
42407   case X86ISD::VPERM2X128: {
42408     // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42409     SDValue LHS = N->getOperand(0);
42410     SDValue RHS = N->getOperand(1);
42411     if (LHS.getOpcode() == ISD::BITCAST &&
42412         (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42413       EVT SrcVT = LHS.getOperand(0).getValueType();
42414       if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42415         return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42416                                               DAG.getBitcast(SrcVT, LHS),
42417                                               DAG.getBitcast(SrcVT, RHS),
42418                                               N->getOperand(2)));
42419       }
42420     }
42421
42422     // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42423     if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42424       return Res;
42425
42426     // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42427     // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42428     auto FindSubVector128 = [&](unsigned Idx) {
42429       if (Idx > 3)
42430         return SDValue();
42431       SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42432       SmallVector<SDValue> SubOps;
42433       if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42434         return SubOps[Idx & 1];
42435       unsigned NumElts = Src.getValueType().getVectorNumElements();
42436       if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42437           Src.getOperand(1).getValueSizeInBits() == 128 &&
42438           Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42439         return Src.getOperand(1);
42440       }
42441       return SDValue();
42442     };
42443     unsigned Imm = N.getConstantOperandVal(2);
42444     if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42445       if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42446         MVT SubVT = VT.getHalfNumVectorElementsVT();
42447         SubLo = DAG.getBitcast(SubVT, SubLo);
42448         SubHi = DAG.getBitcast(SubVT, SubHi);
42449         return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42450       }
42451     }
42452     return SDValue();
42453   }
42454   case X86ISD::PSHUFD:
42455   case X86ISD::PSHUFLW:
42456   case X86ISD::PSHUFHW: {
42457     SDValue N0 = N.getOperand(0);
42458     SDValue N1 = N.getOperand(1);
42459     if (N0->hasOneUse()) {
42460       SDValue V = peekThroughOneUseBitcasts(N0);
42461       switch (V.getOpcode()) {
42462       case X86ISD::VSHL:
42463       case X86ISD::VSRL:
42464       case X86ISD::VSRA:
42465       case X86ISD::VSHLI:
42466       case X86ISD::VSRLI:
42467       case X86ISD::VSRAI:
42468       case X86ISD::VROTLI:
42469       case X86ISD::VROTRI: {
42470         MVT InnerVT = V.getSimpleValueType();
42471         if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42472           SDValue Res = DAG.getNode(Opcode, DL, VT,
42473                                     DAG.getBitcast(VT, V.getOperand(0)), N1);
42474           Res = DAG.getBitcast(InnerVT, Res);
42475           Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42476           return DAG.getBitcast(VT, Res);
42477         }
42478         break;
42479       }
42480       }
42481     }
42482
42483     Mask = getPSHUFShuffleMask(N);
42484     assert(Mask.size() == 4);
42485     break;
42486   }
42487   case X86ISD::MOVSD:
42488   case X86ISD::MOVSH:
42489   case X86ISD::MOVSS: {
42490     SDValue N0 = N.getOperand(0);
42491     SDValue N1 = N.getOperand(1);
42492
42493     // Canonicalize scalar FPOps:
42494     // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42495     // If commutable, allow OP(N1[0], N0[0]).
42496     unsigned Opcode1 = N1.getOpcode();
42497     if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42498         Opcode1 == ISD::FDIV) {
42499       SDValue N10 = N1.getOperand(0);
42500       SDValue N11 = N1.getOperand(1);
42501       if (N10 == N0 ||
42502           (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42503         if (N10 != N0)
42504           std::swap(N10, N11);
42505         MVT SVT = VT.getVectorElementType();
42506         SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
42507         N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42508         N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42509         SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42510         SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42511         return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42512       }
42513     }
42514
42515     return SDValue();
42516   }
42517   case X86ISD::INSERTPS: {
42518     assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42519     SDValue Op0 = N.getOperand(0);
42520     SDValue Op1 = N.getOperand(1);
42521     unsigned InsertPSMask = N.getConstantOperandVal(2);
42522     unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42523     unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42524     unsigned ZeroMask = InsertPSMask & 0xF;
42525
42526     // If we zero out all elements from Op0 then we don't need to reference it.
42527     if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42528       return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42529                          DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42530
42531     // If we zero out the element from Op1 then we don't need to reference it.
42532     if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42533       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42534                          DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42535
42536     // Attempt to merge insertps Op1 with an inner target shuffle node.
42537     SmallVector<int, 8> TargetMask1;
42538     SmallVector<SDValue, 2> Ops1;
42539     APInt KnownUndef1, KnownZero1;
42540     if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42541                                      KnownZero1)) {
42542       if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42543         // Zero/UNDEF insertion - zero out element and remove dependency.
42544         InsertPSMask |= (1u << DstIdx);
42545         return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42546                            DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42547       }
42548       // Update insertps mask srcidx and reference the source input directly.
42549       int M = TargetMask1[SrcIdx];
42550       assert(0 <= M && M < 8 && "Shuffle index out of range");
42551       InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42552       Op1 = Ops1[M < 4 ? 0 : 1];
42553       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42554                          DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42555     }
42556
42557     // Attempt to merge insertps Op0 with an inner target shuffle node.
42558     SmallVector<int, 8> TargetMask0;
42559     SmallVector<SDValue, 2> Ops0;
42560     APInt KnownUndef0, KnownZero0;
42561     if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42562                                      KnownZero0)) {
42563       bool Updated = false;
42564       bool UseInput00 = false;
42565       bool UseInput01 = false;
42566       for (int i = 0; i != 4; ++i) {
42567         if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42568           // No change if element is already zero or the inserted element.
42569           continue;
42570         }
42571
42572         if (KnownUndef0[i] || KnownZero0[i]) {
42573           // If the target mask is undef/zero then we must zero the element.
42574           InsertPSMask |= (1u << i);
42575           Updated = true;
42576           continue;
42577         }
42578
42579         // The input vector element must be inline.
42580         int M = TargetMask0[i];
42581         if (M != i && M != (i + 4))
42582           return SDValue();
42583
42584         // Determine which inputs of the target shuffle we're using.
42585         UseInput00 |= (0 <= M && M < 4);
42586         UseInput01 |= (4 <= M);
42587       }
42588
42589       // If we're not using both inputs of the target shuffle then use the
42590       // referenced input directly.
42591       if (UseInput00 && !UseInput01) {
42592         Updated = true;
42593         Op0 = Ops0[0];
42594       } else if (!UseInput00 && UseInput01) {
42595         Updated = true;
42596         Op0 = Ops0[1];
42597       }
42598
42599       if (Updated)
42600         return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42601                            DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42602     }
42603
42604     // If we're inserting an element from a vbroadcast load, fold the
42605     // load into the X86insertps instruction. We need to convert the scalar
42606     // load to a vector and clear the source lane of the INSERTPS control.
42607     if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42608       auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42609       if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42610         SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42611                                    MemIntr->getBasePtr(),
42612                                    MemIntr->getMemOperand());
42613         SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42614                            DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42615                                        Load),
42616                            DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42617         DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42618         return Insert;
42619       }
42620     }
42621
42622     return SDValue();
42623   }
42624   default:
42625     return SDValue();
42626   }
42627
42628   // Nuke no-op shuffles that show up after combining.
42629   if (isNoopShuffleMask(Mask))
42630     return N.getOperand(0);
42631
42632   // Look for simplifications involving one or two shuffle instructions.
42633   SDValue V = N.getOperand(0);
42634   switch (N.getOpcode()) {
42635   default:
42636     break;
42637   case X86ISD::PSHUFLW:
42638   case X86ISD::PSHUFHW:
42639     assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
42640
42641     // See if this reduces to a PSHUFD which is no more expensive and can
42642     // combine with more operations. Note that it has to at least flip the
42643     // dwords as otherwise it would have been removed as a no-op.
42644     if (ArrayRef(Mask).equals({2, 3, 0, 1})) {
42645       int DMask[] = {0, 1, 2, 3};
42646       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
42647       DMask[DOffset + 0] = DOffset + 1;
42648       DMask[DOffset + 1] = DOffset + 0;
42649       MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
42650       V = DAG.getBitcast(DVT, V);
42651       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
42652                       getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
42653       return DAG.getBitcast(VT, V);
42654     }
42655
42656     // Look for shuffle patterns which can be implemented as a single unpack.
42657     // FIXME: This doesn't handle the location of the PSHUFD generically, and
42658     // only works when we have a PSHUFD followed by two half-shuffles.
42659     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
42660         (V.getOpcode() == X86ISD::PSHUFLW ||
42661          V.getOpcode() == X86ISD::PSHUFHW) &&
42662         V.getOpcode() != N.getOpcode() &&
42663         V.hasOneUse() && V.getOperand(0).hasOneUse()) {
42664       SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
42665       if (D.getOpcode() == X86ISD::PSHUFD) {
42666         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
42667         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
42668         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42669         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42670         int WordMask[8];
42671         for (int i = 0; i < 4; ++i) {
42672           WordMask[i + NOffset] = Mask[i] + NOffset;
42673           WordMask[i + VOffset] = VMask[i] + VOffset;
42674         }
42675         // Map the word mask through the DWord mask.
42676         int MappedMask[8];
42677         for (int i = 0; i < 8; ++i)
42678           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
42679         if (ArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
42680             ArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
42681           // We can replace all three shuffles with an unpack.
42682           V = DAG.getBitcast(VT, D.getOperand(0));
42683           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
42684                                                 : X86ISD::UNPCKH,
42685                              DL, VT, V, V);
42686         }
42687       }
42688     }
42689
42690     break;
42691
42692   case X86ISD::PSHUFD:
42693     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
42694       return NewN;
42695
42696     break;
42697   }
42698
42699   return SDValue();
42700 }
42701
42702 /// Checks if the shuffle mask takes subsequent elements
42703 /// alternately from two vectors.
42704 /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
42705 static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
42706
42707   int ParitySrc[2] = {-1, -1};
42708   unsigned Size = Mask.size();
42709   for (unsigned i = 0; i != Size; ++i) {
42710     int M = Mask[i];
42711     if (M < 0)
42712       continue;
42713
42714     // Make sure we are using the matching element from the input.
42715     if ((M % Size) != i)
42716       return false;
42717
42718     // Make sure we use the same input for all elements of the same parity.
42719     int Src = M / Size;
42720     if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
42721       return false;
42722     ParitySrc[i % 2] = Src;
42723   }
42724
42725   // Make sure each input is used.
42726   if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
42727     return false;
42728
42729   Op0Even = ParitySrc[0] == 0;
42730   return true;
42731 }
42732
42733 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
42734 /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
42735 /// are written to the parameters \p Opnd0 and \p Opnd1.
42736 ///
42737 /// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
42738 /// so it is easier to generically match. We also insert dummy vector shuffle
42739 /// nodes for the operands which explicitly discard the lanes which are unused
42740 /// by this operation to try to flow through the rest of the combiner
42741 /// the fact that they're unused.
42742 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
42743                              SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
42744                              bool &IsSubAdd) {
42745
42746   EVT VT = N->getValueType(0);
42747   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42748   if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
42749       !VT.getSimpleVT().isFloatingPoint())
42750     return false;
42751
42752   // We only handle target-independent shuffles.
42753   // FIXME: It would be easy and harmless to use the target shuffle mask
42754   // extraction tool to support more.
42755   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42756     return false;
42757
42758   SDValue V1 = N->getOperand(0);
42759   SDValue V2 = N->getOperand(1);
42760
42761   // Make sure we have an FADD and an FSUB.
42762   if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
42763       (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
42764       V1.getOpcode() == V2.getOpcode())
42765     return false;
42766
42767   // If there are other uses of these operations we can't fold them.
42768   if (!V1->hasOneUse() || !V2->hasOneUse())
42769     return false;
42770
42771   // Ensure that both operations have the same operands. Note that we can
42772   // commute the FADD operands.
42773   SDValue LHS, RHS;
42774   if (V1.getOpcode() == ISD::FSUB) {
42775     LHS = V1->getOperand(0); RHS = V1->getOperand(1);
42776     if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
42777         (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
42778       return false;
42779   } else {
42780     assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
42781     LHS = V2->getOperand(0); RHS = V2->getOperand(1);
42782     if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
42783         (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
42784       return false;
42785   }
42786
42787   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42788   bool Op0Even;
42789   if (!isAddSubOrSubAddMask(Mask, Op0Even))
42790     return false;
42791
42792   // It's a subadd if the vector in the even parity is an FADD.
42793   IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
42794                      : V2->getOpcode() == ISD::FADD;
42795
42796   Opnd0 = LHS;
42797   Opnd1 = RHS;
42798   return true;
42799 }
42800
42801 /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
42802 static SDValue combineShuffleToFMAddSub(SDNode *N,
42803                                         const X86Subtarget &Subtarget,
42804                                         SelectionDAG &DAG) {
42805   // We only handle target-independent shuffles.
42806   // FIXME: It would be easy and harmless to use the target shuffle mask
42807   // extraction tool to support more.
42808   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42809     return SDValue();
42810
42811   MVT VT = N->getSimpleValueType(0);
42812   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42813   if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
42814     return SDValue();
42815
42816   // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c).
42817   SDValue Op0 = N->getOperand(0);
42818   SDValue Op1 = N->getOperand(1);
42819   SDValue FMAdd = Op0, FMSub = Op1;
42820   if (FMSub.getOpcode() != X86ISD::FMSUB)
42821     std::swap(FMAdd, FMSub);
42822
42823   if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
42824       FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
42825       FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
42826       FMAdd.getOperand(2) != FMSub.getOperand(2))
42827     return SDValue();
42828
42829   // Check for correct shuffle mask.
42830   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42831   bool Op0Even;
42832   if (!isAddSubOrSubAddMask(Mask, Op0Even))
42833     return SDValue();
42834
42835   // FMAddSub takes zeroth operand from FMSub node.
42836   SDLoc DL(N);
42837   bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
42838   unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42839   return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
42840                      FMAdd.getOperand(2));
42841 }
42842
42843 /// Try to combine a shuffle into a target-specific add-sub or
42844 /// mul-add-sub node.
42845 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
42846                                                 const X86Subtarget &Subtarget,
42847                                                 SelectionDAG &DAG) {
42848   if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
42849     return V;
42850
42851   SDValue Opnd0, Opnd1;
42852   bool IsSubAdd;
42853   if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
42854     return SDValue();
42855
42856   MVT VT = N->getSimpleValueType(0);
42857   SDLoc DL(N);
42858
42859   // Try to generate X86ISD::FMADDSUB node here.
42860   SDValue Opnd2;
42861   if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
42862     unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42863     return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
42864   }
42865
42866   if (IsSubAdd)
42867     return SDValue();
42868
42869   // Do not generate X86ISD::ADDSUB node for 512-bit types even though
42870   // the ADDSUB idiom has been successfully recognized. There are no known
42871   // X86 targets with 512-bit ADDSUB instructions!
42872   if (VT.is512BitVector())
42873     return SDValue();
42874
42875   // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
42876   // the ADDSUB idiom has been successfully recognized. There are no known
42877   // X86 targets with FP16 ADDSUB instructions!
42878   if (VT.getVectorElementType() == MVT::f16)
42879     return SDValue();
42880
42881   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
42882 }
42883
42884 // We are looking for a shuffle where both sources are concatenated with undef
42885 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
42886 // if we can express this as a single-source shuffle, that's preferable.
42887 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
42888                                            const X86Subtarget &Subtarget) {
42889   if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
42890     return SDValue();
42891
42892   EVT VT = N->getValueType(0);
42893
42894   // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
42895   if (!VT.is128BitVector() && !VT.is256BitVector())
42896     return SDValue();
42897
42898   if (VT.getVectorElementType() != MVT::i32 &&
42899       VT.getVectorElementType() != MVT::i64 &&
42900       VT.getVectorElementType() != MVT::f32 &&
42901       VT.getVectorElementType() != MVT::f64)
42902     return SDValue();
42903
42904   SDValue N0 = N->getOperand(0);
42905   SDValue N1 = N->getOperand(1);
42906
42907   // Check that both sources are concats with undef.
42908   if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
42909       N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
42910       N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
42911       !N1.getOperand(1).isUndef())
42912     return SDValue();
42913
42914   // Construct the new shuffle mask. Elements from the first source retain their
42915   // index, but elements from the second source no longer need to skip an undef.
42916   SmallVector<int, 8> Mask;
42917   int NumElts = VT.getVectorNumElements();
42918
42919   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
42920   for (int Elt : SVOp->getMask())
42921     Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
42922
42923   SDLoc DL(N);
42924   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
42925                                N1.getOperand(0));
42926   return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
42927 }
42928
42929 /// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
42930 /// low half of each source vector and does not set any high half elements in
42931 /// the destination vector, narrow the shuffle to half its original size.
42932 static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
42933   EVT VT = Shuf->getValueType(0);
42934   if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
42935     return SDValue();
42936   if (!VT.is256BitVector() && !VT.is512BitVector())
42937     return SDValue();
42938
42939   // See if we can ignore all of the high elements of the shuffle.
42940   ArrayRef<int> Mask = Shuf->getMask();
42941   if (!isUndefUpperHalf(Mask))
42942     return SDValue();
42943
42944   // Check if the shuffle mask accesses only the low half of each input vector
42945   // (half-index output is 0 or 2).
42946   int HalfIdx1, HalfIdx2;
42947   SmallVector<int, 8> HalfMask(Mask.size() / 2);
42948   if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
42949       (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
42950     return SDValue();
42951
42952   // Create a half-width shuffle to replace the unnecessarily wide shuffle.
42953   // The trick is knowing that all of the insert/extract are actually free
42954   // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
42955   // of narrow inputs into a narrow output, and that is always cheaper than
42956   // the wide shuffle that we started with.
42957   return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
42958                                Shuf->getOperand(1), HalfMask, HalfIdx1,
42959                                HalfIdx2, false, DAG, /*UseConcat*/ true);
42960 }
42961
42962 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
42963                               TargetLowering::DAGCombinerInfo &DCI,
42964                               const X86Subtarget &Subtarget) {
42965   if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
42966     if (SDValue V = narrowShuffle(Shuf, DAG))
42967       return V;
42968
42969   // If we have legalized the vector types, look for blends of FADD and FSUB
42970   // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
42971   SDLoc dl(N);
42972   EVT VT = N->getValueType(0);
42973   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42974   if (TLI.isTypeLegal(VT))
42975     if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
42976       return AddSub;
42977
42978   // Attempt to combine into a vector load/broadcast.
42979   if (SDValue LD = combineToConsecutiveLoads(
42980           VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
42981     return LD;
42982
42983   // For AVX2, we sometimes want to combine
42984   // (vector_shuffle <mask> (concat_vectors t1, undef)
42985   //                        (concat_vectors t2, undef))
42986   // Into:
42987   // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
42988   // Since the latter can be efficiently lowered with VPERMD/VPERMQ
42989   if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
42990     return ShufConcat;
42991
42992   if (isTargetShuffle(N->getOpcode())) {
42993     SDValue Op(N, 0);
42994     if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
42995       return Shuffle;
42996
42997     // Try recursively combining arbitrary sequences of x86 shuffle
42998     // instructions into higher-order shuffles. We do this after combining
42999     // specific PSHUF instruction sequences into their minimal form so that we
43000     // can evaluate how many specialized shuffle instructions are involved in
43001     // a particular chain.
43002     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43003       return Res;
43004
43005     // Simplify source operands based on shuffle mask.
43006     // TODO - merge this into combineX86ShufflesRecursively.
43007     APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
43008     if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
43009       return SDValue(N, 0);
43010
43011     // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
43012     // Perform this after other shuffle combines to allow inner shuffles to be
43013     // combined away first.
43014     if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl))
43015       return BinOp;
43016   }
43017
43018   return SDValue();
43019 }
43020
43021 // Simplify variable target shuffle masks based on the demanded elements.
43022 // TODO: Handle DemandedBits in mask indices as well?
43023 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
43024     SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
43025     TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
43026   // If we're demanding all elements don't bother trying to simplify the mask.
43027   unsigned NumElts = DemandedElts.getBitWidth();
43028   if (DemandedElts.isAllOnes())
43029     return false;
43030
43031   SDValue Mask = Op.getOperand(MaskIndex);
43032   if (!Mask.hasOneUse())
43033     return false;
43034
43035   // Attempt to generically simplify the variable shuffle mask.
43036   APInt MaskUndef, MaskZero;
43037   if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
43038                                  Depth + 1))
43039     return true;
43040
43041   // Attempt to extract+simplify a (constant pool load) shuffle mask.
43042   // TODO: Support other types from getTargetShuffleMaskIndices?
43043   SDValue BC = peekThroughOneUseBitcasts(Mask);
43044   EVT BCVT = BC.getValueType();
43045   auto *Load = dyn_cast<LoadSDNode>(BC);
43046   if (!Load)
43047     return false;
43048
43049   const Constant *C = getTargetConstantFromNode(Load);
43050   if (!C)
43051     return false;
43052
43053   Type *CTy = C->getType();
43054   if (!CTy->isVectorTy() ||
43055       CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43056     return false;
43057
43058   // Handle scaling for i64 elements on 32-bit targets.
43059   unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43060   if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43061     return false;
43062   unsigned Scale = NumCstElts / NumElts;
43063
43064   // Simplify mask if we have an undemanded element that is not undef.
43065   bool Simplified = false;
43066   SmallVector<Constant *, 32> ConstVecOps;
43067   for (unsigned i = 0; i != NumCstElts; ++i) {
43068     Constant *Elt = C->getAggregateElement(i);
43069     if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43070       ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43071       Simplified = true;
43072       continue;
43073     }
43074     ConstVecOps.push_back(Elt);
43075   }
43076   if (!Simplified)
43077     return false;
43078
43079   // Generate new constant pool entry + legalize immediately for the load.
43080   SDLoc DL(Op);
43081   SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43082   SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43083   SDValue NewMask = TLO.DAG.getLoad(
43084       BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
43085       MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
43086       Load->getAlign());
43087   return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43088 }
43089
43090 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
43091     SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43092     TargetLoweringOpt &TLO, unsigned Depth) const {
43093   int NumElts = DemandedElts.getBitWidth();
43094   unsigned Opc = Op.getOpcode();
43095   EVT VT = Op.getValueType();
43096
43097   // Handle special case opcodes.
43098   switch (Opc) {
43099   case X86ISD::PMULDQ:
43100   case X86ISD::PMULUDQ: {
43101     APInt LHSUndef, LHSZero;
43102     APInt RHSUndef, RHSZero;
43103     SDValue LHS = Op.getOperand(0);
43104     SDValue RHS = Op.getOperand(1);
43105     if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43106                                    Depth + 1))
43107       return true;
43108     if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43109                                    Depth + 1))
43110       return true;
43111     // Multiply by zero.
43112     KnownZero = LHSZero | RHSZero;
43113     break;
43114   }
43115   case X86ISD::VPMADDWD: {
43116     APInt LHSUndef, LHSZero;
43117     APInt RHSUndef, RHSZero;
43118     SDValue LHS = Op.getOperand(0);
43119     SDValue RHS = Op.getOperand(1);
43120     APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43121
43122     if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43123                                    Depth + 1))
43124       return true;
43125     if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43126                                    Depth + 1))
43127       return true;
43128
43129     // TODO: Multiply by zero.
43130
43131     // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43132     APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43133     if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43134                                    Depth + 1))
43135       return true;
43136     APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43137     if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43138                                    Depth + 1))
43139       return true;
43140     break;
43141   }
43142   case X86ISD::PSADBW: {
43143     SDValue LHS = Op.getOperand(0);
43144     SDValue RHS = Op.getOperand(1);
43145     assert(VT.getScalarType() == MVT::i64 &&
43146            LHS.getValueType() == RHS.getValueType() &&
43147            LHS.getValueType().getScalarType() == MVT::i8 &&
43148            "Unexpected PSADBW types");
43149
43150     // Aggressively peek through ops to get at the demanded elts.
43151     if (!DemandedElts.isAllOnes()) {
43152       unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43153       APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43154       SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
43155           LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43156       SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
43157           RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43158       if (NewLHS || NewRHS) {
43159         NewLHS = NewLHS ? NewLHS : LHS;
43160         NewRHS = NewRHS ? NewRHS : RHS;
43161         return TLO.CombineTo(
43162             Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43163       }
43164     }
43165     break;
43166   }
43167   case X86ISD::VSHL:
43168   case X86ISD::VSRL:
43169   case X86ISD::VSRA: {
43170     // We only need the bottom 64-bits of the (128-bit) shift amount.
43171     SDValue Amt = Op.getOperand(1);
43172     MVT AmtVT = Amt.getSimpleValueType();
43173     assert(AmtVT.is128BitVector() && "Unexpected value type");
43174
43175     // If we reuse the shift amount just for sse shift amounts then we know that
43176     // only the bottom 64-bits are only ever used.
43177     bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
43178       unsigned UseOpc = Use->getOpcode();
43179       return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43180               UseOpc == X86ISD::VSRA) &&
43181              Use->getOperand(0) != Amt;
43182     });
43183
43184     APInt AmtUndef, AmtZero;
43185     unsigned NumAmtElts = AmtVT.getVectorNumElements();
43186     APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43187     if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43188                                    Depth + 1, AssumeSingleUse))
43189       return true;
43190     [[fallthrough]];
43191   }
43192   case X86ISD::VSHLI:
43193   case X86ISD::VSRLI:
43194   case X86ISD::VSRAI: {
43195     SDValue Src = Op.getOperand(0);
43196     APInt SrcUndef;
43197     if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43198                                    Depth + 1))
43199       return true;
43200
43201     // Fold shift(0,x) -> 0
43202     if (DemandedElts.isSubsetOf(KnownZero))
43203       return TLO.CombineTo(
43204           Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43205
43206     // Aggressively peek through ops to get at the demanded elts.
43207     if (!DemandedElts.isAllOnes())
43208       if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43209               Src, DemandedElts, TLO.DAG, Depth + 1))
43210         return TLO.CombineTo(
43211             Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43212     break;
43213   }
43214   case X86ISD::VPSHA:
43215   case X86ISD::VPSHL:
43216   case X86ISD::VSHLV:
43217   case X86ISD::VSRLV:
43218   case X86ISD::VSRAV: {
43219     APInt LHSUndef, LHSZero;
43220     APInt RHSUndef, RHSZero;
43221     SDValue LHS = Op.getOperand(0);
43222     SDValue RHS = Op.getOperand(1);
43223     if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43224                                    Depth + 1))
43225       return true;
43226
43227     // Fold shift(0,x) -> 0
43228     if (DemandedElts.isSubsetOf(LHSZero))
43229       return TLO.CombineTo(
43230           Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43231
43232     if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43233                                    Depth + 1))
43234       return true;
43235
43236     KnownZero = LHSZero;
43237     break;
43238   }
43239   case X86ISD::KSHIFTL: {
43240     SDValue Src = Op.getOperand(0);
43241     auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43242     assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43243     unsigned ShiftAmt = Amt->getZExtValue();
43244
43245     if (ShiftAmt == 0)
43246       return TLO.CombineTo(Op, Src);
43247
43248     // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43249     // single shift.  We can do this if the bottom bits (which are shifted
43250     // out) are never demanded.
43251     if (Src.getOpcode() == X86ISD::KSHIFTR) {
43252       if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43253         unsigned C1 = Src.getConstantOperandVal(1);
43254         unsigned NewOpc = X86ISD::KSHIFTL;
43255         int Diff = ShiftAmt - C1;
43256         if (Diff < 0) {
43257           Diff = -Diff;
43258           NewOpc = X86ISD::KSHIFTR;
43259         }
43260
43261         SDLoc dl(Op);
43262         SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43263         return TLO.CombineTo(
43264             Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43265       }
43266     }
43267
43268     APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43269     if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43270                                    Depth + 1))
43271       return true;
43272
43273     KnownUndef <<= ShiftAmt;
43274     KnownZero <<= ShiftAmt;
43275     KnownZero.setLowBits(ShiftAmt);
43276     break;
43277   }
43278   case X86ISD::KSHIFTR: {
43279     SDValue Src = Op.getOperand(0);
43280     auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43281     assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43282     unsigned ShiftAmt = Amt->getZExtValue();
43283
43284     if (ShiftAmt == 0)
43285       return TLO.CombineTo(Op, Src);
43286
43287     // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43288     // single shift.  We can do this if the top bits (which are shifted
43289     // out) are never demanded.
43290     if (Src.getOpcode() == X86ISD::KSHIFTL) {
43291       if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43292         unsigned C1 = Src.getConstantOperandVal(1);
43293         unsigned NewOpc = X86ISD::KSHIFTR;
43294         int Diff = ShiftAmt - C1;
43295         if (Diff < 0) {
43296           Diff = -Diff;
43297           NewOpc = X86ISD::KSHIFTL;
43298         }
43299
43300         SDLoc dl(Op);
43301         SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43302         return TLO.CombineTo(
43303             Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43304       }
43305     }
43306
43307     APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43308     if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43309                                    Depth + 1))
43310       return true;
43311
43312     KnownUndef.lshrInPlace(ShiftAmt);
43313     KnownZero.lshrInPlace(ShiftAmt);
43314     KnownZero.setHighBits(ShiftAmt);
43315     break;
43316   }
43317   case X86ISD::ANDNP: {
43318     // ANDNP = (~LHS & RHS);
43319     SDValue LHS = Op.getOperand(0);
43320     SDValue RHS = Op.getOperand(1);
43321
43322     auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43323       APInt UndefElts;
43324       SmallVector<APInt> EltBits;
43325       int NumElts = VT.getVectorNumElements();
43326       int EltSizeInBits = VT.getScalarSizeInBits();
43327       APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43328       APInt OpElts = DemandedElts;
43329       if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43330                                         EltBits)) {
43331         OpBits.clearAllBits();
43332         OpElts.clearAllBits();
43333         for (int I = 0; I != NumElts; ++I) {
43334           if (!DemandedElts[I])
43335             continue;
43336           if (UndefElts[I]) {
43337             // We can't assume an undef src element gives an undef dst - the
43338             // other src might be zero.
43339             OpBits.setAllBits();
43340             OpElts.setBit(I);
43341           } else if ((Invert && !EltBits[I].isAllOnes()) ||
43342                      (!Invert && !EltBits[I].isZero())) {
43343             OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43344             OpElts.setBit(I);
43345           }
43346         }
43347       }
43348       return std::make_pair(OpBits, OpElts);
43349     };
43350     APInt BitsLHS, EltsLHS;
43351     APInt BitsRHS, EltsRHS;
43352     std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43353     std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43354
43355     APInt LHSUndef, LHSZero;
43356     APInt RHSUndef, RHSZero;
43357     if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43358                                    Depth + 1))
43359       return true;
43360     if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43361                                    Depth + 1))
43362       return true;
43363
43364     if (!DemandedElts.isAllOnes()) {
43365       SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43366                                                        TLO.DAG, Depth + 1);
43367       SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43368                                                        TLO.DAG, Depth + 1);
43369       if (NewLHS || NewRHS) {
43370         NewLHS = NewLHS ? NewLHS : LHS;
43371         NewRHS = NewRHS ? NewRHS : RHS;
43372         return TLO.CombineTo(
43373             Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43374       }
43375     }
43376     break;
43377   }
43378   case X86ISD::CVTSI2P:
43379   case X86ISD::CVTUI2P: {
43380     SDValue Src = Op.getOperand(0);
43381     MVT SrcVT = Src.getSimpleValueType();
43382     APInt SrcUndef, SrcZero;
43383     APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43384     if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43385                                    Depth + 1))
43386       return true;
43387     break;
43388   }
43389   case X86ISD::PACKSS:
43390   case X86ISD::PACKUS: {
43391     SDValue N0 = Op.getOperand(0);
43392     SDValue N1 = Op.getOperand(1);
43393
43394     APInt DemandedLHS, DemandedRHS;
43395     getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43396
43397     APInt LHSUndef, LHSZero;
43398     if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43399                                    Depth + 1))
43400       return true;
43401     APInt RHSUndef, RHSZero;
43402     if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43403                                    Depth + 1))
43404       return true;
43405
43406     // TODO - pass on known zero/undef.
43407
43408     // Aggressively peek through ops to get at the demanded elts.
43409     // TODO - we should do this for all target/faux shuffles ops.
43410     if (!DemandedElts.isAllOnes()) {
43411       SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43412                                                             TLO.DAG, Depth + 1);
43413       SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43414                                                             TLO.DAG, Depth + 1);
43415       if (NewN0 || NewN1) {
43416         NewN0 = NewN0 ? NewN0 : N0;
43417         NewN1 = NewN1 ? NewN1 : N1;
43418         return TLO.CombineTo(Op,
43419                              TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43420       }
43421     }
43422     break;
43423   }
43424   case X86ISD::HADD:
43425   case X86ISD::HSUB:
43426   case X86ISD::FHADD:
43427   case X86ISD::FHSUB: {
43428     SDValue N0 = Op.getOperand(0);
43429     SDValue N1 = Op.getOperand(1);
43430
43431     APInt DemandedLHS, DemandedRHS;
43432     getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43433
43434     APInt LHSUndef, LHSZero;
43435     if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43436                                    Depth + 1))
43437       return true;
43438     APInt RHSUndef, RHSZero;
43439     if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43440                                    Depth + 1))
43441       return true;
43442
43443     // TODO - pass on known zero/undef.
43444
43445     // Aggressively peek through ops to get at the demanded elts.
43446     // TODO: Handle repeated operands.
43447     if (N0 != N1 && !DemandedElts.isAllOnes()) {
43448       SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43449                                                             TLO.DAG, Depth + 1);
43450       SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43451                                                             TLO.DAG, Depth + 1);
43452       if (NewN0 || NewN1) {
43453         NewN0 = NewN0 ? NewN0 : N0;
43454         NewN1 = NewN1 ? NewN1 : N1;
43455         return TLO.CombineTo(Op,
43456                              TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43457       }
43458     }
43459     break;
43460   }
43461   case X86ISD::VTRUNC:
43462   case X86ISD::VTRUNCS:
43463   case X86ISD::VTRUNCUS: {
43464     SDValue Src = Op.getOperand(0);
43465     MVT SrcVT = Src.getSimpleValueType();
43466     APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43467     APInt SrcUndef, SrcZero;
43468     if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43469                                    Depth + 1))
43470       return true;
43471     KnownZero = SrcZero.zextOrTrunc(NumElts);
43472     KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43473     break;
43474   }
43475   case X86ISD::BLENDV: {
43476     APInt SelUndef, SelZero;
43477     if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43478                                    SelZero, TLO, Depth + 1))
43479       return true;
43480
43481     // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43482     APInt LHSUndef, LHSZero;
43483     if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43484                                    LHSZero, TLO, Depth + 1))
43485       return true;
43486
43487     APInt RHSUndef, RHSZero;
43488     if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43489                                    RHSZero, TLO, Depth + 1))
43490       return true;
43491
43492     KnownZero = LHSZero & RHSZero;
43493     KnownUndef = LHSUndef & RHSUndef;
43494     break;
43495   }
43496   case X86ISD::VZEXT_MOVL: {
43497     // If upper demanded elements are already zero then we have nothing to do.
43498     SDValue Src = Op.getOperand(0);
43499     APInt DemandedUpperElts = DemandedElts;
43500     DemandedUpperElts.clearLowBits(1);
43501     if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43502       return TLO.CombineTo(Op, Src);
43503     break;
43504   }
43505   case X86ISD::VBROADCAST: {
43506     SDValue Src = Op.getOperand(0);
43507     MVT SrcVT = Src.getSimpleValueType();
43508     if (!SrcVT.isVector())
43509       break;
43510     // Don't bother broadcasting if we just need the 0'th element.
43511     if (DemandedElts == 1) {
43512       if (Src.getValueType() != VT)
43513         Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43514                              SDLoc(Op));
43515       return TLO.CombineTo(Op, Src);
43516     }
43517     APInt SrcUndef, SrcZero;
43518     APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43519     if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43520                                    Depth + 1))
43521       return true;
43522     // Aggressively peek through src to get at the demanded elt.
43523     // TODO - we should do this for all target/faux shuffles ops.
43524     if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43525             Src, SrcElts, TLO.DAG, Depth + 1))
43526       return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43527     break;
43528   }
43529   case X86ISD::VPERMV:
43530     if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
43531                                                    Depth))
43532       return true;
43533     break;
43534   case X86ISD::PSHUFB:
43535   case X86ISD::VPERMV3:
43536   case X86ISD::VPERMILPV:
43537     if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
43538                                                    Depth))
43539       return true;
43540     break;
43541   case X86ISD::VPPERM:
43542   case X86ISD::VPERMIL2:
43543     if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
43544                                                    Depth))
43545       return true;
43546     break;
43547   }
43548
43549   // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
43550   // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
43551   // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
43552   if ((VT.is256BitVector() || VT.is512BitVector()) &&
43553       DemandedElts.lshr(NumElts / 2) == 0) {
43554     unsigned SizeInBits = VT.getSizeInBits();
43555     unsigned ExtSizeInBits = SizeInBits / 2;
43556
43557     // See if 512-bit ops only use the bottom 128-bits.
43558     if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
43559       ExtSizeInBits = SizeInBits / 4;
43560
43561     switch (Opc) {
43562       // Scalar broadcast.
43563     case X86ISD::VBROADCAST: {
43564       SDLoc DL(Op);
43565       SDValue Src = Op.getOperand(0);
43566       if (Src.getValueSizeInBits() > ExtSizeInBits)
43567         Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
43568       EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43569                                     ExtSizeInBits / VT.getScalarSizeInBits());
43570       SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
43571       return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43572                                                TLO.DAG, DL, ExtSizeInBits));
43573     }
43574     case X86ISD::VBROADCAST_LOAD: {
43575       SDLoc DL(Op);
43576       auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43577       EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43578                                     ExtSizeInBits / VT.getScalarSizeInBits());
43579       SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
43580       SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
43581       SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
43582           X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
43583           MemIntr->getMemOperand());
43584       TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43585                                            Bcst.getValue(1));
43586       return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43587                                                TLO.DAG, DL, ExtSizeInBits));
43588     }
43589       // Subvector broadcast.
43590     case X86ISD::SUBV_BROADCAST_LOAD: {
43591       auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43592       EVT MemVT = MemIntr->getMemoryVT();
43593       if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
43594         SDLoc DL(Op);
43595         SDValue Ld =
43596             TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
43597                             MemIntr->getBasePtr(), MemIntr->getMemOperand());
43598         TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43599                                              Ld.getValue(1));
43600         return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
43601                                                  TLO.DAG, DL, ExtSizeInBits));
43602       } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
43603         SDLoc DL(Op);
43604         EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43605                                       ExtSizeInBits / VT.getScalarSizeInBits());
43606         if (SDValue BcstLd =
43607                 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
43608           return TLO.CombineTo(Op,
43609                                insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
43610                                                TLO.DAG, DL, ExtSizeInBits));
43611       }
43612       break;
43613     }
43614       // Byte shifts by immediate.
43615     case X86ISD::VSHLDQ:
43616     case X86ISD::VSRLDQ:
43617       // Shift by uniform.
43618     case X86ISD::VSHL:
43619     case X86ISD::VSRL:
43620     case X86ISD::VSRA:
43621       // Shift by immediate.
43622     case X86ISD::VSHLI:
43623     case X86ISD::VSRLI:
43624     case X86ISD::VSRAI: {
43625       SDLoc DL(Op);
43626       SDValue Ext0 =
43627           extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
43628       SDValue ExtOp =
43629           TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
43630       SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43631       SDValue Insert =
43632           insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43633       return TLO.CombineTo(Op, Insert);
43634     }
43635     case X86ISD::VPERMI: {
43636       // Simplify PERMPD/PERMQ to extract_subvector.
43637       // TODO: This should be done in shuffle combining.
43638       if (VT == MVT::v4f64 || VT == MVT::v4i64) {
43639         SmallVector<int, 4> Mask;
43640         DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
43641         if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
43642           SDLoc DL(Op);
43643           SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
43644           SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43645           SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
43646           return TLO.CombineTo(Op, Insert);
43647         }
43648       }
43649       break;
43650     }
43651     case X86ISD::VPERM2X128: {
43652       // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
43653       SDLoc DL(Op);
43654       unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
43655       if (LoMask & 0x8)
43656         return TLO.CombineTo(
43657             Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
43658       unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
43659       unsigned SrcIdx = (LoMask & 0x2) >> 1;
43660       SDValue ExtOp =
43661           extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
43662       SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43663       SDValue Insert =
43664           insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43665       return TLO.CombineTo(Op, Insert);
43666     }
43667       // Zero upper elements.
43668     case X86ISD::VZEXT_MOVL:
43669       // Target unary shuffles by immediate:
43670     case X86ISD::PSHUFD:
43671     case X86ISD::PSHUFLW:
43672     case X86ISD::PSHUFHW:
43673     case X86ISD::VPERMILPI:
43674       // (Non-Lane Crossing) Target Shuffles.
43675     case X86ISD::VPERMILPV:
43676     case X86ISD::VPERMIL2:
43677     case X86ISD::PSHUFB:
43678     case X86ISD::UNPCKL:
43679     case X86ISD::UNPCKH:
43680     case X86ISD::BLENDI:
43681       // Integer ops.
43682     case X86ISD::PACKSS:
43683     case X86ISD::PACKUS:
43684       // Horizontal Ops.
43685     case X86ISD::HADD:
43686     case X86ISD::HSUB:
43687     case X86ISD::FHADD:
43688     case X86ISD::FHSUB: {
43689       SDLoc DL(Op);
43690       SmallVector<SDValue, 4> Ops;
43691       for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
43692         SDValue SrcOp = Op.getOperand(i);
43693         EVT SrcVT = SrcOp.getValueType();
43694         assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
43695                "Unsupported vector size");
43696         Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
43697                                                           ExtSizeInBits)
43698                                        : SrcOp);
43699       }
43700       MVT ExtVT = VT.getSimpleVT();
43701       ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
43702                                ExtSizeInBits / ExtVT.getScalarSizeInBits());
43703       SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
43704       SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43705       SDValue Insert =
43706           insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43707       return TLO.CombineTo(Op, Insert);
43708     }
43709     }
43710   }
43711
43712   // For splats, unless we *only* demand the 0'th element,
43713   // stop attempts at simplification here, we aren't going to improve things,
43714   // this is better than any potential shuffle.
43715   if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
43716     return false;
43717
43718   // Get target/faux shuffle mask.
43719   APInt OpUndef, OpZero;
43720   SmallVector<int, 64> OpMask;
43721   SmallVector<SDValue, 2> OpInputs;
43722   if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
43723                               OpZero, TLO.DAG, Depth, false))
43724     return false;
43725
43726   // Shuffle inputs must be the same size as the result.
43727   if (OpMask.size() != (unsigned)NumElts ||
43728       llvm::any_of(OpInputs, [VT](SDValue V) {
43729         return VT.getSizeInBits() != V.getValueSizeInBits() ||
43730                !V.getValueType().isVector();
43731       }))
43732     return false;
43733
43734   KnownZero = OpZero;
43735   KnownUndef = OpUndef;
43736
43737   // Check if shuffle mask can be simplified to undef/zero/identity.
43738   int NumSrcs = OpInputs.size();
43739   for (int i = 0; i != NumElts; ++i)
43740     if (!DemandedElts[i])
43741       OpMask[i] = SM_SentinelUndef;
43742
43743   if (isUndefInRange(OpMask, 0, NumElts)) {
43744     KnownUndef.setAllBits();
43745     return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
43746   }
43747   if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
43748     KnownZero.setAllBits();
43749     return TLO.CombineTo(
43750         Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43751   }
43752   for (int Src = 0; Src != NumSrcs; ++Src)
43753     if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
43754       return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
43755
43756   // Attempt to simplify inputs.
43757   for (int Src = 0; Src != NumSrcs; ++Src) {
43758     // TODO: Support inputs of different types.
43759     if (OpInputs[Src].getValueType() != VT)
43760       continue;
43761
43762     int Lo = Src * NumElts;
43763     APInt SrcElts = APInt::getZero(NumElts);
43764     for (int i = 0; i != NumElts; ++i)
43765       if (DemandedElts[i]) {
43766         int M = OpMask[i] - Lo;
43767         if (0 <= M && M < NumElts)
43768           SrcElts.setBit(M);
43769       }
43770
43771     // TODO - Propagate input undef/zero elts.
43772     APInt SrcUndef, SrcZero;
43773     if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
43774                                    TLO, Depth + 1))
43775       return true;
43776   }
43777
43778   // If we don't demand all elements, then attempt to combine to a simpler
43779   // shuffle.
43780   // We need to convert the depth to something combineX86ShufflesRecursively
43781   // can handle - so pretend it's Depth == 0 again, and reduce the max depth
43782   // to match. This prevents combineX86ShuffleChain from returning a
43783   // combined shuffle that's the same as the original root, causing an
43784   // infinite loop.
43785   if (!DemandedElts.isAllOnes()) {
43786     assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
43787
43788     SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
43789     for (int i = 0; i != NumElts; ++i)
43790       if (DemandedElts[i])
43791         DemandedMask[i] = i;
43792
43793     SDValue NewShuffle = combineX86ShufflesRecursively(
43794         {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
43795         /*HasVarMask*/ false,
43796         /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
43797         Subtarget);
43798     if (NewShuffle)
43799       return TLO.CombineTo(Op, NewShuffle);
43800   }
43801
43802   return false;
43803 }
43804
43805 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
43806     SDValue Op, const APInt &OriginalDemandedBits,
43807     const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
43808     unsigned Depth) const {
43809   EVT VT = Op.getValueType();
43810   unsigned BitWidth = OriginalDemandedBits.getBitWidth();
43811   unsigned Opc = Op.getOpcode();
43812   switch(Opc) {
43813   case X86ISD::VTRUNC: {
43814     KnownBits KnownOp;
43815     SDValue Src = Op.getOperand(0);
43816     MVT SrcVT = Src.getSimpleValueType();
43817
43818     // Simplify the input, using demanded bit information.
43819     APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
43820     APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
43821     if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
43822       return true;
43823     break;
43824   }
43825   case X86ISD::PMULDQ:
43826   case X86ISD::PMULUDQ: {
43827     // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
43828     KnownBits KnownLHS, KnownRHS;
43829     SDValue LHS = Op.getOperand(0);
43830     SDValue RHS = Op.getOperand(1);
43831
43832     // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
43833     // FIXME: Can we bound this better?
43834     APInt DemandedMask = APInt::getLowBitsSet(64, 32);
43835     APInt DemandedMaskLHS = APInt::getAllOnes(64);
43836     APInt DemandedMaskRHS = APInt::getAllOnes(64);
43837
43838     bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
43839     if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
43840       DemandedMaskLHS = DemandedMask;
43841     if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
43842       DemandedMaskRHS = DemandedMask;
43843
43844     if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
43845                              KnownLHS, TLO, Depth + 1))
43846       return true;
43847     if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
43848                              KnownRHS, TLO, Depth + 1))
43849       return true;
43850
43851     // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
43852     KnownRHS = KnownRHS.trunc(32);
43853     if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
43854         KnownRHS.getConstant().isOne()) {
43855       SDLoc DL(Op);
43856       SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
43857       return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
43858     }
43859
43860     // Aggressively peek through ops to get at the demanded low bits.
43861     SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
43862         LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43863     SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
43864         RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43865     if (DemandedLHS || DemandedRHS) {
43866       DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
43867       DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
43868       return TLO.CombineTo(
43869           Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
43870     }
43871     break;
43872   }
43873   case X86ISD::ANDNP: {
43874     KnownBits Known2;
43875     SDValue Op0 = Op.getOperand(0);
43876     SDValue Op1 = Op.getOperand(1);
43877
43878     if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
43879                              Known, TLO, Depth + 1))
43880       return true;
43881     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43882
43883     if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
43884                              OriginalDemandedElts, Known2, TLO, Depth + 1))
43885       return true;
43886     assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
43887
43888     // If the RHS is a constant, see if we can simplify it.
43889     if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
43890                                OriginalDemandedElts, TLO))
43891       return true;
43892
43893     // ANDNP = (~Op0 & Op1);
43894     Known.One &= Known2.Zero;
43895     Known.Zero |= Known2.One;
43896     break;
43897   }
43898   case X86ISD::VSHLI: {
43899     SDValue Op0 = Op.getOperand(0);
43900
43901     unsigned ShAmt = Op.getConstantOperandVal(1);
43902     if (ShAmt >= BitWidth)
43903       break;
43904
43905     APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
43906
43907     // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43908     // single shift.  We can do this if the bottom bits (which are shifted
43909     // out) are never demanded.
43910     if (Op0.getOpcode() == X86ISD::VSRLI &&
43911         OriginalDemandedBits.countr_zero() >= ShAmt) {
43912       unsigned Shift2Amt = Op0.getConstantOperandVal(1);
43913       if (Shift2Amt < BitWidth) {
43914         int Diff = ShAmt - Shift2Amt;
43915         if (Diff == 0)
43916           return TLO.CombineTo(Op, Op0.getOperand(0));
43917
43918         unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
43919         SDValue NewShift = TLO.DAG.getNode(
43920             NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
43921             TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
43922         return TLO.CombineTo(Op, NewShift);
43923       }
43924     }
43925
43926     // If we are only demanding sign bits then we can use the shift source directly.
43927     unsigned NumSignBits =
43928         TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
43929     unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
43930     if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43931       return TLO.CombineTo(Op, Op0);
43932
43933     if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43934                              TLO, Depth + 1))
43935       return true;
43936
43937     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43938     Known.Zero <<= ShAmt;
43939     Known.One <<= ShAmt;
43940
43941     // Low bits known zero.
43942     Known.Zero.setLowBits(ShAmt);
43943     return false;
43944   }
43945   case X86ISD::VSRLI: {
43946     unsigned ShAmt = Op.getConstantOperandVal(1);
43947     if (ShAmt >= BitWidth)
43948       break;
43949
43950     APInt DemandedMask = OriginalDemandedBits << ShAmt;
43951
43952     if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
43953                              OriginalDemandedElts, Known, TLO, Depth + 1))
43954       return true;
43955
43956     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43957     Known.Zero.lshrInPlace(ShAmt);
43958     Known.One.lshrInPlace(ShAmt);
43959
43960     // High bits known zero.
43961     Known.Zero.setHighBits(ShAmt);
43962     return false;
43963   }
43964   case X86ISD::VSRAI: {
43965     SDValue Op0 = Op.getOperand(0);
43966     SDValue Op1 = Op.getOperand(1);
43967
43968     unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
43969     if (ShAmt >= BitWidth)
43970       break;
43971
43972     APInt DemandedMask = OriginalDemandedBits << ShAmt;
43973
43974     // If we just want the sign bit then we don't need to shift it.
43975     if (OriginalDemandedBits.isSignMask())
43976       return TLO.CombineTo(Op, Op0);
43977
43978     // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
43979     if (Op0.getOpcode() == X86ISD::VSHLI &&
43980         Op.getOperand(1) == Op0.getOperand(1)) {
43981       SDValue Op00 = Op0.getOperand(0);
43982       unsigned NumSignBits =
43983           TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
43984       if (ShAmt < NumSignBits)
43985         return TLO.CombineTo(Op, Op00);
43986     }
43987
43988     // If any of the demanded bits are produced by the sign extension, we also
43989     // demand the input sign bit.
43990     if (OriginalDemandedBits.countl_zero() < ShAmt)
43991       DemandedMask.setSignBit();
43992
43993     if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43994                              TLO, Depth + 1))
43995       return true;
43996
43997     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43998     Known.Zero.lshrInPlace(ShAmt);
43999     Known.One.lshrInPlace(ShAmt);
44000
44001     // If the input sign bit is known to be zero, or if none of the top bits
44002     // are demanded, turn this into an unsigned shift right.
44003     if (Known.Zero[BitWidth - ShAmt - 1] ||
44004         OriginalDemandedBits.countl_zero() >= ShAmt)
44005       return TLO.CombineTo(
44006           Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44007
44008     // High bits are known one.
44009     if (Known.One[BitWidth - ShAmt - 1])
44010       Known.One.setHighBits(ShAmt);
44011     return false;
44012   }
44013   case X86ISD::BLENDV: {
44014     SDValue Sel = Op.getOperand(0);
44015     SDValue LHS = Op.getOperand(1);
44016     SDValue RHS = Op.getOperand(2);
44017
44018     APInt SignMask = APInt::getSignMask(BitWidth);
44019     SDValue NewSel = SimplifyMultipleUseDemandedBits(
44020         Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44021     SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44022         LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44023     SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44024         RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44025
44026     if (NewSel || NewLHS || NewRHS) {
44027       NewSel = NewSel ? NewSel : Sel;
44028       NewLHS = NewLHS ? NewLHS : LHS;
44029       NewRHS = NewRHS ? NewRHS : RHS;
44030       return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44031                                                NewSel, NewLHS, NewRHS));
44032     }
44033     break;
44034   }
44035   case X86ISD::PEXTRB:
44036   case X86ISD::PEXTRW: {
44037     SDValue Vec = Op.getOperand(0);
44038     auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44039     MVT VecVT = Vec.getSimpleValueType();
44040     unsigned NumVecElts = VecVT.getVectorNumElements();
44041
44042     if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44043       unsigned Idx = CIdx->getZExtValue();
44044       unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44045
44046       // If we demand no bits from the vector then we must have demanded
44047       // bits from the implicit zext - simplify to zero.
44048       APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44049       if (DemandedVecBits == 0)
44050         return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44051
44052       APInt KnownUndef, KnownZero;
44053       APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44054       if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44055                                      KnownZero, TLO, Depth + 1))
44056         return true;
44057
44058       KnownBits KnownVec;
44059       if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44060                                KnownVec, TLO, Depth + 1))
44061         return true;
44062
44063       if (SDValue V = SimplifyMultipleUseDemandedBits(
44064               Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44065         return TLO.CombineTo(
44066             Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44067
44068       Known = KnownVec.zext(BitWidth);
44069       return false;
44070     }
44071     break;
44072   }
44073   case X86ISD::PINSRB:
44074   case X86ISD::PINSRW: {
44075     SDValue Vec = Op.getOperand(0);
44076     SDValue Scl = Op.getOperand(1);
44077     auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44078     MVT VecVT = Vec.getSimpleValueType();
44079
44080     if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44081       unsigned Idx = CIdx->getZExtValue();
44082       if (!OriginalDemandedElts[Idx])
44083         return TLO.CombineTo(Op, Vec);
44084
44085       KnownBits KnownVec;
44086       APInt DemandedVecElts(OriginalDemandedElts);
44087       DemandedVecElts.clearBit(Idx);
44088       if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44089                                KnownVec, TLO, Depth + 1))
44090         return true;
44091
44092       KnownBits KnownScl;
44093       unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44094       APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44095       if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44096         return true;
44097
44098       KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44099       Known = KnownVec.intersectWith(KnownScl);
44100       return false;
44101     }
44102     break;
44103   }
44104   case X86ISD::PACKSS:
44105     // PACKSS saturates to MIN/MAX integer values. So if we just want the
44106     // sign bit then we can just ask for the source operands sign bit.
44107     // TODO - add known bits handling.
44108     if (OriginalDemandedBits.isSignMask()) {
44109       APInt DemandedLHS, DemandedRHS;
44110       getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44111
44112       KnownBits KnownLHS, KnownRHS;
44113       APInt SignMask = APInt::getSignMask(BitWidth * 2);
44114       if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44115                                KnownLHS, TLO, Depth + 1))
44116         return true;
44117       if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44118                                KnownRHS, TLO, Depth + 1))
44119         return true;
44120
44121       // Attempt to avoid multi-use ops if we don't need anything from them.
44122       SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44123           Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44124       SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
44125           Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44126       if (DemandedOp0 || DemandedOp1) {
44127         SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44128         SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44129         return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44130       }
44131     }
44132     // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44133     break;
44134   case X86ISD::VBROADCAST: {
44135     SDValue Src = Op.getOperand(0);
44136     MVT SrcVT = Src.getSimpleValueType();
44137     APInt DemandedElts = APInt::getOneBitSet(
44138         SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44139     if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44140                              TLO, Depth + 1))
44141       return true;
44142     // If we don't need the upper bits, attempt to narrow the broadcast source.
44143     // Don't attempt this on AVX512 as it might affect broadcast folding.
44144     // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
44145     if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44146         OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44147         Src->hasOneUse()) {
44148       MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44149       SDValue NewSrc =
44150           TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44151       MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44152       SDValue NewBcst =
44153           TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44154       return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44155     }
44156     break;
44157   }
44158   case X86ISD::PCMPGT:
44159     // icmp sgt(0, R) == ashr(R, BitWidth-1).
44160     // iff we only need the sign bit then we can use R directly.
44161     if (OriginalDemandedBits.isSignMask() &&
44162         ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44163       return TLO.CombineTo(Op, Op.getOperand(1));
44164     break;
44165   case X86ISD::MOVMSK: {
44166     SDValue Src = Op.getOperand(0);
44167     MVT SrcVT = Src.getSimpleValueType();
44168     unsigned SrcBits = SrcVT.getScalarSizeInBits();
44169     unsigned NumElts = SrcVT.getVectorNumElements();
44170
44171     // If we don't need the sign bits at all just return zero.
44172     if (OriginalDemandedBits.countr_zero() >= NumElts)
44173       return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44174
44175     // See if we only demand bits from the lower 128-bit vector.
44176     if (SrcVT.is256BitVector() &&
44177         OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44178       SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44179       return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44180     }
44181
44182     // Only demand the vector elements of the sign bits we need.
44183     APInt KnownUndef, KnownZero;
44184     APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44185     if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44186                                    TLO, Depth + 1))
44187       return true;
44188
44189     Known.Zero = KnownZero.zext(BitWidth);
44190     Known.Zero.setHighBits(BitWidth - NumElts);
44191
44192     // MOVMSK only uses the MSB from each vector element.
44193     KnownBits KnownSrc;
44194     APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44195     if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44196                              Depth + 1))
44197       return true;
44198
44199     if (KnownSrc.One[SrcBits - 1])
44200       Known.One.setLowBits(NumElts);
44201     else if (KnownSrc.Zero[SrcBits - 1])
44202       Known.Zero.setLowBits(NumElts);
44203
    // Attempt to avoid multi-use ops if we don't need anything from it.
44205     if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44206             Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44207       return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44208     return false;
44209   }
44210   case X86ISD::TESTP: {
44211     SDValue Op0 = Op.getOperand(0);
44212     SDValue Op1 = Op.getOperand(1);
44213     MVT OpVT = Op0.getSimpleValueType();
44214     assert((OpVT.getVectorElementType() == MVT::f32 ||
44215             OpVT.getVectorElementType() == MVT::f64) &&
44216            "Illegal vector type for X86ISD::TESTP");
44217
44218     // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44219     KnownBits KnownSrc;
44220     APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44221     bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44222     return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44223                                 AssumeSingleUse) ||
44224            SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44225                                 AssumeSingleUse);
44226   }
44227   case X86ISD::BEXTR:
44228   case X86ISD::BEXTRI: {
44229     SDValue Op0 = Op.getOperand(0);
44230     SDValue Op1 = Op.getOperand(1);
44231
44232     // Only bottom 16-bits of the control bits are required.
44233     if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44234       // NOTE: SimplifyDemandedBits won't do this for constants.
44235       uint64_t Val1 = Cst1->getZExtValue();
44236       uint64_t MaskedVal1 = Val1 & 0xFFFF;
44237       if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44238         SDLoc DL(Op);
44239         return TLO.CombineTo(
44240             Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44241                                 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44242       }
44243
44244       unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44245       unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44246
44247       // If the length is 0, the result is 0.
44248       if (Length == 0) {
44249         Known.setAllZero();
44250         return false;
44251       }
44252
44253       if ((Shift + Length) <= BitWidth) {
44254         APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44255         if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44256           return true;
44257
44258         Known = Known.extractBits(Length, Shift);
44259         Known = Known.zextOrTrunc(BitWidth);
44260         return false;
44261       }
44262     } else {
44263       assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44264       KnownBits Known1;
44265       APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44266       if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44267         return true;
44268
44269       // If the length is 0, replace with 0.
44270       KnownBits LengthBits = Known1.extractBits(8, 8);
44271       if (LengthBits.isZero())
44272         return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44273     }
44274
44275     break;
44276   }
44277   case X86ISD::PDEP: {
44278     SDValue Op0 = Op.getOperand(0);
44279     SDValue Op1 = Op.getOperand(1);
44280
44281     unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44282     APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44283
44284     // If the demanded bits has leading zeroes, we don't demand those from the
44285     // mask.
44286     if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44287       return true;
44288
44289     // The number of possible 1s in the mask determines the number of LSBs of
44290     // operand 0 used. Undemanded bits from the mask don't matter so filter
44291     // them before counting.
44292     KnownBits Known2;
44293     uint64_t Count = (~Known.Zero & LoMask).popcount();
44294     APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44295     if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44296       return true;
44297
44298     // Zeroes are retained from the mask, but not ones.
44299     Known.One.clearAllBits();
44300     // The result will have at least as many trailing zeros as the non-mask
44301     // operand since bits can only map to the same or higher bit position.
44302     Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44303     return false;
44304   }
44305   }
44306
44307   return TargetLowering::SimplifyDemandedBitsForTargetNode(
44308       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
44309 }
44310
44311 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
44312     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
44313     SelectionDAG &DAG, unsigned Depth) const {
44314   int NumElts = DemandedElts.getBitWidth();
44315   unsigned Opc = Op.getOpcode();
44316   EVT VT = Op.getValueType();
44317
44318   switch (Opc) {
44319   case X86ISD::PINSRB:
44320   case X86ISD::PINSRW: {
44321     // If we don't demand the inserted element, return the base vector.
44322     SDValue Vec = Op.getOperand(0);
44323     auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44324     MVT VecVT = Vec.getSimpleValueType();
44325     if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
44326         !DemandedElts[CIdx->getZExtValue()])
44327       return Vec;
44328     break;
44329   }
44330   case X86ISD::VSHLI: {
44331     // If we are only demanding sign bits then we can use the shift source
44332     // directly.
44333     SDValue Op0 = Op.getOperand(0);
44334     unsigned ShAmt = Op.getConstantOperandVal(1);
44335     unsigned BitWidth = DemandedBits.getBitWidth();
44336     unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
44337     unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
44338     if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44339       return Op0;
44340     break;
44341   }
44342   case X86ISD::VSRAI:
44343     // iff we only need the sign bit then we can use the source directly.
44344     // TODO: generalize where we only demand extended signbits.
44345     if (DemandedBits.isSignMask())
44346       return Op.getOperand(0);
44347     break;
44348   case X86ISD::PCMPGT:
44349     // icmp sgt(0, R) == ashr(R, BitWidth-1).
44350     // iff we only need the sign bit then we can use R directly.
44351     if (DemandedBits.isSignMask() &&
44352         ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
44353       return Op.getOperand(1);
44354     break;
44355   case X86ISD::ANDNP: {
44356     // ANDNP = (~LHS & RHS);
44357     SDValue LHS = Op.getOperand(0);
44358     SDValue RHS = Op.getOperand(1);
44359
44360     KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
44361     KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
44362
44363     // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
44364     // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
44365     // this context, so return RHS.
44366     if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
44367       return RHS;
44368     break;
44369   }
44370   }
44371
44372   APInt ShuffleUndef, ShuffleZero;
44373   SmallVector<int, 16> ShuffleMask;
44374   SmallVector<SDValue, 2> ShuffleOps;
44375   if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
44376                              ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
44377     // If all the demanded elts are from one operand and are inline,
44378     // then we can use the operand directly.
44379     int NumOps = ShuffleOps.size();
44380     if (ShuffleMask.size() == (unsigned)NumElts &&
44381         llvm::all_of(ShuffleOps, [VT](SDValue V) {
44382           return VT.getSizeInBits() == V.getValueSizeInBits();
44383         })) {
44384
44385       if (DemandedElts.isSubsetOf(ShuffleUndef))
44386         return DAG.getUNDEF(VT);
44387       if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
44388         return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
44389
44390       // Bitmask that indicates which ops have only been accessed 'inline'.
44391       APInt IdentityOp = APInt::getAllOnes(NumOps);
44392       for (int i = 0; i != NumElts; ++i) {
44393         int M = ShuffleMask[i];
44394         if (!DemandedElts[i] || ShuffleUndef[i])
44395           continue;
44396         int OpIdx = M / NumElts;
44397         int EltIdx = M % NumElts;
44398         if (M < 0 || EltIdx != i) {
44399           IdentityOp.clearAllBits();
44400           break;
44401         }
44402         IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
44403         if (IdentityOp == 0)
44404           break;
44405       }
44406       assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
44407              "Multiple identity shuffles detected");
44408
44409       if (IdentityOp != 0)
44410         return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
44411     }
44412   }
44413
44414   return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
44415       Op, DemandedBits, DemandedElts, DAG, Depth);
44416 }
44417
44418 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
44419     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44420     bool PoisonOnly, unsigned Depth) const {
44421   unsigned EltsBits = Op.getScalarValueSizeInBits();
44422   unsigned NumElts = DemandedElts.getBitWidth();
44423
44424   // TODO: Add more target shuffles.
44425   switch (Op.getOpcode()) {
44426   case X86ISD::PSHUFD:
44427   case X86ISD::VPERMILPI: {
44428     SmallVector<int, 8> Mask;
44429     DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask);
44430
44431     APInt DemandedSrcElts = APInt::getZero(NumElts);
44432     for (unsigned I = 0; I != NumElts; ++I)
44433       if (DemandedElts[I])
44434         DemandedSrcElts.setBit(Mask[I]);
44435
44436     return DAG.isGuaranteedNotToBeUndefOrPoison(
44437         Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1);
44438   }
44439   }
44440   return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
44441       Op, DemandedElts, DAG, PoisonOnly, Depth);
44442 }
44443
44444 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
44445     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44446     bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
44447
44448   // TODO: Add more target shuffles.
44449   switch (Op.getOpcode()) {
44450   case X86ISD::PSHUFD:
44451   case X86ISD::VPERMILPI:
44452     return false;
44453   }
44454   return TargetLowering::canCreateUndefOrPoisonForTargetNode(
44455       Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
44456 }
44457
44458 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
44459                                                   const APInt &DemandedElts,
44460                                                   APInt &UndefElts,
44461                                                   const SelectionDAG &DAG,
44462                                                   unsigned Depth) const {
44463   unsigned NumElts = DemandedElts.getBitWidth();
44464   unsigned Opc = Op.getOpcode();
44465
44466   switch (Opc) {
44467   case X86ISD::VBROADCAST:
44468   case X86ISD::VBROADCAST_LOAD:
44469     UndefElts = APInt::getZero(NumElts);
44470     return true;
44471   }
44472
44473   return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
44474                                                    DAG, Depth);
44475 }
44476
44477 // Helper to peek through bitops/trunc/setcc to determine size of source vector.
44478 // Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
44479 static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
44480                                       bool AllowTruncate) {
44481   switch (Src.getOpcode()) {
44482   case ISD::TRUNCATE:
44483     if (!AllowTruncate)
44484       return false;
44485     [[fallthrough]];
44486   case ISD::SETCC:
44487     return Src.getOperand(0).getValueSizeInBits() == Size;
44488   case ISD::AND:
44489   case ISD::XOR:
44490   case ISD::OR:
44491     return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
44492            checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
44493   case ISD::SELECT:
44494   case ISD::VSELECT:
44495     return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
44496            checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
44497            checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
44498   case ISD::BUILD_VECTOR:
44499     return ISD::isBuildVectorAllZeros(Src.getNode()) ||
44500            ISD::isBuildVectorAllOnes(Src.getNode());
44501   }
44502   return false;
44503 }
44504
44505 // Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
44506 static unsigned getAltBitOpcode(unsigned Opcode) {
44507   switch(Opcode) {
44508   case ISD::AND: return X86ISD::FAND;
44509   case ISD::OR: return X86ISD::FOR;
44510   case ISD::XOR: return X86ISD::FXOR;
44511   case X86ISD::ANDNP: return X86ISD::FANDN;
44512   }
44513   llvm_unreachable("Unknown bitwise opcode");
44514 }
44515
44516 // Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
44517 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
44518                                           const SDLoc &DL) {
44519   EVT SrcVT = Src.getValueType();
44520   if (SrcVT != MVT::v4i1)
44521     return SDValue();
44522
44523   switch (Src.getOpcode()) {
44524   case ISD::SETCC:
44525     if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
44526         ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
44527         cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
44528       SDValue Op0 = Src.getOperand(0);
44529       if (ISD::isNormalLoad(Op0.getNode()))
44530         return DAG.getBitcast(MVT::v4f32, Op0);
44531       if (Op0.getOpcode() == ISD::BITCAST &&
44532           Op0.getOperand(0).getValueType() == MVT::v4f32)
44533         return Op0.getOperand(0);
44534     }
44535     break;
44536   case ISD::AND:
44537   case ISD::XOR:
44538   case ISD::OR: {
44539     SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
44540     SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
44541     if (Op0 && Op1)
44542       return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
44543                          Op1);
44544     break;
44545   }
44546   }
44547   return SDValue();
44548 }
44549
44550 // Helper to push sign extension of vXi1 SETCC result through bitops.
44551 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
44552                                           SDValue Src, const SDLoc &DL) {
44553   switch (Src.getOpcode()) {
44554   case ISD::SETCC:
44555   case ISD::TRUNCATE:
44556   case ISD::BUILD_VECTOR:
44557     return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44558   case ISD::AND:
44559   case ISD::XOR:
44560   case ISD::OR:
44561     return DAG.getNode(
44562         Src.getOpcode(), DL, SExtVT,
44563         signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
44564         signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
44565   case ISD::SELECT:
44566   case ISD::VSELECT:
44567     return DAG.getSelect(
44568         DL, SExtVT, Src.getOperand(0),
44569         signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
44570         signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
44571   }
44572   llvm_unreachable("Unexpected node type for vXi1 sign extension");
44573 }
44574
44575 // Try to match patterns such as
44576 // (i16 bitcast (v16i1 x))
44577 // ->
44578 // (i16 movmsk (16i8 sext (v16i1 x)))
44579 // before the illegal vector is scalarized on subtargets that don't have legal
44580 // vxi1 types.
44581 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
44582                                   const SDLoc &DL,
44583                                   const X86Subtarget &Subtarget) {
44584   EVT SrcVT = Src.getValueType();
44585   if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
44586     return SDValue();
44587
44588   // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
44589   // legalization destroys the v4i32 type.
44590   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
44591     if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
44592       V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
44593                       DAG.getBitcast(MVT::v4f32, V));
44594       return DAG.getZExtOrTrunc(V, DL, VT);
44595     }
44596   }
44597
44598   // If the input is a truncate from v16i8 or v32i8 go ahead and use a
44599   // movmskb even with avx512. This will be better than truncating to vXi1 and
44600   // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
44601   // vpcmpeqb/vpcmpgtb.
44602   bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
44603                       (Src.getOperand(0).getValueType() == MVT::v16i8 ||
44604                        Src.getOperand(0).getValueType() == MVT::v32i8 ||
44605                        Src.getOperand(0).getValueType() == MVT::v64i8);
44606
44607   // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
44608   // directly with vpmovmskb/vmovmskps/vmovmskpd.
44609   if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
44610       cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
44611       ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
44612     EVT CmpVT = Src.getOperand(0).getValueType();
44613     EVT EltVT = CmpVT.getVectorElementType();
44614     if (CmpVT.getSizeInBits() <= 256 &&
44615         (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
44616       PreferMovMsk = true;
44617   }
44618
44619   // With AVX512 vxi1 types are legal and we prefer using k-regs.
44620   // MOVMSK is supported in SSE2 or later.
44621   if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
44622     return SDValue();
44623
44624   // If the upper ops of a concatenation are undef, then try to bitcast the
44625   // lower op and extend.
44626   SmallVector<SDValue, 4> SubSrcOps;
44627   if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
44628       SubSrcOps.size() >= 2) {
44629     SDValue LowerOp = SubSrcOps[0];
44630     ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
44631     if (LowerOp.getOpcode() == ISD::SETCC &&
44632         all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
44633       EVT SubVT = VT.getIntegerVT(
44634           *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
44635       if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
44636         EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
44637         return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
44638       }
44639     }
44640   }
44641
44642   // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
44643   // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
44644   // v8i16 and v16i16.
44645   // For these two cases, we can shuffle the upper element bytes to a
44646   // consecutive sequence at the start of the vector and treat the results as
44647   // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
44648   // for v16i16 this is not the case, because the shuffle is expensive, so we
44649   // avoid sign-extending to this type entirely.
44650   // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
44651   // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
44652   MVT SExtVT;
44653   bool PropagateSExt = false;
44654   switch (SrcVT.getSimpleVT().SimpleTy) {
44655   default:
44656     return SDValue();
44657   case MVT::v2i1:
44658     SExtVT = MVT::v2i64;
44659     break;
44660   case MVT::v4i1:
44661     SExtVT = MVT::v4i32;
44662     // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
44663     // sign-extend to a 256-bit operation to avoid truncation.
44664     if (Subtarget.hasAVX() &&
44665         checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
44666       SExtVT = MVT::v4i64;
44667       PropagateSExt = true;
44668     }
44669     break;
44670   case MVT::v8i1:
44671     SExtVT = MVT::v8i16;
44672     // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
44673     // sign-extend to a 256-bit operation to match the compare.
44674     // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
44675     // 256-bit because the shuffle is cheaper than sign extending the result of
44676     // the compare.
44677     if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
44678                                checkBitcastSrcVectorSize(Src, 512, true))) {
44679       SExtVT = MVT::v8i32;
44680       PropagateSExt = true;
44681     }
44682     break;
44683   case MVT::v16i1:
44684     SExtVT = MVT::v16i8;
44685     // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
44686     // it is not profitable to sign-extend to 256-bit because this will
44687     // require an extra cross-lane shuffle which is more expensive than
44688     // truncating the result of the compare to 128-bits.
44689     break;
44690   case MVT::v32i1:
44691     SExtVT = MVT::v32i8;
44692     break;
44693   case MVT::v64i1:
44694     // If we have AVX512F, but not AVX512BW and the input is truncated from
44695     // v64i8 checked earlier. Then split the input and make two pmovmskbs.
44696     if (Subtarget.hasAVX512()) {
44697       if (Subtarget.hasBWI())
44698         return SDValue();
44699       SExtVT = MVT::v64i8;
44700       break;
44701     }
44702     // Split if this is a <64 x i8> comparison result.
44703     if (checkBitcastSrcVectorSize(Src, 512, false)) {
44704       SExtVT = MVT::v64i8;
44705       break;
44706     }
44707     return SDValue();
44708   };
44709
44710   SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
44711                             : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44712
44713   if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
44714     V = getPMOVMSKB(DL, V, DAG, Subtarget);
44715   } else {
44716     if (SExtVT == MVT::v8i16)
44717       V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
44718                       DAG.getUNDEF(MVT::v8i16));
44719     V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
44720   }
44721
44722   EVT IntVT =
44723       EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
44724   V = DAG.getZExtOrTrunc(V, DL, IntVT);
44725   return DAG.getBitcast(VT, V);
44726 }
44727
44728 // Convert a vXi1 constant build vector to the same width scalar integer.
44729 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
44730   EVT SrcVT = Op.getValueType();
44731   assert(SrcVT.getVectorElementType() == MVT::i1 &&
44732          "Expected a vXi1 vector");
44733   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
44734          "Expected a constant build vector");
44735
44736   APInt Imm(SrcVT.getVectorNumElements(), 0);
44737   for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
44738     SDValue In = Op.getOperand(Idx);
44739     if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
44740       Imm.setBit(Idx);
44741   }
44742   EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
44743   return DAG.getConstant(Imm, SDLoc(Op), IntVT);
44744 }
44745
44746 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44747                                            TargetLowering::DAGCombinerInfo &DCI,
44748                                            const X86Subtarget &Subtarget) {
44749   assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
44750
44751   if (!DCI.isBeforeLegalizeOps())
44752     return SDValue();
44753
44754   // Only do this if we have k-registers.
44755   if (!Subtarget.hasAVX512())
44756     return SDValue();
44757
44758   EVT DstVT = N->getValueType(0);
44759   SDValue Op = N->getOperand(0);
44760   EVT SrcVT = Op.getValueType();
44761
44762   if (!Op.hasOneUse())
44763     return SDValue();
44764
44765   // Look for logic ops.
44766   if (Op.getOpcode() != ISD::AND &&
44767       Op.getOpcode() != ISD::OR &&
44768       Op.getOpcode() != ISD::XOR)
44769     return SDValue();
44770
44771   // Make sure we have a bitcast between mask registers and a scalar type.
44772   if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44773         DstVT.isScalarInteger()) &&
44774       !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
44775         SrcVT.isScalarInteger()))
44776     return SDValue();
44777
44778   SDValue LHS = Op.getOperand(0);
44779   SDValue RHS = Op.getOperand(1);
44780
44781   if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
44782       LHS.getOperand(0).getValueType() == DstVT)
44783     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
44784                        DAG.getBitcast(DstVT, RHS));
44785
44786   if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
44787       RHS.getOperand(0).getValueType() == DstVT)
44788     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44789                        DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
44790
44791   // If the RHS is a vXi1 build vector, this is a good reason to flip too.
44792   // Most of these have to move a constant from the scalar domain anyway.
44793   if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
44794     RHS = combinevXi1ConstantToInteger(RHS, DAG);
44795     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44796                        DAG.getBitcast(DstVT, LHS), RHS);
44797   }
44798
44799   return SDValue();
44800 }
44801
44802 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
44803                                     const X86Subtarget &Subtarget) {
44804   SDLoc DL(BV);
44805   unsigned NumElts = BV->getNumOperands();
44806   SDValue Splat = BV->getSplatValue();
44807
44808   // Build MMX element from integer GPR or SSE float values.
44809   auto CreateMMXElement = [&](SDValue V) {
44810     if (V.isUndef())
44811       return DAG.getUNDEF(MVT::x86mmx);
44812     if (V.getValueType().isFloatingPoint()) {
44813       if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
44814         V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
44815         V = DAG.getBitcast(MVT::v2i64, V);
44816         return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
44817       }
44818       V = DAG.getBitcast(MVT::i32, V);
44819     } else {
44820       V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
44821     }
44822     return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
44823   };
44824
44825   // Convert build vector ops to MMX data in the bottom elements.
44826   SmallVector<SDValue, 8> Ops;
44827
44828   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44829
44830   // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
44831   if (Splat) {
44832     if (Splat.isUndef())
44833       return DAG.getUNDEF(MVT::x86mmx);
44834
44835     Splat = CreateMMXElement(Splat);
44836
44837     if (Subtarget.hasSSE1()) {
44838       // Unpack v8i8 to splat i8 elements to lowest 16-bits.
44839       if (NumElts == 8)
44840         Splat = DAG.getNode(
44841             ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44842             DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
44843                                   TLI.getPointerTy(DAG.getDataLayout())),
44844             Splat, Splat);
44845
44846       // Use PSHUFW to repeat 16-bit elements.
44847       unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
44848       return DAG.getNode(
44849           ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44850           DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
44851                                 TLI.getPointerTy(DAG.getDataLayout())),
44852           Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
44853     }
44854     Ops.append(NumElts, Splat);
44855   } else {
44856     for (unsigned i = 0; i != NumElts; ++i)
44857       Ops.push_back(CreateMMXElement(BV->getOperand(i)));
44858   }
44859
44860   // Use tree of PUNPCKLs to build up general MMX vector.
44861   while (Ops.size() > 1) {
44862     unsigned NumOps = Ops.size();
44863     unsigned IntrinOp =
44864         (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
44865                      : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
44866                                     : Intrinsic::x86_mmx_punpcklbw));
44867     SDValue Intrin = DAG.getTargetConstant(
44868         IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
44869     for (unsigned i = 0; i != NumOps; i += 2)
44870       Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
44871                                Ops[i], Ops[i + 1]);
44872     Ops.resize(NumOps / 2);
44873   }
44874
44875   return Ops[0];
44876 }
44877
44878 // Recursive function that attempts to find if a bool vector node was originally
44879 // a vector/float/double that got truncated/extended/bitcast to/from a scalar
44880 // integer. If so, replace the scalar ops with bool vector equivalents back down
44881 // the chain.
44882 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
44883                                           SelectionDAG &DAG,
44884                                           const X86Subtarget &Subtarget) {
44885   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44886   unsigned Opc = V.getOpcode();
44887   switch (Opc) {
44888   case ISD::BITCAST: {
44889     // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
44890     SDValue Src = V.getOperand(0);
44891     EVT SrcVT = Src.getValueType();
44892     if (SrcVT.isVector() || SrcVT.isFloatingPoint())
44893       return DAG.getBitcast(VT, Src);
44894     break;
44895   }
44896   case ISD::TRUNCATE: {
44897     // If we find a suitable source, a truncated scalar becomes a subvector.
44898     SDValue Src = V.getOperand(0);
44899     EVT NewSrcVT =
44900         EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
44901     if (TLI.isTypeLegal(NewSrcVT))
44902       if (SDValue N0 =
44903               combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44904         return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
44905                            DAG.getIntPtrConstant(0, DL));
44906     break;
44907   }
44908   case ISD::ANY_EXTEND:
44909   case ISD::ZERO_EXTEND: {
44910     // If we find a suitable source, an extended scalar becomes a subvector.
44911     SDValue Src = V.getOperand(0);
44912     EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
44913                                     Src.getScalarValueSizeInBits());
44914     if (TLI.isTypeLegal(NewSrcVT))
44915       if (SDValue N0 =
44916               combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44917         return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
44918                            Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
44919                                                   : DAG.getConstant(0, DL, VT),
44920                            N0, DAG.getIntPtrConstant(0, DL));
44921     break;
44922   }
44923   case ISD::OR: {
44924     // If we find suitable sources, we can just move an OR to the vector domain.
44925     SDValue Src0 = V.getOperand(0);
44926     SDValue Src1 = V.getOperand(1);
44927     if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44928       if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
44929         return DAG.getNode(Opc, DL, VT, N0, N1);
44930     break;
44931   }
44932   case ISD::SHL: {
44933     // If we find a suitable source, a SHL becomes a KSHIFTL.
44934     SDValue Src0 = V.getOperand(0);
44935     if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
44936         ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
44937       break;
44938
44939     if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
44940       if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44941         return DAG.getNode(
44942             X86ISD::KSHIFTL, DL, VT, N0,
44943             DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
44944     break;
44945   }
44946   }
44947   return SDValue();
44948 }
44949
44950 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
44951                               TargetLowering::DAGCombinerInfo &DCI,
44952                               const X86Subtarget &Subtarget) {
44953   SDValue N0 = N->getOperand(0);
44954   EVT VT = N->getValueType(0);
44955   EVT SrcVT = N0.getValueType();
44956   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44957
44958   // Try to match patterns such as
44959   // (i16 bitcast (v16i1 x))
44960   // ->
44961   // (i16 movmsk (16i8 sext (v16i1 x)))
44962   // before the setcc result is scalarized on subtargets that don't have legal
44963   // vxi1 types.
44964   if (DCI.isBeforeLegalize()) {
44965     SDLoc dl(N);
44966     if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
44967       return V;
44968
44969     // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44970     // type, widen both sides to avoid a trip through memory.
44971     if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
44972         Subtarget.hasAVX512()) {
44973       N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
44974       N0 = DAG.getBitcast(MVT::v8i1, N0);
44975       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
44976                          DAG.getIntPtrConstant(0, dl));
44977     }
44978
44979     // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44980     // type, widen both sides to avoid a trip through memory.
44981     if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
44982         Subtarget.hasAVX512()) {
44983       // Use zeros for the widening if we already have some zeroes. This can
44984       // allow SimplifyDemandedBits to remove scalar ANDs that may be down
44985       // stream of this.
44986       // FIXME: It might make sense to detect a concat_vectors with a mix of
44987       // zeroes and undef and turn it into insert_subvector for i1 vectors as
44988       // a separate combine. What we can't do is canonicalize the operands of
44989       // such a concat or we'll get into a loop with SimplifyDemandedBits.
44990       if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
44991         SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
44992         if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
44993           SrcVT = LastOp.getValueType();
44994           unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
44995           SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
44996           Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
44997           N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44998           N0 = DAG.getBitcast(MVT::i8, N0);
44999           return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45000         }
45001       }
45002
45003       unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45004       SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45005       Ops[0] = N0;
45006       N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45007       N0 = DAG.getBitcast(MVT::i8, N0);
45008       return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45009     }
45010   } else {
45011     // If we're bitcasting from iX to vXi1, see if the integer originally
45012     // began as a vXi1 and whether we can remove the bitcast entirely.
45013     if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45014         SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45015       if (SDValue V =
45016               combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45017         return V;
45018     }
45019   }
45020
45021   // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45022   // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45023   // due to insert_subvector legalization on KNL. By promoting the copy to i16
45024   // we can help with known bits propagation from the vXi1 domain to the
45025   // scalar domain.
45026   if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45027       !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45028       N0.getOperand(0).getValueType() == MVT::v16i1 &&
45029       isNullConstant(N0.getOperand(1)))
45030     return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45031                        DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45032
45033   // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45034   // and the vbroadcast_load are both integer or both fp. In some cases this
45035   // will remove the bitcast entirely.
45036   if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45037        VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45038     auto *BCast = cast<MemIntrinsicSDNode>(N0);
45039     unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45040     unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45041     // Don't swap i8/i16 since don't have fp types that size.
45042     if (MemSize >= 32) {
45043       MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45044                                        : MVT::getIntegerVT(MemSize);
45045       MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45046                                         : MVT::getIntegerVT(SrcVTSize);
45047       LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45048
45049       SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45050       SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45051       SDValue ResNode =
45052           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45053                                   MemVT, BCast->getMemOperand());
45054       DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45055       return DAG.getBitcast(VT, ResNode);
45056     }
45057   }
45058
45059   // Since MMX types are special and don't usually play with other vector types,
45060   // it's better to handle them early to be sure we emit efficient code by
45061   // avoiding store-load conversions.
45062   if (VT == MVT::x86mmx) {
45063     // Detect MMX constant vectors.
45064     APInt UndefElts;
45065     SmallVector<APInt, 1> EltBits;
45066     if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
45067       SDLoc DL(N0);
45068       // Handle zero-extension of i32 with MOVD.
45069       if (EltBits[0].countl_zero() >= 32)
45070         return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45071                            DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45072       // Else, bitcast to a double.
45073       // TODO - investigate supporting sext 32-bit immediates on x86_64.
45074       APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45075       return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45076     }
45077
45078     // Detect bitcasts to x86mmx low word.
45079     if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45080         (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45081         N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45082       bool LowUndef = true, AllUndefOrZero = true;
45083       for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45084         SDValue Op = N0.getOperand(i);
45085         LowUndef &= Op.isUndef() || (i >= e/2);
45086         AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
45087       }
45088       if (AllUndefOrZero) {
45089         SDValue N00 = N0.getOperand(0);
45090         SDLoc dl(N00);
45091         N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45092                        : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45093         return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45094       }
45095     }
45096
45097     // Detect bitcasts of 64-bit build vectors and convert to a
45098     // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
45099     // lowest element.
45100     if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45101         (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
45102          SrcVT == MVT::v8i8))
45103       return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
45104
45105     // Detect bitcasts between element or subvector extraction to x86mmx.
45106     if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
45107          N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
45108         isNullConstant(N0.getOperand(1))) {
45109       SDValue N00 = N0.getOperand(0);
45110       if (N00.getValueType().is128BitVector())
45111         return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
45112                            DAG.getBitcast(MVT::v2i64, N00));
45113     }
45114
45115     // Detect bitcasts from FP_TO_SINT to x86mmx.
45116     if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
45117       SDLoc DL(N0);
45118       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
45119                                 DAG.getUNDEF(MVT::v2i32));
45120       return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
45121                          DAG.getBitcast(MVT::v2i64, Res));
45122     }
45123   }
45124
45125   // Try to remove a bitcast of constant vXi1 vector. We have to legalize
45126   // most of these to scalar anyway.
45127   if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
45128       SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45129       ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
45130     return combinevXi1ConstantToInteger(N0, DAG);
45131   }
45132
45133   if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
45134       VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
45135       isa<ConstantSDNode>(N0)) {
45136     auto *C = cast<ConstantSDNode>(N0);
45137     if (C->isAllOnes())
45138       return DAG.getConstant(1, SDLoc(N0), VT);
45139     if (C->isZero())
45140       return DAG.getConstant(0, SDLoc(N0), VT);
45141   }
45142
45143   // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
45144   // Turn it into a sign bit compare that produces a k-register. This avoids
45145   // a trip through a GPR.
45146   if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
45147       VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
45148       isPowerOf2_32(VT.getVectorNumElements())) {
45149     unsigned NumElts = VT.getVectorNumElements();
45150     SDValue Src = N0;
45151
45152     // Peek through truncate.
45153     if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
45154       Src = N0.getOperand(0);
45155
45156     if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
45157       SDValue MovmskIn = Src.getOperand(0);
45158       MVT MovmskVT = MovmskIn.getSimpleValueType();
45159       unsigned MovMskElts = MovmskVT.getVectorNumElements();
45160
45161       // We allow extra bits of the movmsk to be used since they are known zero.
45162       // We can't convert a VPMOVMSKB without avx512bw.
45163       if (MovMskElts <= NumElts &&
45164           (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
45165         EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
45166         MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
45167         SDLoc dl(N);
45168         MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
45169         SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
45170                                    DAG.getConstant(0, dl, IntVT), ISD::SETLT);
45171         if (EVT(CmpVT) == VT)
45172           return Cmp;
45173
45174         // Pad with zeroes up to original VT to replace the zeroes that were
45175         // being used from the MOVMSK.
45176         unsigned NumConcats = NumElts / MovMskElts;
45177         SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
45178         Ops[0] = Cmp;
45179         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
45180       }
45181     }
45182   }
45183
45184   // Try to remove bitcasts from input and output of mask arithmetic to
45185   // remove GPR<->K-register crossings.
45186   if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
45187     return V;
45188
45189   // Convert a bitcasted integer logic operation that has one bitcasted
45190   // floating-point operand into a floating-point logic operation. This may
45191   // create a load of a constant, but that is cheaper than materializing the
45192   // constant in an integer register and transferring it to an SSE register or
45193   // transferring the SSE operand to integer register and back.
45194   unsigned FPOpcode;
45195   switch (N0.getOpcode()) {
45196     case ISD::AND: FPOpcode = X86ISD::FAND; break;
45197     case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
45198     case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
45199     default: return SDValue();
45200   }
45201
45202   // Check if we have a bitcast from another integer type as well.
45203   if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
45204         (Subtarget.hasSSE2() && VT == MVT::f64) ||
45205         (Subtarget.hasFP16() && VT == MVT::f16) ||
45206         (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
45207          TLI.isTypeLegal(VT))))
45208     return SDValue();
45209
45210   SDValue LogicOp0 = N0.getOperand(0);
45211   SDValue LogicOp1 = N0.getOperand(1);
45212   SDLoc DL0(N0);
45213
45214   // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
45215   if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
45216       LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
45217       LogicOp0.getOperand(0).getValueType() == VT &&
45218       !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
45219     SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
45220     unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
45221     return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
45222   }
45223   // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
45224   if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
45225       LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
45226       LogicOp1.getOperand(0).getValueType() == VT &&
45227       !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
45228     SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
45229     unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
45230     return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
45231   }
45232
45233   return SDValue();
45234 }
45235
45236 // (mul (zext a), (sext, b))
45237 static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
45238                          SDValue &Op1) {
45239   Op0 = Mul.getOperand(0);
45240   Op1 = Mul.getOperand(1);
45241
45242   // The operand1 should be signed extend
45243   if (Op0.getOpcode() == ISD::SIGN_EXTEND)
45244     std::swap(Op0, Op1);
45245
45246   auto IsFreeTruncation = [](SDValue &Op) -> bool {
45247     if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
45248          Op.getOpcode() == ISD::SIGN_EXTEND) &&
45249         Op.getOperand(0).getScalarValueSizeInBits() <= 8)
45250       return true;
45251
45252     auto *BV = dyn_cast<BuildVectorSDNode>(Op);
45253     return (BV && BV->isConstant());
45254   };
45255
45256   // (dpbusd (zext a), (sext, b)). Since the first operand should be unsigned
45257   // value, we need to check Op0 is zero extended value. Op1 should be signed
45258   // value, so we just check the signed bits.
45259   if ((IsFreeTruncation(Op0) &&
45260        DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
45261       (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
45262     return true;
45263
45264   return false;
45265 }
45266
45267 // Given a ABS node, detect the following pattern:
45268 // (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
45269 // This is useful as it is the input into a SAD pattern.
45270 static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
45271   SDValue AbsOp1 = Abs->getOperand(0);
45272   if (AbsOp1.getOpcode() != ISD::SUB)
45273     return false;
45274
45275   Op0 = AbsOp1.getOperand(0);
45276   Op1 = AbsOp1.getOperand(1);
45277
45278   // Check if the operands of the sub are zero-extended from vectors of i8.
45279   if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
45280       Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
45281       Op1.getOpcode() != ISD::ZERO_EXTEND ||
45282       Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
45283     return false;
45284
45285   return true;
45286 }
45287
45288 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
45289                               unsigned &LogBias, const SDLoc &DL,
45290                               const X86Subtarget &Subtarget) {
45291   // Extend or truncate to MVT::i8 first.
45292   MVT Vi8VT =
45293       MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
45294   LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
45295   RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
45296
45297   // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
45298   // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
45299   // The src A, B element type is i8, but the dst C element type is i32.
45300   // When we calculate the reduce stage, we use src vector type vXi8 for it
45301   // so we need logbias 2 to avoid extra 2 stages.
45302   LogBias = 2;
45303
45304   unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
45305   if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
45306     RegSize = std::max(512u, RegSize);
45307
45308   // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45309   // fill in the missing vector elements with 0.
45310   unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
45311   SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
45312   Ops[0] = LHS;
45313   MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
45314   SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45315   Ops[0] = RHS;
45316   SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45317
45318   // Actually build the DotProduct, split as 256/512 bits for
45319   // AVXVNNI/AVX512VNNI.
45320   auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45321                        ArrayRef<SDValue> Ops) {
45322     MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
45323     return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
45324   };
45325   MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
45326   SDValue Zero = DAG.getConstant(0, DL, DpVT);
45327
45328   return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
45329                           DpBuilder, false);
45330 }
45331
45332 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
45333 // to these zexts.
45334 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
45335                             const SDValue &Zext1, const SDLoc &DL,
45336                             const X86Subtarget &Subtarget) {
45337   // Find the appropriate width for the PSADBW.
45338   EVT InVT = Zext0.getOperand(0).getValueType();
45339   unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
45340
45341   // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
45342   // fill in the missing vector elements with 0.
45343   unsigned NumConcat = RegSize / InVT.getSizeInBits();
45344   SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
45345   Ops[0] = Zext0.getOperand(0);
45346   MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
45347   SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45348   Ops[0] = Zext1.getOperand(0);
45349   SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
45350
45351   // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45352   auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45353                           ArrayRef<SDValue> Ops) {
45354     MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45355     return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
45356   };
45357   MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
45358   return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
45359                           PSADBWBuilder);
45360 }
45361
45362 // Attempt to replace an min/max v8i16/v16i8 horizontal reduction with
45363 // PHMINPOSUW.
45364 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
45365                                       const X86Subtarget &Subtarget) {
45366   // Bail without SSE41.
45367   if (!Subtarget.hasSSE41())
45368     return SDValue();
45369
45370   EVT ExtractVT = Extract->getValueType(0);
45371   if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
45372     return SDValue();
45373
45374   // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
45375   ISD::NodeType BinOp;
45376   SDValue Src = DAG.matchBinOpReduction(
45377       Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
45378   if (!Src)
45379     return SDValue();
45380
45381   EVT SrcVT = Src.getValueType();
45382   EVT SrcSVT = SrcVT.getScalarType();
45383   if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
45384     return SDValue();
45385
45386   SDLoc DL(Extract);
45387   SDValue MinPos = Src;
45388
45389   // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
45390   while (SrcVT.getSizeInBits() > 128) {
45391     SDValue Lo, Hi;
45392     std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
45393     SrcVT = Lo.getValueType();
45394     MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
45395   }
45396   assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
45397           (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
45398          "Unexpected value type");
45399
45400   // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
45401   // to flip the value accordingly.
45402   SDValue Mask;
45403   unsigned MaskEltsBits = ExtractVT.getSizeInBits();
45404   if (BinOp == ISD::SMAX)
45405     Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
45406   else if (BinOp == ISD::SMIN)
45407     Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
45408   else if (BinOp == ISD::UMAX)
45409     Mask = DAG.getAllOnesConstant(DL, SrcVT);
45410
45411   if (Mask)
45412     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
45413
45414   // For v16i8 cases we need to perform UMIN on pairs of byte elements,
45415   // shuffling each upper element down and insert zeros. This means that the
45416   // v16i8 UMIN will leave the upper element as zero, performing zero-extension
45417   // ready for the PHMINPOS.
45418   if (ExtractVT == MVT::i8) {
45419     SDValue Upper = DAG.getVectorShuffle(
45420         SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
45421         {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
45422     MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
45423   }
45424
45425   // Perform the PHMINPOS on a v8i16 vector,
45426   MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
45427   MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
45428   MinPos = DAG.getBitcast(SrcVT, MinPos);
45429
45430   if (Mask)
45431     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
45432
45433   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
45434                      DAG.getIntPtrConstant(0, DL));
45435 }
45436
45437 // Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
45438 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
45439                                          const X86Subtarget &Subtarget) {
45440   // Bail without SSE2.
45441   if (!Subtarget.hasSSE2())
45442     return SDValue();
45443
45444   EVT ExtractVT = Extract->getValueType(0);
45445   unsigned BitWidth = ExtractVT.getSizeInBits();
45446   if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
45447       ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
45448     return SDValue();
45449
45450   // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
45451   ISD::NodeType BinOp;
45452   SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
45453   if (!Match && ExtractVT == MVT::i1)
45454     Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
45455   if (!Match)
45456     return SDValue();
45457
45458   // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
45459   // which we can't support here for now.
45460   if (Match.getScalarValueSizeInBits() != BitWidth)
45461     return SDValue();
45462
45463   SDValue Movmsk;
45464   SDLoc DL(Extract);
45465   EVT MatchVT = Match.getValueType();
45466   unsigned NumElts = MatchVT.getVectorNumElements();
45467   unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
45468   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45469   LLVMContext &Ctx = *DAG.getContext();
45470
45471   if (ExtractVT == MVT::i1) {
45472     // Special case for (pre-legalization) vXi1 reductions.
45473     if (NumElts > 64 || !isPowerOf2_32(NumElts))
45474       return SDValue();
45475     if (Match.getOpcode() == ISD::SETCC) {
45476       ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
45477       if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
45478           (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
45479         // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
45480         // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
45481         X86::CondCode X86CC;
45482         SDValue LHS = DAG.getFreeze(Match.getOperand(0));
45483         SDValue RHS = DAG.getFreeze(Match.getOperand(1));
45484         APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
45485         if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
45486                                             DAG, X86CC))
45487           return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
45488                              getSETCC(X86CC, V, DL, DAG));
45489       }
45490     }
45491     if (TLI.isTypeLegal(MatchVT)) {
45492       // If this is a legal AVX512 predicate type then we can just bitcast.
45493       EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45494       Movmsk = DAG.getBitcast(MovmskVT, Match);
45495     } else {
45496       // Use combineBitcastvxi1 to create the MOVMSK.
45497       while (NumElts > MaxElts) {
45498         SDValue Lo, Hi;
45499         std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45500         Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45501         NumElts /= 2;
45502       }
45503       EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45504       Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
45505     }
45506     if (!Movmsk)
45507       return SDValue();
45508     Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
45509   } else {
45510     // FIXME: Better handling of k-registers or 512-bit vectors?
45511     unsigned MatchSizeInBits = Match.getValueSizeInBits();
45512     if (!(MatchSizeInBits == 128 ||
45513           (MatchSizeInBits == 256 && Subtarget.hasAVX())))
45514       return SDValue();
45515
45516     // Make sure this isn't a vector of 1 element. The perf win from using
45517     // MOVMSK diminishes with less elements in the reduction, but it is
45518     // generally better to get the comparison over to the GPRs as soon as
45519     // possible to reduce the number of vector ops.
45520     if (Match.getValueType().getVectorNumElements() < 2)
45521       return SDValue();
45522
45523     // Check that we are extracting a reduction of all sign bits.
45524     if (DAG.ComputeNumSignBits(Match) != BitWidth)
45525       return SDValue();
45526
45527     if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
45528       SDValue Lo, Hi;
45529       std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45530       Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45531       MatchSizeInBits = Match.getValueSizeInBits();
45532     }
45533
45534     // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
45535     MVT MaskSrcVT;
45536     if (64 == BitWidth || 32 == BitWidth)
45537       MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
45538                                    MatchSizeInBits / BitWidth);
45539     else
45540       MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
45541
45542     SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
45543     Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
45544     NumElts = MaskSrcVT.getVectorNumElements();
45545   }
45546   assert((NumElts <= 32 || NumElts == 64) &&
45547          "Not expecting more than 64 elements");
45548
45549   MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
45550   if (BinOp == ISD::XOR) {
45551     // parity -> (PARITY(MOVMSK X))
45552     SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
45553     return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
45554   }
45555
45556   SDValue CmpC;
45557   ISD::CondCode CondCode;
45558   if (BinOp == ISD::OR) {
45559     // any_of -> MOVMSK != 0
45560     CmpC = DAG.getConstant(0, DL, CmpVT);
45561     CondCode = ISD::CondCode::SETNE;
45562   } else {
45563     // all_of -> MOVMSK == ((1 << NumElts) - 1)
45564     CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
45565                            DL, CmpVT);
45566     CondCode = ISD::CondCode::SETEQ;
45567   }
45568
45569   // The setcc produces an i8 of 0/1, so extend that to the result width and
45570   // negate to get the final 0/-1 mask value.
45571   EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
45572   SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
45573   SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
45574   SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
45575   return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
45576 }
45577
45578 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
45579                                       const X86Subtarget &Subtarget) {
45580   if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
45581     return SDValue();
45582
45583   EVT ExtractVT = Extract->getValueType(0);
45584   // Verify the type we're extracting is i32, as the output element type of
45585   // vpdpbusd is i32.
45586   if (ExtractVT != MVT::i32)
45587     return SDValue();
45588
45589   EVT VT = Extract->getOperand(0).getValueType();
45590   if (!isPowerOf2_32(VT.getVectorNumElements()))
45591     return SDValue();
45592
45593   // Match shuffle + add pyramid.
45594   ISD::NodeType BinOp;
45595   SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45596
45597   // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
45598   // done by vpdpbusd compute a signed 16-bit product that will be sign extended
45599   // before adding into the accumulator.
45600   // TODO:
45601   // We also need to verify that the multiply has at least 2x the number of bits
45602   // of the input. We shouldn't match
45603   // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
45604   // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
45605   //   Root = Root.getOperand(0);
45606
45607   // If there was a match, we want Root to be a mul.
45608   if (!Root || Root.getOpcode() != ISD::MUL)
45609     return SDValue();
45610
45611   // Check whether we have an extend and mul pattern
45612   SDValue LHS, RHS;
45613   if (!detectExtMul(DAG, Root, LHS, RHS))
45614     return SDValue();
45615
45616   // Create the dot product instruction.
45617   SDLoc DL(Extract);
45618   unsigned StageBias;
45619   SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
45620
45621   // If the original vector was wider than 4 elements, sum over the results
45622   // in the DP vector.
45623   unsigned Stages = Log2_32(VT.getVectorNumElements());
45624   EVT DpVT = DP.getValueType();
45625
45626   if (Stages > StageBias) {
45627     unsigned DpElems = DpVT.getVectorNumElements();
45628
45629     for (unsigned i = Stages - StageBias; i > 0; --i) {
45630       SmallVector<int, 16> Mask(DpElems, -1);
45631       for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45632         Mask[j] = MaskEnd + j;
45633
45634       SDValue Shuffle =
45635           DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
45636       DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
45637     }
45638   }
45639
45640   // Return the lowest ExtractSizeInBits bits.
45641   EVT ResVT =
45642       EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45643                        DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
45644   DP = DAG.getBitcast(ResVT, DP);
45645   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
45646                      Extract->getOperand(1));
45647 }
45648
45649 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
45650                                       const X86Subtarget &Subtarget) {
45651   // PSADBW is only supported on SSE2 and up.
45652   if (!Subtarget.hasSSE2())
45653     return SDValue();
45654
45655   EVT ExtractVT = Extract->getValueType(0);
45656   // Verify the type we're extracting is either i32 or i64.
45657   // FIXME: Could support other types, but this is what we have coverage for.
45658   if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
45659     return SDValue();
45660
45661   EVT VT = Extract->getOperand(0).getValueType();
45662   if (!isPowerOf2_32(VT.getVectorNumElements()))
45663     return SDValue();
45664
45665   // Match shuffle + add pyramid.
45666   ISD::NodeType BinOp;
45667   SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45668
45669   // The operand is expected to be zero extended from i8
45670   // (verified in detectZextAbsDiff).
45671   // In order to convert to i64 and above, additional any/zero/sign
45672   // extend is expected.
45673   // The zero extend from 32 bit has no mathematical effect on the result.
45674   // Also the sign extend is basically zero extend
45675   // (extends the sign bit which is zero).
45676   // So it is correct to skip the sign/zero extend instruction.
45677   if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
45678                Root.getOpcode() == ISD::ZERO_EXTEND ||
45679                Root.getOpcode() == ISD::ANY_EXTEND))
45680     Root = Root.getOperand(0);
45681
45682   // If there was a match, we want Root to be a select that is the root of an
45683   // abs-diff pattern.
45684   if (!Root || Root.getOpcode() != ISD::ABS)
45685     return SDValue();
45686
45687   // Check whether we have an abs-diff pattern feeding into the select.
45688   SDValue Zext0, Zext1;
45689   if (!detectZextAbsDiff(Root, Zext0, Zext1))
45690     return SDValue();
45691
45692   // Create the SAD instruction.
45693   SDLoc DL(Extract);
45694   SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
45695
45696   // If the original vector was wider than 8 elements, sum over the results
45697   // in the SAD vector.
45698   unsigned Stages = Log2_32(VT.getVectorNumElements());
45699   EVT SadVT = SAD.getValueType();
45700   if (Stages > 3) {
45701     unsigned SadElems = SadVT.getVectorNumElements();
45702
45703     for(unsigned i = Stages - 3; i > 0; --i) {
45704       SmallVector<int, 16> Mask(SadElems, -1);
45705       for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45706         Mask[j] = MaskEnd + j;
45707
45708       SDValue Shuffle =
45709           DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
45710       SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
45711     }
45712   }
45713
45714   unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
45715   // Return the lowest ExtractSizeInBits bits.
45716   EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45717                                SadVT.getSizeInBits() / ExtractSizeInBits);
45718   SAD = DAG.getBitcast(ResVT, SAD);
45719   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
45720                      Extract->getOperand(1));
45721 }
45722
// Attempt to peek through a target shuffle and extract the scalar from the
// source.
//
// N is the extraction node: operand 0 is the source vector and operand 1 is
// the element index. Returns the replacement value, or an empty SDValue when
// no fold applies. Nothing is attempted before operation legalization, for
// vectors of i1, or when the index is not a constant within the vector.
//
// Besides shuffles, a few other sources are handled first (looking through
// bitcasts where noted): VBROADCAST of a scalar, a single-use
// VBROADCAST_LOAD, SCALAR_TO_VECTOR, and TRUNCATE with index 0.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDLoc dl(N);
  SDValue Src = N->getOperand(0);
  SDValue Idx = N->getOperand(1);

  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();
  EVT SrcSVT = SrcVT.getVectorElementType();
  unsigned SrcEltBits = SrcSVT.getSizeInBits();
  unsigned NumSrcElts = SrcVT.getVectorNumElements();

  // Don't attempt this for boolean mask vectors or unknown extraction indices.
  if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
    return SDValue();

  // Out-of-range constant indices are left alone.
  const APInt &IdxC = N->getConstantOperandAPInt(1);
  if (IdxC.uge(NumSrcElts))
    return SDValue();

  SDValue SrcBC = peekThroughBitcasts(Src);

  // Handle extract(bitcast(broadcast(scalar_value))).
  // Only done for integer scalars whose width is a whole multiple of the
  // source element width, and only when the requested element sits at bit
  // offset 0 of the scalar.
  if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
    SDValue SrcOp = SrcBC.getOperand(0);
    EVT SrcOpVT = SrcOp.getValueType();
    if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
        (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
      // Scale is the number of source elements covered by one broadcast scalar.
      unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
      unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
      // TODO support non-zero offsets.
      if (Offset == 0) {
        // Resize to the source element type first, then to the result type.
        SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
        SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
        return SrcOp;
      }
    }
  }

  // If we're extracting a single element from a broadcast load and there are
  // no other users, just create a single load.
  // This requires the memory type, the result type and the source element
  // type to all have the same width as the broadcast's elements.
  if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
    auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
    unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
    if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
        VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
      SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
                                 MemIntr->getBasePtr(),
                                 MemIntr->getPointerInfo(),
                                 MemIntr->getOriginalAlign(),
                                 MemIntr->getMemOperand()->getFlags());
      // Redirect users of result 1 of the broadcast load to result 1 of the
      // new scalar load (presumably the output chains).
      DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
      return Load;
    }
  }

  // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
  // Only elements that lie entirely within the inserted scalar are handled
  // (IdxC < Scale); the scalar operand must be exactly as wide as one element
  // of the scalar_to_vector result.
  // TODO: Move to DAGCombine?
  if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
      SrcBC.getValueType().isInteger() &&
      (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
      SrcBC.getScalarValueSizeInBits() ==
          SrcBC.getOperand(0).getValueSizeInBits()) {
    unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
    if (IdxC.ult(Scale)) {
      unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
      SDValue Scl = SrcBC.getOperand(0);
      EVT SclVT = Scl.getValueType();
      // Shift the requested element down to bit 0 before resizing.
      if (Offset) {
        Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
                          DAG.getShiftAmountConstant(Offset, SclVT, dl));
      }
      Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
      Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
      return Scl;
    }
  }

  // Handle extract(truncate(x)) for 0'th index.
  // The extract is re-issued (same opcode, same index) on the first 128 bits
  // of x, bitcast to a 128-bit vector of the truncated element type. Only
  // done when the truncated vector's size is a multiple of 128 bits.
  // TODO: Treat this as a faux shuffle?
  // TODO: When can we use this for general indices?
  if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
      (SrcVT.getSizeInBits() % 128) == 0) {
    Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
    MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
    return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
                       Idx);
  }

  // We can only legally extract other elements from 128-bit vectors and in
  // certain circumstances, depending on SSE-level.
  //
  // GetLegalExtract(Vec, VecVT, Idx) builds an extraction of element Idx of
  // Vec viewed as VecVT, or returns an empty SDValue if no supported form
  // applies:
  //  - 256/512-bit vectors of i8/i16/i32/i64 are first narrowed to the
  //    128-bit lane holding the element, with Idx rebased to that lane.
  //  - v4i32/v2i64: EXTRACT_VECTOR_ELT; index 0 needs SSE2, any other index
  //    needs SSE4.1.
  //  - v8i16 (SSE2) / v16i8 (SSE4.1): PEXTRW / PEXTRB producing an i32.
  // TODO: Investigate float/double extraction if it will be just stored.
  auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
                                                 unsigned Idx) {
    EVT VecSVT = VecVT.getScalarType();
    if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
        (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
         VecSVT == MVT::i64)) {
      unsigned EltSizeInBits = VecSVT.getSizeInBits();
      unsigned NumEltsPerLane = 128 / EltSizeInBits;
      // Bit offset of the start of the 128-bit lane that contains Idx.
      unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
      // Re-express that offset in units of Vec's own element size, which may
      // differ from VecVT's element size.
      unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
      VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
      Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
      // Index of the element within its lane.
      Idx &= (NumEltsPerLane - 1);
    }
    if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
        ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
                         DAG.getBitcast(VecVT, Vec),
                         DAG.getIntPtrConstant(Idx, dl));
    }
    if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
        (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
      unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
      return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
                         DAG.getTargetConstant(Idx, dl, MVT::i8));
    }
    return SDValue();
  };

  // Resolve the target shuffle inputs and mask.
  SmallVector<int, 16> Mask;
  SmallVector<SDValue, 2> Ops;
  if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
    return SDValue();

  // Shuffle inputs must be the same size as the result.
  if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
        return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
      }))
    return SDValue();

  // Attempt to narrow/widen the shuffle mask to the correct size.
  if (Mask.size() != NumSrcElts) {
    if ((NumSrcElts % Mask.size()) == 0) {
      // The mask has fewer elements than the source vector: rescale it so
      // each mask element is split into Scale narrower ones.
      SmallVector<int, 16> ScaledMask;
      int Scale = NumSrcElts / Mask.size();
      narrowShuffleMaskElts(Scale, Mask, ScaledMask);
      Mask = std::move(ScaledMask);
    } else if ((Mask.size() % NumSrcElts) == 0) {
      // The mask has more elements than the source vector. Only the Scale
      // mask elements in [Lo, Hi) cover the extracted source element, so
      // mark everything else undef to give widening the best chance.
      // Simplify Mask based on demanded element.
      int ExtractIdx = (int)IdxC.getZExtValue();
      int Scale = Mask.size() / NumSrcElts;
      int Lo = Scale * ExtractIdx;
      int Hi = Scale * (ExtractIdx + 1);
      for (int i = 0, e = (int)Mask.size(); i != e; ++i)
        if (i < Lo || Hi <= i)
          Mask[i] = SM_SentinelUndef;

      // Widen repeatedly until the mask matches the source element count or
      // widening is no longer possible.
      SmallVector<int, 16> WidenedMask;
      while (Mask.size() > NumSrcElts &&
             canWidenShuffleElements(Mask, WidenedMask))
        Mask = std::move(WidenedMask);
    }
  }

  // If narrowing/widening failed, see if we can extract+zero-extend.
  int ExtractIdx;
  EVT ExtractVT;
  if (Mask.size() == NumSrcElts) {
    // One mask element per source element: read the mask entry directly.
    ExtractIdx = Mask[IdxC.getZExtValue()];
    ExtractVT = SrcVT;
  } else {
    // The mask is still finer than the source (integer types only). This is
    // usable if, of the Scale mask elements covering the extracted element,
    // all but the first are undef or zero; the first is then extracted as a
    // narrower integer and zero-extended by the caller-visible result below.
    unsigned Scale = Mask.size() / NumSrcElts;
    if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
      return SDValue();
    unsigned ScaledIdx = Scale * IdxC.getZExtValue();
    if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
      return SDValue();
    ExtractIdx = Mask[ScaledIdx];
    EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
    ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
    assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
           "Failed to widen vector type");
  }

  // If the shuffle source element is undef/zero then we can just accept it.
  if (ExtractIdx == SM_SentinelUndef)
    return DAG.getUNDEF(VT);

  if (ExtractIdx == SM_SentinelZero)
    return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
                                : DAG.getConstant(0, dl, VT);

  // The mask index counts across all shuffle inputs: the quotient selects the
  // input and the remainder is the element within that input.
  SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
  ExtractIdx = ExtractIdx % Mask.size();
  if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
    return DAG.getZExtOrTrunc(V, dl, VT);

  return SDValue();
}
45921
45922 /// Extracting a scalar FP value from vector element 0 is free, so extract each
45923 /// operand first, then perform the math as a scalar op.
45924 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
45925                                  const X86Subtarget &Subtarget) {
45926   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
45927   SDValue Vec = ExtElt->getOperand(0);
45928   SDValue Index = ExtElt->getOperand(1);
45929   EVT VT = ExtElt->getValueType(0);
45930   EVT VecVT = Vec.getValueType();
45931
45932   // TODO: If this is a unary/expensive/expand op, allow extraction from a
45933   // non-zero element because the shuffle+scalar op will be cheaper?
45934   if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
45935     return SDValue();
45936
45937   // Vector FP compares don't fit the pattern of FP math ops (propagate, not
45938   // extract, the condition code), so deal with those as a special-case.
45939   if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
45940     EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
45941     if (OpVT != MVT::f32 && OpVT != MVT::f64)
45942       return SDValue();
45943
45944     // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
45945     SDLoc DL(ExtElt);
45946     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45947                                Vec.getOperand(0), Index);
45948     SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45949                                Vec.getOperand(1), Index);
45950     return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
45951   }
45952
45953   if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
45954       VT != MVT::f64)
45955     return SDValue();
45956
45957   // Vector FP selects don't fit the pattern of FP math ops (because the
45958   // condition has a different type and we have to change the opcode), so deal
45959   // with those here.
45960   // FIXME: This is restricted to pre type legalization by ensuring the setcc
45961   // has i1 elements. If we loosen this we need to convert vector bool to a
45962   // scalar bool.
45963   if (Vec.getOpcode() == ISD::VSELECT &&
45964       Vec.getOperand(0).getOpcode() == ISD::SETCC &&
45965       Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
45966       Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
45967     // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
45968     SDLoc DL(ExtElt);
45969     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
45970                                Vec.getOperand(0).getValueType().getScalarType(),
45971                                Vec.getOperand(0), Index);
45972     SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45973                                Vec.getOperand(1), Index);
45974     SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45975                                Vec.getOperand(2), Index);
45976     return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
45977   }
45978
45979   // TODO: This switch could include FNEG and the x86-specific FP logic ops
45980   // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
45981   // missed load folding and fma+fneg combining.
45982   switch (Vec.getOpcode()) {
45983   case ISD::FMA: // Begin 3 operands
45984   case ISD::FMAD:
45985   case ISD::FADD: // Begin 2 operands
45986   case ISD::FSUB:
45987   case ISD::FMUL:
45988   case ISD::FDIV:
45989   case ISD::FREM:
45990   case ISD::FCOPYSIGN:
45991   case ISD::FMINNUM:
45992   case ISD::FMAXNUM:
45993   case ISD::FMINNUM_IEEE:
45994   case ISD::FMAXNUM_IEEE:
45995   case ISD::FMAXIMUM:
45996   case ISD::FMINIMUM:
45997   case X86ISD::FMAX:
45998   case X86ISD::FMIN:
45999   case ISD::FABS: // Begin 1 operand
46000   case ISD::FSQRT:
46001   case ISD::FRINT:
46002   case ISD::FCEIL:
46003   case ISD::FTRUNC:
46004   case ISD::FNEARBYINT:
46005   case ISD::FROUND:
46006   case ISD::FFLOOR:
46007   case X86ISD::FRCP:
46008   case X86ISD::FRSQRT: {
46009     // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46010     SDLoc DL(ExtElt);
46011     SmallVector<SDValue, 4> ExtOps;
46012     for (SDValue Op : Vec->ops())
46013       ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46014     return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46015   }
46016   default:
46017     return SDValue();
46018   }
46019   llvm_unreachable("All opcodes should return within switch");
46020 }
46021
/// Try to convert a vector reduction sequence composed of binops and shuffles
/// into horizontal ops.
///
/// \p ExtElt must be an EXTRACT_VECTOR_ELT (asserted) that terminates an
/// ADD, MUL or FADD reduction as recognized by matchBinOpReduction. Depending
/// on the reduction, one of the following is emitted:
///  - vXi8 MUL: promote to an i16 multiply reduction.
///  - v4i8/v8i8 ADD: widen to v16i8 and use PSADBW against zero.
///  - vXi8 ADD (>= 128 bits): add halves down to 8 bytes, then PSADBW.
///  - wider-element ADD of values known to be <= 255: truncate to bytes and
///    use PSADBW.
///  - otherwise, when horizontal ops are desirable: HADD/FHADD chains.
///
/// \returns the replacement for \p ExtElt, or an empty SDValue.
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");

  // We need at least SSE2 to do anything here.
  if (!Subtarget.hasSSE2())
    return SDValue();

  // Opc receives the matched reduction opcode; Rdx is the vector being
  // reduced. NOTE(review): the trailing `true` presumably allows partial
  // reductions - confirm against SelectionDAG::matchBinOpReduction.
  ISD::NodeType Opc;
  SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
                                        {ISD::ADD, ISD::MUL, ISD::FADD}, true);
  if (!Rdx)
    return SDValue();

  SDValue Index = ExtElt->getOperand(1);
  assert(isNullConstant(Index) &&
         "Reduction doesn't end in an extract from index 0");

  // The extracted type must be exactly the element type of the reduced vector.
  EVT VT = ExtElt->getValueType(0);
  EVT VecVT = Rdx.getValueType();
  if (VecVT.getScalarType() != VT)
    return SDValue();

  SDLoc DL(ExtElt);
  unsigned NumElts = VecVT.getVectorNumElements();
  unsigned EltSizeInBits = VecVT.getScalarSizeInBits();

  // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
  // With ZeroExtend, the bytes appended to a v4i8 input are zero rather than
  // undef; on SSE4.1 this is done by inserting the 32 input bits into element
  // 0 of a zero v4i32 (which zeroes the whole upper part), otherwise by
  // concatenating with a zero v4i8 and then an undef v8i8.
  auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
    if (V.getValueType() == MVT::v4i8) {
      if (ZeroExtend && Subtarget.hasSSE41()) {
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
                        DAG.getConstant(0, DL, MVT::v4i32),
                        DAG.getBitcast(MVT::i32, V),
                        DAG.getIntPtrConstant(0, DL));
        return DAG.getBitcast(MVT::v16i8, V);
      }
      V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
                      ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
                                 : DAG.getUNDEF(MVT::v4i8));
    }
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
                       DAG.getUNDEF(MVT::v8i8));
  };

  // vXi8 mul reduction - promote to vXi16 mul reduction.
  // Any other MUL reduction (non-i8, fewer than 4 elements, or a non-pow2
  // element count) is rejected here.
  if (Opc == ISD::MUL) {
    if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
      return SDValue();
    if (VecVT.getSizeInBits() >= 128) {
      // Unpack the low and high halves against undef, reinterpret both as
      // i16 vectors and multiply them together: one reduction step that also
      // performs the promotion.
      EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
      SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
      SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
      Lo = DAG.getBitcast(WideVT, Lo);
      Hi = DAG.getBitcast(WideVT, Hi);
      Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
      // Keep multiplying the two halves until a single 128-bit vector remains.
      while (Rdx.getValueSizeInBits() > 128) {
        std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
        Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
      }
    } else {
      // Sub-128-bit input: widen to v16i8 (upper part undef), unpack the low
      // half against undef and view the result as v8i16.
      Rdx = WidenToV16I8(Rdx, false);
      Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
      Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
    }
    // Finish on v8i16 by multiplying with the vector shifted down by 4, 2 and
    // then 1 elements. The 4-element step is skipped for a 4-element input.
    if (NumElts >= 8)
      Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
                        DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
                                             {4, 5, 6, 7, -1, -1, -1, -1}));
    Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
                      DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
                                           {2, 3, -1, -1, -1, -1, -1, -1}));
    Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
                      DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
                                           {1, -1, -1, -1, -1, -1, -1, -1}));
    // Read the result back as byte 0 (Index is asserted to be 0 above).
    Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
  }

  // vXi8 add reduction - sub 128-bit vector.
  // The input is widened with zeros so the extra bytes don't contribute to
  // the PSADBW-against-zero sum.
  if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
    Rdx = WidenToV16I8(Rdx, true);
    Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
                      DAG.getConstant(0, DL, MVT::v16i8));
    Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
  }

  // Must be a >=128-bit vector with pow2 elements.
  if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
    return SDValue();

  // vXi8 add reduction - sum lo/hi halves then use PSADBW.
  if (VT == MVT::i8) {
    // Add halves together until a single 128-bit vector remains.
    while (Rdx.getValueSizeInBits() > 128) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
      VecVT = Lo.getValueType();
      Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
    }
    assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");

    // Fold the upper 8 bytes onto the lower 8, then let PSADBW against zero
    // sum the remaining bytes.
    SDValue Hi = DAG.getVectorShuffle(
        MVT::v16i8, DL, Rdx, Rdx,
        {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
    Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
    Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
                      getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
    Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
  }

  // See if we can use vXi8 PSADBW add reduction for larger zext types.
  // If the source vector values are 0-255, then we can use PSADBW to
  // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
  // Applied to 16-bit elements unconditionally; for wider elements only when
  // the source is a ZERO_EXTEND node or the subtarget has AVX512.
  // TODO: See if its worth avoiding vXi16/i32 truncations?
  if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
      DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
      (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
       Subtarget.hasAVX512())) {
    EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
    Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
    if (ByteVT.getSizeInBits() < 128)
      Rdx = WidenToV16I8(Rdx, true);

    // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
    auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                            ArrayRef<SDValue> Ops) {
      MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
      SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
      return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
    };
    MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
    Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);

    // Add halves together until a single v2i64 remains.
    // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
    while (Rdx.getValueSizeInBits() > 128) {
      SDValue Lo, Hi;
      std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
      VecVT = Lo.getValueType();
      Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
    }
    assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");

    // With more than 8 source elements both i64 halves carry a partial sum,
    // so add the upper one onto the lower one.
    if (NumElts > 8) {
      SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
      Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
    }

    // Reinterpret as a 128-bit vector of the result type and read element 0.
    VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
    Rdx = DAG.getBitcast(VecVT, Rdx);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
  }

  // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
  if (!shouldUseHorizontalOp(true, DAG, Subtarget))
    return SDValue();

  // MUL returned above, so Opc is ADD or FADD here.
  unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;

  // 256-bit horizontal instructions operate on 128-bit chunks rather than
  // across the whole vector, so we need an extract + hop preliminary stage.
  // This is the only step where the operands of the hop are not the same value.
  // TODO: We could extend this to handle 512-bit or even longer vectors.
  if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
      ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
    // Note: this shadows the outer NumElts; the value is the same because
    // VecVT has not been modified on the path reaching this point.
    unsigned NumElts = VecVT.getVectorNumElements();
    SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
    SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
    Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
    VecVT = Rdx.getValueType();
  }
  // From here on only 128-bit types with a matching (F)HADD are handled:
  // v8i16/v4i32 need SSSE3, v4f32/v2f64 need SSE3.
  if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
      !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
    return SDValue();

  // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
  // One horizontal op per reduction step, log2(element count) in total.
  unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
  for (unsigned i = 0; i != ReductionSteps; ++i)
    Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
}
46207
46208 /// Detect vector gather/scatter index generation and convert it from being a
46209 /// bunch of shuffles and extracts into a somewhat faster sequence.
46210 /// For i686, the best sequence is apparently storing the value and loading
46211 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
46212 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
46213                                        TargetLowering::DAGCombinerInfo &DCI,
46214                                        const X86Subtarget &Subtarget) {
46215   if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
46216     return NewOp;
46217
46218   SDValue InputVector = N->getOperand(0);
46219   SDValue EltIdx = N->getOperand(1);
46220   auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
46221
46222   EVT SrcVT = InputVector.getValueType();
46223   EVT VT = N->getValueType(0);
46224   SDLoc dl(InputVector);
46225   bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
46226   unsigned NumSrcElts = SrcVT.getVectorNumElements();
46227   unsigned NumEltBits = VT.getScalarSizeInBits();
46228   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46229
46230   if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
46231     return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
46232
46233   // Integer Constant Folding.
46234   if (CIdx && VT.isInteger()) {
46235     APInt UndefVecElts;
46236     SmallVector<APInt, 16> EltBits;
46237     unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
46238     if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
46239                                       EltBits, true, false)) {
46240       uint64_t Idx = CIdx->getZExtValue();
46241       if (UndefVecElts[Idx])
46242         return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
46243       return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
46244     }
46245
46246     // Convert extract_element(bitcast(<X x i1>) -> bitcast(extract_subvector()).
46247     // Improves lowering of bool masks on rust which splits them into byte array.
46248     if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
46249       SDValue Src = peekThroughBitcasts(InputVector);
46250       if (Src.getValueType().getScalarType() == MVT::i1 &&
46251           TLI.isTypeLegal(Src.getValueType())) {
46252         MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
46253         SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
46254             DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
46255         return DAG.getBitcast(VT, Sub);
46256       }
46257     }
46258   }
46259
46260   if (IsPextr) {
46261     if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
46262                                  DCI))
46263       return SDValue(N, 0);
46264
46265     // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
46266     if ((InputVector.getOpcode() == X86ISD::PINSRB ||
46267          InputVector.getOpcode() == X86ISD::PINSRW) &&
46268         InputVector.getOperand(2) == EltIdx) {
46269       assert(SrcVT == InputVector.getOperand(0).getValueType() &&
46270              "Vector type mismatch");
46271       SDValue Scl = InputVector.getOperand(1);
46272       Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
46273       return DAG.getZExtOrTrunc(Scl, dl, VT);
46274     }
46275
46276     // TODO - Remove this once we can handle the implicit zero-extension of
46277     // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
46278     // combineBasicSADPattern.
46279     return SDValue();
46280   }
46281
46282   // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
46283   if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
46284       InputVector.getOpcode() == ISD::BITCAST &&
46285       InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
46286       isNullConstant(EltIdx) && InputVector.hasOneUse())
46287     return DAG.getBitcast(VT, InputVector);
46288
46289   // Detect mmx to i32 conversion through a v2i32 elt extract.
46290   if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
46291       InputVector.getOpcode() == ISD::BITCAST &&
46292       InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
46293       isNullConstant(EltIdx) && InputVector.hasOneUse())
46294     return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
46295                        InputVector.getOperand(0));
46296
46297   // Check whether this extract is the root of a sum of absolute differences
46298   // pattern. This has to be done here because we really want it to happen
46299   // pre-legalization,
46300   if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
46301     return SAD;
46302
46303   if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
46304     return VPDPBUSD;
46305
46306   // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
46307   if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
46308     return Cmp;
46309
46310   // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
46311   if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
46312     return MinMax;
46313
46314   // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
46315   if (SDValue V = combineArithReduction(N, DAG, Subtarget))
46316     return V;
46317
46318   if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
46319     return V;
46320
46321   // Attempt to extract a i1 element by using MOVMSK to extract the signbits
46322   // and then testing the relevant element.
46323   //
46324   // Note that we only combine extracts on the *same* result number, i.e.
46325   //   t0 = merge_values a0, a1, a2, a3
46326   //   i1 = extract_vector_elt t0, Constant:i64<2>
46327   //   i1 = extract_vector_elt t0, Constant:i64<3>
46328   // but not
46329   //   i1 = extract_vector_elt t0:1, Constant:i64<2>
46330   // since the latter would need its own MOVMSK.
46331   if (SrcVT.getScalarType() == MVT::i1) {
46332     bool IsVar = !CIdx;
46333     SmallVector<SDNode *, 16> BoolExtracts;
46334     unsigned ResNo = InputVector.getResNo();
46335     auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
46336       if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46337           Use->getOperand(0).getResNo() == ResNo &&
46338           Use->getValueType(0) == MVT::i1) {
46339         BoolExtracts.push_back(Use);
46340         IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
46341         return true;
46342       }
46343       return false;
46344     };
46345     // TODO: Can we drop the oneuse check for constant extracts?
46346     if (all_of(InputVector->uses(), IsBoolExtract) &&
46347         (IsVar || BoolExtracts.size() > 1)) {
46348       EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
46349       if (SDValue BC =
46350               combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
46351         for (SDNode *Use : BoolExtracts) {
46352           // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
46353           // Mask = 1 << MaskIdx
46354           SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
46355           SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
46356           SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
46357           SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
46358           Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
46359           DCI.CombineTo(Use, Res);
46360         }
46361         return SDValue(N, 0);
46362       }
46363     }
46364   }
46365
46366   // If this extract is from a loaded vector value and will be used as an
46367   // integer, that requires a potentially expensive XMM -> GPR transfer.
46368   // Additionally, if we can convert to a scalar integer load, that will likely
46369   // be folded into a subsequent integer op.
46370   // Note: Unlike the related fold for this in DAGCombiner, this is not limited
46371   //       to a single-use of the loaded vector. For the reasons above, we
46372   //       expect this to be profitable even if it creates an extra load.
46373   bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
46374     return Use->getOpcode() == ISD::STORE ||
46375            Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46376            Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46377   });
46378   auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
46379   if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46380       SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
46381       !LikelyUsedAsVector && LoadVec->isSimple()) {
46382     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46383     SDValue NewPtr =
46384         TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
46385     unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
46386     MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46387     Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46388     SDValue Load =
46389         DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46390                     LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46391     DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46392     return Load;
46393   }
46394
46395   return SDValue();
46396 }
46397
46398 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
46399 // This is more or less the reverse of combineBitcastvxi1.
46400 static SDValue combineToExtendBoolVectorInReg(
46401     unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
46402     TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
46403   if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
46404       Opcode != ISD::ANY_EXTEND)
46405     return SDValue();
46406   if (!DCI.isBeforeLegalizeOps())
46407     return SDValue();
46408   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
46409     return SDValue();
46410
46411   EVT SVT = VT.getScalarType();
46412   EVT InSVT = N0.getValueType().getScalarType();
46413   unsigned EltSizeInBits = SVT.getSizeInBits();
46414
46415   // Input type must be extending a bool vector (bit-casted from a scalar
46416   // integer) to legal integer types.
46417   if (!VT.isVector())
46418     return SDValue();
46419   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
46420     return SDValue();
46421   if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
46422     return SDValue();
46423
46424   SDValue N00 = N0.getOperand(0);
46425   EVT SclVT = N00.getValueType();
46426   if (!SclVT.isScalarInteger())
46427     return SDValue();
46428
46429   SDValue Vec;
46430   SmallVector<int> ShuffleMask;
46431   unsigned NumElts = VT.getVectorNumElements();
46432   assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
46433
46434   // Broadcast the scalar integer to the vector elements.
46435   if (NumElts > EltSizeInBits) {
46436     // If the scalar integer is greater than the vector element size, then we
46437     // must split it down into sub-sections for broadcasting. For example:
46438     //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
46439     //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
46440     assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
46441     unsigned Scale = NumElts / EltSizeInBits;
46442     EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
46443     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46444     Vec = DAG.getBitcast(VT, Vec);
46445
46446     for (unsigned i = 0; i != Scale; ++i)
46447       ShuffleMask.append(EltSizeInBits, i);
46448     Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46449   } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
46450              (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
46451     // If we have register broadcast instructions, use the scalar size as the
46452     // element type for the shuffle. Then cast to the wider element type. The
46453     // widened bits won't be used, and this might allow the use of a broadcast
46454     // load.
46455     assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
46456     unsigned Scale = EltSizeInBits / NumElts;
46457     EVT BroadcastVT =
46458         EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
46459     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46460     ShuffleMask.append(NumElts * Scale, 0);
46461     Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
46462     Vec = DAG.getBitcast(VT, Vec);
46463   } else {
46464     // For smaller scalar integers, we can simply any-extend it to the vector
46465     // element size (we don't care about the upper bits) and broadcast it to all
46466     // elements.
46467     SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
46468     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
46469     ShuffleMask.append(NumElts, 0);
46470     Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46471   }
46472
46473   // Now, mask the relevant bit in each element.
46474   SmallVector<SDValue, 32> Bits;
46475   for (unsigned i = 0; i != NumElts; ++i) {
46476     int BitIdx = (i % EltSizeInBits);
46477     APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
46478     Bits.push_back(DAG.getConstant(Bit, DL, SVT));
46479   }
46480   SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
46481   Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
46482
46483   // Compare against the bitmask and extend the result.
46484   EVT CCVT = VT.changeVectorElementType(MVT::i1);
46485   Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
46486   Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
46487
46488   // For SEXT, this is now done, otherwise shift the result down for
46489   // zero-extension.
46490   if (Opcode == ISD::SIGN_EXTEND)
46491     return Vec;
46492   return DAG.getNode(ISD::SRL, DL, VT, Vec,
46493                      DAG.getConstant(EltSizeInBits - 1, DL, VT));
46494 }
46495
46496 /// If a vector select has an operand that is -1 or 0, try to simplify the
46497 /// select to a bitwise logic operation.
46498 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
46499 static SDValue
46500 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
46501                                  TargetLowering::DAGCombinerInfo &DCI,
46502                                  const X86Subtarget &Subtarget) {
46503   SDValue Cond = N->getOperand(0);
46504   SDValue LHS = N->getOperand(1);
46505   SDValue RHS = N->getOperand(2);
46506   EVT VT = LHS.getValueType();
46507   EVT CondVT = Cond.getValueType();
46508   SDLoc DL(N);
46509   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46510
46511   if (N->getOpcode() != ISD::VSELECT)
46512     return SDValue();
46513
46514   assert(CondVT.isVector() && "Vector select expects a vector selector!");
46515
46516   // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
46517   // TODO: Can we assert that both operands are not zeros (because that should
46518   //       get simplified at node creation time)?
46519   bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
46520   bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
46521
46522   // If both inputs are 0/undef, create a complete zero vector.
46523   // FIXME: As noted above this should be handled by DAGCombiner/getNode.
46524   if (TValIsAllZeros && FValIsAllZeros) {
46525     if (VT.isFloatingPoint())
46526       return DAG.getConstantFP(0.0, DL, VT);
46527     return DAG.getConstant(0, DL, VT);
46528   }
46529
46530   // To use the condition operand as a bitwise mask, it must have elements that
46531   // are the same size as the select elements. Ie, the condition operand must
46532   // have already been promoted from the IR select condition type <N x i1>.
46533   // Don't check if the types themselves are equal because that excludes
46534   // vector floating-point selects.
46535   if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
46536     return SDValue();
46537
46538   // Try to invert the condition if true value is not all 1s and false value is
46539   // not all 0s. Only do this if the condition has one use.
46540   bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
46541   if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
46542       // Check if the selector will be produced by CMPP*/PCMP*.
46543       Cond.getOpcode() == ISD::SETCC &&
46544       // Check if SETCC has already been promoted.
46545       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
46546           CondVT) {
46547     bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
46548
46549     if (TValIsAllZeros || FValIsAllOnes) {
46550       SDValue CC = Cond.getOperand(2);
46551       ISD::CondCode NewCC = ISD::getSetCCInverse(
46552           cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
46553       Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
46554                           NewCC);
46555       std::swap(LHS, RHS);
46556       TValIsAllOnes = FValIsAllOnes;
46557       FValIsAllZeros = TValIsAllZeros;
46558     }
46559   }
46560
46561   // Cond value must be 'sign splat' to be converted to a logical op.
46562   if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
46563     return SDValue();
46564
46565   // vselect Cond, 111..., 000... -> Cond
46566   if (TValIsAllOnes && FValIsAllZeros)
46567     return DAG.getBitcast(VT, Cond);
46568
46569   if (!TLI.isTypeLegal(CondVT))
46570     return SDValue();
46571
46572   // vselect Cond, 111..., X -> or Cond, X
46573   if (TValIsAllOnes) {
46574     SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46575     SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
46576     return DAG.getBitcast(VT, Or);
46577   }
46578
46579   // vselect Cond, X, 000... -> and Cond, X
46580   if (FValIsAllZeros) {
46581     SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
46582     SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
46583     return DAG.getBitcast(VT, And);
46584   }
46585
46586   // vselect Cond, 000..., X -> andn Cond, X
46587   if (TValIsAllZeros) {
46588     SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46589     SDValue AndN;
46590     // The canonical form differs for i1 vectors - x86andnp is not used
46591     if (CondVT.getScalarType() == MVT::i1)
46592       AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
46593                          CastRHS);
46594     else
46595       AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
46596     return DAG.getBitcast(VT, AndN);
46597   }
46598
46599   return SDValue();
46600 }
46601
46602 /// If both arms of a vector select are concatenated vectors, split the select,
46603 /// and concatenate the result to eliminate a wide (256-bit) vector instruction:
46604 ///   vselect Cond, (concat T0, T1), (concat F0, F1) -->
46605 ///   concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
46606 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
46607                                   const X86Subtarget &Subtarget) {
46608   unsigned Opcode = N->getOpcode();
46609   if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
46610     return SDValue();
46611
46612   // TODO: Split 512-bit vectors too?
46613   EVT VT = N->getValueType(0);
46614   if (!VT.is256BitVector())
46615     return SDValue();
46616
46617   // TODO: Split as long as any 2 of the 3 operands are concatenated?
46618   SDValue Cond = N->getOperand(0);
46619   SDValue TVal = N->getOperand(1);
46620   SDValue FVal = N->getOperand(2);
46621   if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
46622       !isFreeToSplitVector(TVal.getNode(), DAG) ||
46623       !isFreeToSplitVector(FVal.getNode(), DAG))
46624     return SDValue();
46625
46626   auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
46627                             ArrayRef<SDValue> Ops) {
46628     return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
46629   };
46630   return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
46631                           makeBlend, /*CheckBWI*/ false);
46632 }
46633
46634 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
46635   SDValue Cond = N->getOperand(0);
46636   SDValue LHS = N->getOperand(1);
46637   SDValue RHS = N->getOperand(2);
46638   SDLoc DL(N);
46639
46640   auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
46641   auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
46642   if (!TrueC || !FalseC)
46643     return SDValue();
46644
46645   // Don't do this for crazy integer types.
46646   EVT VT = N->getValueType(0);
46647   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
46648     return SDValue();
46649
46650   // We're going to use the condition bit in math or logic ops. We could allow
46651   // this with a wider condition value (post-legalization it becomes an i8),
46652   // but if nothing is creating selects that late, it doesn't matter.
46653   if (Cond.getValueType() != MVT::i1)
46654     return SDValue();
46655
46656   // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
46657   // 3, 5, or 9 with i32/i64, so those get transformed too.
46658   // TODO: For constants that overflow or do not differ by power-of-2 or small
46659   // multiplier, convert to 'and' + 'add'.
46660   const APInt &TrueVal = TrueC->getAPIntValue();
46661   const APInt &FalseVal = FalseC->getAPIntValue();
46662
46663   // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
46664   if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
46665       Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
46666     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46667     if (CC == ISD::SETEQ || CC == ISD::SETNE)
46668       return SDValue();
46669   }
46670
46671   bool OV;
46672   APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
46673   if (OV)
46674     return SDValue();
46675
46676   APInt AbsDiff = Diff.abs();
46677   if (AbsDiff.isPowerOf2() ||
46678       ((VT == MVT::i32 || VT == MVT::i64) &&
46679        (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
46680
46681     // We need a positive multiplier constant for shift/LEA codegen. The 'not'
46682     // of the condition can usually be folded into a compare predicate, but even
46683     // without that, the sequence should be cheaper than a CMOV alternative.
46684     if (TrueVal.slt(FalseVal)) {
46685       Cond = DAG.getNOT(DL, Cond, MVT::i1);
46686       std::swap(TrueC, FalseC);
46687     }
46688
46689     // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
46690     SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
46691
46692     // Multiply condition by the difference if non-one.
46693     if (!AbsDiff.isOne())
46694       R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
46695
46696     // Add the base if non-zero.
46697     if (!FalseC->isZero())
46698       R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
46699
46700     return R;
46701   }
46702
46703   return SDValue();
46704 }
46705
46706 /// If this is a *dynamic* select (non-constant condition) and we can match
46707 /// this node with one of the variable blend instructions, restructure the
46708 /// condition so that blends can use the high (sign) bit of each element.
46709 /// This function will also call SimplifyDemandedBits on already created
46710 /// BLENDV to perform additional simplifications.
46711 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
46712                                       TargetLowering::DAGCombinerInfo &DCI,
46713                                       const X86Subtarget &Subtarget) {
46714   SDValue Cond = N->getOperand(0);
46715   if ((N->getOpcode() != ISD::VSELECT &&
46716        N->getOpcode() != X86ISD::BLENDV) ||
46717       ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
46718     return SDValue();
46719
46720   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46721   unsigned BitWidth = Cond.getScalarValueSizeInBits();
46722   EVT VT = N->getValueType(0);
46723
46724   // We can only handle the cases where VSELECT is directly legal on the
46725   // subtarget. We custom lower VSELECT nodes with constant conditions and
46726   // this makes it hard to see whether a dynamic VSELECT will correctly
46727   // lower, so we both check the operation's status and explicitly handle the
46728   // cases where a *dynamic* blend will fail even though a constant-condition
46729   // blend could be custom lowered.
46730   // FIXME: We should find a better way to handle this class of problems.
46731   // Potentially, we should combine constant-condition vselect nodes
46732   // pre-legalization into shuffles and not mark as many types as custom
46733   // lowered.
46734   if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
46735     return SDValue();
46736   // FIXME: We don't support i16-element blends currently. We could and
46737   // should support them by making *all* the bits in the condition be set
46738   // rather than just the high bit and using an i8-element blend.
46739   if (VT.getVectorElementType() == MVT::i16)
46740     return SDValue();
46741   // Dynamic blending was only available from SSE4.1 onward.
46742   if (VT.is128BitVector() && !Subtarget.hasSSE41())
46743     return SDValue();
46744   // Byte blends are only available in AVX2
46745   if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
46746     return SDValue();
46747   // There are no 512-bit blend instructions that use sign bits.
46748   if (VT.is512BitVector())
46749     return SDValue();
46750
46751   // Don't optimize before the condition has been transformed to a legal type
46752   // and don't ever optimize vector selects that map to AVX512 mask-registers.
46753   if (BitWidth < 8 || BitWidth > 64)
46754     return SDValue();
46755
46756   auto OnlyUsedAsSelectCond = [](SDValue Cond) {
46757     for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
46758          UI != UE; ++UI)
46759       if ((UI->getOpcode() != ISD::VSELECT &&
46760            UI->getOpcode() != X86ISD::BLENDV) ||
46761           UI.getOperandNo() != 0)
46762         return false;
46763
46764     return true;
46765   };
46766
46767   APInt DemandedBits(APInt::getSignMask(BitWidth));
46768
46769   if (OnlyUsedAsSelectCond(Cond)) {
46770     KnownBits Known;
46771     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
46772                                           !DCI.isBeforeLegalizeOps());
46773     if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
46774       return SDValue();
46775
46776     // If we changed the computation somewhere in the DAG, this change will
46777     // affect all users of Cond. Update all the nodes so that we do not use
46778     // the generic VSELECT anymore. Otherwise, we may perform wrong
46779     // optimizations as we messed with the actual expectation for the vector
46780     // boolean values.
46781     for (SDNode *U : Cond->uses()) {
46782       if (U->getOpcode() == X86ISD::BLENDV)
46783         continue;
46784
46785       SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
46786                                Cond, U->getOperand(1), U->getOperand(2));
46787       DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
46788       DCI.AddToWorklist(U);
46789     }
46790     DCI.CommitTargetLoweringOpt(TLO);
46791     return SDValue(N, 0);
46792   }
46793
46794   // Otherwise we can still at least try to simplify multiple use bits.
46795   if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
46796       return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
46797                          N->getOperand(1), N->getOperand(2));
46798
46799   return SDValue();
46800 }
46801
46802 // Try to match:
46803 //   (or (and (M, (sub 0, X)), (pandn M, X)))
46804 // which is a special case of:
46805 //   (select M, (sub 0, X), X)
46806 // Per:
46807 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
46808 // We know that, if fNegate is 0 or 1:
46809 //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
46810 //
46811 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
46812 //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
46813 //   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
46814 // This lets us transform our vselect to:
46815 //   (add (xor X, M), (and M, 1))
46816 // And further to:
46817 //   (sub (xor X, M), M)
46818 static SDValue combineLogicBlendIntoConditionalNegate(
46819     EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
46820     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46821   EVT MaskVT = Mask.getValueType();
46822   assert(MaskVT.isInteger() &&
46823          DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
46824          "Mask must be zero/all-bits");
46825
46826   if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
46827     return SDValue();
46828   if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
46829     return SDValue();
46830
46831   auto IsNegV = [](SDNode *N, SDValue V) {
46832     return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
46833            ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
46834   };
46835
46836   SDValue V;
46837   if (IsNegV(Y.getNode(), X))
46838     V = X;
46839   else if (IsNegV(X.getNode(), Y))
46840     V = Y;
46841   else
46842     return SDValue();
46843
46844   SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
46845   SDValue SubOp2 = Mask;
46846
46847   // If the negate was on the false side of the select, then
46848   // the operands of the SUB need to be swapped. PR 27251.
46849   // This is because the pattern being matched above is
46850   // (vselect M, (sub (0, X), X)  -> (sub (xor X, M), M)
46851   // but if the pattern matched was
46852   // (vselect M, X, (sub (0, X))), that is really negation of the pattern
46853   // above, -(vselect M, (sub 0, X), X), and therefore the replacement
46854   // pattern also needs to be a negation of the replacement pattern above.
46855   // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
46856   // sub accomplishes the negation of the replacement pattern.
46857   if (V == Y)
46858     std::swap(SubOp1, SubOp2);
46859
46860   SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
46861   return DAG.getBitcast(VT, Res);
46862 }
46863
46864 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG,
46865                                   const X86Subtarget &Subtarget) {
46866   if (!Subtarget.hasAVX512())
46867     return SDValue();
46868   if (N->getOpcode() != ISD::VSELECT)
46869     return SDValue();
46870
46871   SDLoc DL(N);
46872   SDValue Cond = N->getOperand(0);
46873   SDValue LHS = N->getOperand(1);
46874   SDValue RHS = N->getOperand(2);
46875
46876   if (canCombineAsMaskOperation(LHS, Subtarget))
46877     return SDValue();
46878
46879   if (!canCombineAsMaskOperation(RHS, Subtarget))
46880     return SDValue();
46881
46882   if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
46883     return SDValue();
46884
46885   // Commute LHS and RHS to create opportunity to select mask instruction.
46886   // (vselect M, L, R) -> (vselect ~M, R, L)
46887   ISD::CondCode NewCC =
46888       ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
46889                            Cond.getOperand(0).getValueType());
46890   Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0),
46891                                         Cond.getOperand(1), NewCC);
46892   return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
46893 }
46894
46895 /// Do target-specific dag combines on SELECT and VSELECT nodes.
46896 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
46897                              TargetLowering::DAGCombinerInfo &DCI,
46898                              const X86Subtarget &Subtarget) {
46899   SDLoc DL(N);
46900   SDValue Cond = N->getOperand(0);
46901   SDValue LHS = N->getOperand(1);
46902   SDValue RHS = N->getOperand(2);
46903
46904   // Try simplification again because we use this function to optimize
46905   // BLENDV nodes that are not handled by the generic combiner.
46906   if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
46907     return V;
46908
46909   // When avx512 is available the lhs operand of select instruction can be
46910   // folded with mask instruction, while the rhs operand can't. Commute the
46911   // lhs and rhs of the select instruction to create the opportunity of
46912   // folding.
46913   if (SDValue V = commuteSelect(N, DAG, Subtarget))
46914     return V;
46915
46916   EVT VT = LHS.getValueType();
46917   EVT CondVT = Cond.getValueType();
46918   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46919   bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
46920
46921   // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
46922   // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
46923   // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
46924   if (CondVT.isVector() && CondVT.isInteger() &&
46925       CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
46926       (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
46927       DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
46928     if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
46929                                                            DL, DAG, Subtarget))
46930       return V;
46931
46932   // Convert vselects with constant condition into shuffles.
46933   if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
46934       (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
46935     SmallVector<int, 64> Mask;
46936     if (createShuffleMaskFromVSELECT(Mask, Cond,
46937                                      N->getOpcode() == X86ISD::BLENDV))
46938       return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
46939   }
46940
46941   // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
46942   // by forcing the unselected elements to zero.
46943   // TODO: Can we handle more shuffles with this?
46944   if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
46945       LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
46946       LHS.hasOneUse() && RHS.hasOneUse()) {
46947     MVT SimpleVT = VT.getSimpleVT();
46948     SmallVector<SDValue, 1> LHSOps, RHSOps;
46949     SmallVector<int, 64> LHSMask, RHSMask, CondMask;
46950     if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
46951         getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
46952         getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
46953       int NumElts = VT.getVectorNumElements();
46954       for (int i = 0; i != NumElts; ++i) {
46955         // getConstVector sets negative shuffle mask values as undef, so ensure
46956         // we hardcode SM_SentinelZero values to zero (0x80).
46957         if (CondMask[i] < NumElts) {
46958           LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
46959           RHSMask[i] = 0x80;
46960         } else {
46961           LHSMask[i] = 0x80;
46962           RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
46963         }
46964       }
46965       LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
46966                         getConstVector(LHSMask, SimpleVT, DAG, DL, true));
46967       RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
46968                         getConstVector(RHSMask, SimpleVT, DAG, DL, true));
46969       return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
46970     }
46971   }
46972
46973   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
46974   // instructions match the semantics of the common C idiom x<y?x:y but not
46975   // x<=y?x:y, because of how they handle negative zero (which can be
46976   // ignored in unsafe-math mode).
46977   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
46978   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
46979       VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
46980       (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
46981       (Subtarget.hasSSE2() ||
46982        (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
46983     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46984
46985     unsigned Opcode = 0;
46986     // Check for x CC y ? x : y.
46987     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
46988         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
46989       switch (CC) {
46990       default: break;
46991       case ISD::SETULT:
46992         // Converting this to a min would handle NaNs incorrectly, and swapping
46993         // the operands would cause it to handle comparisons between positive
46994         // and negative zero incorrectly.
46995         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46996           if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46997               !(DAG.isKnownNeverZeroFloat(LHS) ||
46998                 DAG.isKnownNeverZeroFloat(RHS)))
46999             break;
47000           std::swap(LHS, RHS);
47001         }
47002         Opcode = X86ISD::FMIN;
47003         break;
47004       case ISD::SETOLE:
47005         // Converting this to a min would handle comparisons between positive
47006         // and negative zero incorrectly.
47007         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47008             !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47009           break;
47010         Opcode = X86ISD::FMIN;
47011         break;
47012       case ISD::SETULE:
47013         // Converting this to a min would handle both negative zeros and NaNs
47014         // incorrectly, but we can swap the operands to fix both.
47015         std::swap(LHS, RHS);
47016         [[fallthrough]];
47017       case ISD::SETOLT:
47018       case ISD::SETLT:
47019       case ISD::SETLE:
47020         Opcode = X86ISD::FMIN;
47021         break;
47022
47023       case ISD::SETOGE:
47024         // Converting this to a max would handle comparisons between positive
47025         // and negative zero incorrectly.
47026         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47027             !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47028           break;
47029         Opcode = X86ISD::FMAX;
47030         break;
47031       case ISD::SETUGT:
47032         // Converting this to a max would handle NaNs incorrectly, and swapping
47033         // the operands would cause it to handle comparisons between positive
47034         // and negative zero incorrectly.
47035         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47036           if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47037               !(DAG.isKnownNeverZeroFloat(LHS) ||
47038                 DAG.isKnownNeverZeroFloat(RHS)))
47039             break;
47040           std::swap(LHS, RHS);
47041         }
47042         Opcode = X86ISD::FMAX;
47043         break;
47044       case ISD::SETUGE:
47045         // Converting this to a max would handle both negative zeros and NaNs
47046         // incorrectly, but we can swap the operands to fix both.
47047         std::swap(LHS, RHS);
47048         [[fallthrough]];
47049       case ISD::SETOGT:
47050       case ISD::SETGT:
47051       case ISD::SETGE:
47052         Opcode = X86ISD::FMAX;
47053         break;
47054       }
47055     // Check for x CC y ? y : x -- a min/max with reversed arms.
47056     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
47057                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
47058       switch (CC) {
47059       default: break;
47060       case ISD::SETOGE:
47061         // Converting this to a min would handle comparisons between positive
47062         // and negative zero incorrectly, and swapping the operands would
47063         // cause it to handle NaNs incorrectly.
47064         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47065             !(DAG.isKnownNeverZeroFloat(LHS) ||
47066               DAG.isKnownNeverZeroFloat(RHS))) {
47067           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47068             break;
47069           std::swap(LHS, RHS);
47070         }
47071         Opcode = X86ISD::FMIN;
47072         break;
47073       case ISD::SETUGT:
47074         // Converting this to a min would handle NaNs incorrectly.
47075         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47076           break;
47077         Opcode = X86ISD::FMIN;
47078         break;
47079       case ISD::SETUGE:
47080         // Converting this to a min would handle both negative zeros and NaNs
47081         // incorrectly, but we can swap the operands to fix both.
47082         std::swap(LHS, RHS);
47083         [[fallthrough]];
47084       case ISD::SETOGT:
47085       case ISD::SETGT:
47086       case ISD::SETGE:
47087         Opcode = X86ISD::FMIN;
47088         break;
47089
47090       case ISD::SETULT:
47091         // Converting this to a max would handle NaNs incorrectly.
47092         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47093           break;
47094         Opcode = X86ISD::FMAX;
47095         break;
47096       case ISD::SETOLE:
47097         // Converting this to a max would handle comparisons between positive
47098         // and negative zero incorrectly, and swapping the operands would
47099         // cause it to handle NaNs incorrectly.
47100         if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47101             !DAG.isKnownNeverZeroFloat(LHS) &&
47102             !DAG.isKnownNeverZeroFloat(RHS)) {
47103           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47104             break;
47105           std::swap(LHS, RHS);
47106         }
47107         Opcode = X86ISD::FMAX;
47108         break;
47109       case ISD::SETULE:
47110         // Converting this to a max would handle both negative zeros and NaNs
47111         // incorrectly, but we can swap the operands to fix both.
47112         std::swap(LHS, RHS);
47113         [[fallthrough]];
47114       case ISD::SETOLT:
47115       case ISD::SETLT:
47116       case ISD::SETLE:
47117         Opcode = X86ISD::FMAX;
47118         break;
47119       }
47120     }
47121
47122     if (Opcode)
47123       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47124   }
47125
47126   // Some mask scalar intrinsics rely on checking if only one bit is set
47127   // and implement it in C code like this:
47128   // A[0] = (U & 1) ? A[0] : W[0];
47129   // This creates some redundant instructions that break pattern matching.
47130   // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
47131   if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47132       Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47133     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47134     SDValue AndNode = Cond.getOperand(0);
47135     if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47136         isNullConstant(Cond.getOperand(1)) &&
47137         isOneConstant(AndNode.getOperand(1))) {
47138       // LHS and RHS swapped due to
47139       // setcc outputting 1 when AND resulted in 0 and vice versa.
47140       AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47141       return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47142     }
47143   }
47144
47145   // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47146   // lowering on KNL. In this case we convert it to
47147   // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
47148   // The same situation all vectors of i8 and i16 without BWI.
47149   // Make sure we extend these even before type legalization gets a chance to
47150   // split wide vectors.
47151   // Since SKX these selects have a proper lowering.
47152   if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47153       CondVT.getVectorElementType() == MVT::i1 &&
47154       (VT.getVectorElementType() == MVT::i8 ||
47155        VT.getVectorElementType() == MVT::i16)) {
47156     Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
47157     return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47158   }
47159
47160   // AVX512 - Extend select with zero to merge with target shuffle.
47161   // select(mask, extract_subvector(shuffle(x)), zero) -->
47162   // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
47163   // TODO - support non target shuffles as well.
47164   if (Subtarget.hasAVX512() && CondVT.isVector() &&
47165       CondVT.getVectorElementType() == MVT::i1) {
47166     auto SelectableOp = [&TLI](SDValue Op) {
47167       return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
47168              isTargetShuffle(Op.getOperand(0).getOpcode()) &&
47169              isNullConstant(Op.getOperand(1)) &&
47170              TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
47171              Op.hasOneUse() && Op.getOperand(0).hasOneUse();
47172     };
47173
47174     bool SelectableLHS = SelectableOp(LHS);
47175     bool SelectableRHS = SelectableOp(RHS);
47176     bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
47177     bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
47178
47179     if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
47180       EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
47181                                 : RHS.getOperand(0).getValueType();
47182       EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
47183       LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
47184                             VT.getSizeInBits());
47185       RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
47186                             VT.getSizeInBits());
47187       Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
47188                          DAG.getUNDEF(SrcCondVT), Cond,
47189                          DAG.getIntPtrConstant(0, DL));
47190       SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
47191       return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
47192     }
47193   }
47194
47195   if (SDValue V = combineSelectOfTwoConstants(N, DAG))
47196     return V;
47197
47198   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
47199       Cond.hasOneUse()) {
47200     EVT CondVT = Cond.getValueType();
47201     SDValue Cond0 = Cond.getOperand(0);
47202     SDValue Cond1 = Cond.getOperand(1);
47203     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47204
47205     // Canonicalize min/max:
47206     // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
47207     // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
47208     // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
47209     // the need for an extra compare against zero. e.g.
47210     // (a - b) > 0 : (a - b) ? 0 -> (a - b) >= 0 : (a - b) ? 0
47211     // subl   %esi, %edi
47212     // testl  %edi, %edi
47213     // movl   $0, %eax
47214     // cmovgl %edi, %eax
47215     // =>
47216     // xorl   %eax, %eax
47217     // subl   %esi, $edi
47218     // cmovsl %eax, %edi
47219     //
47220     // We can also canonicalize
47221     //  (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
47222     //  (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
47223     // This allows the use of a test instruction for the compare.
47224     if (LHS == Cond0 && RHS == Cond1) {
47225       if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
47226           (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
47227         ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
47228         Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47229         return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47230       }
47231       if (CC == ISD::SETUGT && isOneConstant(RHS)) {
47232         ISD::CondCode NewCC = ISD::SETUGE;
47233         Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
47234         return DAG.getSelect(DL, VT, Cond, LHS, RHS);
47235       }
47236     }
47237
47238     // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
47239     // fold eq + gt/lt nested selects into ge/le selects
47240     // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
47241     // --> (select (cmpuge Cond0, Cond1), LHS, Y)
47242     // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
47243     // --> (select (cmpsle Cond0, Cond1), LHS, Y)
47244     // .. etc ..
47245     if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
47246         RHS.getOperand(0).getOpcode() == ISD::SETCC) {
47247       SDValue InnerSetCC = RHS.getOperand(0);
47248       ISD::CondCode InnerCC =
47249           cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
47250       if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
47251           Cond0 == InnerSetCC.getOperand(0) &&
47252           Cond1 == InnerSetCC.getOperand(1)) {
47253         ISD::CondCode NewCC;
47254         switch (CC == ISD::SETEQ ? InnerCC : CC) {
47255         case ISD::SETGT:  NewCC = ISD::SETGE; break;
47256         case ISD::SETLT:  NewCC = ISD::SETLE; break;
47257         case ISD::SETUGT: NewCC = ISD::SETUGE; break;
47258         case ISD::SETULT: NewCC = ISD::SETULE; break;
47259         default: NewCC = ISD::SETCC_INVALID; break;
47260         }
47261         if (NewCC != ISD::SETCC_INVALID) {
47262           Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
47263           return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
47264         }
47265       }
47266     }
47267   }
47268
47269   // Check if the first operand is all zeros and Cond type is vXi1.
47270   // If this an avx512 target we can improve the use of zero masking by
47271   // swapping the operands and inverting the condition.
47272   if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
47273       Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
47274       ISD::isBuildVectorAllZeros(LHS.getNode()) &&
47275       !ISD::isBuildVectorAllZeros(RHS.getNode())) {
47276     // Invert the cond to not(cond) : xor(op,allones)=not(op)
47277     SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
47278     // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
47279     return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
47280   }
47281
47282   // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
47283   // get split by legalization.
47284   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
47285       CondVT.getVectorElementType() == MVT::i1 &&
47286       TLI.isTypeLegal(VT.getScalarType())) {
47287     EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
47288     if (SDValue ExtCond = combineToExtendBoolVectorInReg(
47289             ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
47290       ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
47291       return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
47292     }
47293   }
47294
47295   // Early exit check
47296   if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
47297     return SDValue();
47298
47299   if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
47300     return V;
47301
47302   if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
47303     return V;
47304
47305   if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
47306     return V;
47307
47308   // select(~Cond, X, Y) -> select(Cond, Y, X)
47309   if (CondVT.getScalarType() != MVT::i1) {
47310     if (SDValue CondNot = IsNOT(Cond, DAG))
47311       return DAG.getNode(N->getOpcode(), DL, VT,
47312                          DAG.getBitcast(CondVT, CondNot), RHS, LHS);
47313
47314     // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
47315     // signbit.
47316     if (Cond.getOpcode() == X86ISD::PCMPGT &&
47317         ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
47318         Cond.hasOneUse()) {
47319       Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
47320                          DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
47321       return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
47322     }
47323   }
47324
47325   // Try to optimize vXi1 selects if both operands are either all constants or
47326   // bitcasts from scalar integer type. In that case we can convert the operands
47327   // to integer and use an integer select which will be converted to a CMOV.
47328   // We need to take a little bit of care to avoid creating an i64 type after
47329   // type legalization.
47330   if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
47331       VT.getVectorElementType() == MVT::i1 &&
47332       (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
47333     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
47334     if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
47335       bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
47336       bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
47337
47338       if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
47339                           LHS.getOperand(0).getValueType() == IntVT)) &&
47340           (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
47341                           RHS.getOperand(0).getValueType() == IntVT))) {
47342         if (LHSIsConst)
47343           LHS = combinevXi1ConstantToInteger(LHS, DAG);
47344         else
47345           LHS = LHS.getOperand(0);
47346
47347         if (RHSIsConst)
47348           RHS = combinevXi1ConstantToInteger(RHS, DAG);
47349         else
47350           RHS = RHS.getOperand(0);
47351
47352         SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
47353         return DAG.getBitcast(VT, Select);
47354       }
47355     }
47356   }
47357
47358   // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
47359   // single bits, then invert the predicate and swap the select operands.
47360   // This can lower using a vector shift bit-hack rather than mask and compare.
47361   if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
47362       N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
47363       Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
47364       Cond.getOperand(0).getOpcode() == ISD::AND &&
47365       isNullOrNullSplat(Cond.getOperand(1)) &&
47366       cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
47367       Cond.getOperand(0).getValueType() == VT) {
47368     // The 'and' mask must be composed of power-of-2 constants.
47369     SDValue And = Cond.getOperand(0);
47370     auto *C = isConstOrConstSplat(And.getOperand(1));
47371     if (C && C->getAPIntValue().isPowerOf2()) {
47372       // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
47373       SDValue NotCond =
47374           DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
47375       return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
47376     }
47377
47378     // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
47379     // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
47380     // 16-bit lacks a proper blendv.
47381     unsigned EltBitWidth = VT.getScalarSizeInBits();
47382     bool CanShiftBlend =
47383         TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
47384                                 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
47385                                 (Subtarget.hasXOP()));
47386     if (CanShiftBlend &&
47387         ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
47388           return C->getAPIntValue().isPowerOf2();
47389         })) {
47390       // Create a left-shift constant to get the mask bits over to the sign-bit.
47391       SDValue Mask = And.getOperand(1);
47392       SmallVector<int, 32> ShlVals;
47393       for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
47394         auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
47395         ShlVals.push_back(EltBitWidth - 1 -
47396                           MaskVal->getAPIntValue().exactLogBase2());
47397       }
47398       // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
47399       SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
47400       SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
47401       SDValue NewCond =
47402           DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
47403       return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
47404     }
47405   }
47406
47407   return SDValue();
47408 }
47409
47410 /// Combine:
47411 ///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
47412 /// to:
47413 ///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
47414 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
47415 /// Note that this is only legal for some op/cc combinations.
47416 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
47417                                        SelectionDAG &DAG,
47418                                        const X86Subtarget &Subtarget) {
47419   // This combine only operates on CMP-like nodes.
47420   if (!(Cmp.getOpcode() == X86ISD::CMP ||
47421         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47422     return SDValue();
47423
47424   // Can't replace the cmp if it has more uses than the one we're looking at.
47425   // FIXME: We would like to be able to handle this, but would need to make sure
47426   // all uses were updated.
47427   if (!Cmp.hasOneUse())
47428     return SDValue();
47429
47430   // This only applies to variations of the common case:
47431   //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
47432   //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
47433   //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
47434   //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
47435   // Using the proper condcodes (see below), overflow is checked for.
47436
47437   // FIXME: We can generalize both constraints:
47438   // - XOR/OR/AND (if they were made to survive AtomicExpand)
47439   // - LHS != 1
47440   // if the result is compared.
47441
47442   SDValue CmpLHS = Cmp.getOperand(0);
47443   SDValue CmpRHS = Cmp.getOperand(1);
47444   EVT CmpVT = CmpLHS.getValueType();
47445
47446   if (!CmpLHS.hasOneUse())
47447     return SDValue();
47448
47449   unsigned Opc = CmpLHS.getOpcode();
47450   if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
47451     return SDValue();
47452
47453   SDValue OpRHS = CmpLHS.getOperand(2);
47454   auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
47455   if (!OpRHSC)
47456     return SDValue();
47457
47458   APInt Addend = OpRHSC->getAPIntValue();
47459   if (Opc == ISD::ATOMIC_LOAD_SUB)
47460     Addend = -Addend;
47461
47462   auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
47463   if (!CmpRHSC)
47464     return SDValue();
47465
47466   APInt Comparison = CmpRHSC->getAPIntValue();
47467   APInt NegAddend = -Addend;
47468
47469   // See if we can adjust the CC to make the comparison match the negated
47470   // addend.
47471   if (Comparison != NegAddend) {
47472     APInt IncComparison = Comparison + 1;
47473     if (IncComparison == NegAddend) {
47474       if (CC == X86::COND_A && !Comparison.isMaxValue()) {
47475         Comparison = IncComparison;
47476         CC = X86::COND_AE;
47477       } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
47478         Comparison = IncComparison;
47479         CC = X86::COND_L;
47480       }
47481     }
47482     APInt DecComparison = Comparison - 1;
47483     if (DecComparison == NegAddend) {
47484       if (CC == X86::COND_AE && !Comparison.isMinValue()) {
47485         Comparison = DecComparison;
47486         CC = X86::COND_A;
47487       } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
47488         Comparison = DecComparison;
47489         CC = X86::COND_LE;
47490       }
47491     }
47492   }
47493
47494   // If the addend is the negation of the comparison value, then we can do
47495   // a full comparison by emitting the atomic arithmetic as a locked sub.
47496   if (Comparison == NegAddend) {
47497     // The CC is fine, but we need to rewrite the LHS of the comparison as an
47498     // atomic sub.
47499     auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
47500     auto AtomicSub = DAG.getAtomic(
47501         ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
47502         /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
47503         /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
47504         AN->getMemOperand());
47505     auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
47506     DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47507     DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47508     return LockOp;
47509   }
47510
47511   // We can handle comparisons with zero in a number of cases by manipulating
47512   // the CC used.
47513   if (!Comparison.isZero())
47514     return SDValue();
47515
47516   if (CC == X86::COND_S && Addend == 1)
47517     CC = X86::COND_LE;
47518   else if (CC == X86::COND_NS && Addend == 1)
47519     CC = X86::COND_G;
47520   else if (CC == X86::COND_G && Addend == -1)
47521     CC = X86::COND_GE;
47522   else if (CC == X86::COND_LE && Addend == -1)
47523     CC = X86::COND_L;
47524   else
47525     return SDValue();
47526
47527   SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
47528   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47529   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47530   return LockOp;
47531 }
47532
47533 // Check whether a boolean test is testing a boolean value generated by
47534 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
47535 // code.
47536 //
47537 // Simplify the following patterns:
47538 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
47539 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
47540 // to (Op EFLAGS Cond)
47541 //
47542 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
47543 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
47544 // to (Op EFLAGS !Cond)
47545 //
47546 // where Op could be BRCOND or CMOV.
47547 //
47548 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
47549   // This combine only operates on CMP-like nodes.
47550   if (!(Cmp.getOpcode() == X86ISD::CMP ||
47551         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47552     return SDValue();
47553
47554   // Quit if not used as a boolean value.
47555   if (CC != X86::COND_E && CC != X86::COND_NE)
47556     return SDValue();
47557
47558   // Check CMP operands. One of them should be 0 or 1 and the other should be
47559   // an SetCC or extended from it.
47560   SDValue Op1 = Cmp.getOperand(0);
47561   SDValue Op2 = Cmp.getOperand(1);
47562
47563   SDValue SetCC;
47564   const ConstantSDNode* C = nullptr;
47565   bool needOppositeCond = (CC == X86::COND_E);
47566   bool checkAgainstTrue = false; // Is it a comparison against 1?
47567
47568   if ((C = dyn_cast<ConstantSDNode>(Op1)))
47569     SetCC = Op2;
47570   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
47571     SetCC = Op1;
47572   else // Quit if all operands are not constants.
47573     return SDValue();
47574
47575   if (C->getZExtValue() == 1) {
47576     needOppositeCond = !needOppositeCond;
47577     checkAgainstTrue = true;
47578   } else if (C->getZExtValue() != 0)
47579     // Quit if the constant is neither 0 or 1.
47580     return SDValue();
47581
47582   bool truncatedToBoolWithAnd = false;
47583   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
47584   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
47585          SetCC.getOpcode() == ISD::TRUNCATE ||
47586          SetCC.getOpcode() == ISD::AND) {
47587     if (SetCC.getOpcode() == ISD::AND) {
47588       int OpIdx = -1;
47589       if (isOneConstant(SetCC.getOperand(0)))
47590         OpIdx = 1;
47591       if (isOneConstant(SetCC.getOperand(1)))
47592         OpIdx = 0;
47593       if (OpIdx < 0)
47594         break;
47595       SetCC = SetCC.getOperand(OpIdx);
47596       truncatedToBoolWithAnd = true;
47597     } else
47598       SetCC = SetCC.getOperand(0);
47599   }
47600
47601   switch (SetCC.getOpcode()) {
47602   case X86ISD::SETCC_CARRY:
47603     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
47604     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
47605     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
47606     // truncated to i1 using 'and'.
47607     if (checkAgainstTrue && !truncatedToBoolWithAnd)
47608       break;
47609     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
47610            "Invalid use of SETCC_CARRY!");
47611     [[fallthrough]];
47612   case X86ISD::SETCC:
47613     // Set the condition code or opposite one if necessary.
47614     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
47615     if (needOppositeCond)
47616       CC = X86::GetOppositeBranchCondition(CC);
47617     return SetCC.getOperand(1);
47618   case X86ISD::CMOV: {
47619     // Check whether false/true value has canonical one, i.e. 0 or 1.
47620     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
47621     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
47622     // Quit if true value is not a constant.
47623     if (!TVal)
47624       return SDValue();
47625     // Quit if false value is not a constant.
47626     if (!FVal) {
47627       SDValue Op = SetCC.getOperand(0);
47628       // Skip 'zext' or 'trunc' node.
47629       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
47630           Op.getOpcode() == ISD::TRUNCATE)
47631         Op = Op.getOperand(0);
47632       // A special case for rdrand/rdseed, where 0 is set if false cond is
47633       // found.
47634       if ((Op.getOpcode() != X86ISD::RDRAND &&
47635            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
47636         return SDValue();
47637     }
47638     // Quit if false value is not the constant 0 or 1.
47639     bool FValIsFalse = true;
47640     if (FVal && FVal->getZExtValue() != 0) {
47641       if (FVal->getZExtValue() != 1)
47642         return SDValue();
47643       // If FVal is 1, opposite cond is needed.
47644       needOppositeCond = !needOppositeCond;
47645       FValIsFalse = false;
47646     }
47647     // Quit if TVal is not the constant opposite of FVal.
47648     if (FValIsFalse && TVal->getZExtValue() != 1)
47649       return SDValue();
47650     if (!FValIsFalse && TVal->getZExtValue() != 0)
47651       return SDValue();
47652     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
47653     if (needOppositeCond)
47654       CC = X86::GetOppositeBranchCondition(CC);
47655     return SetCC.getOperand(3);
47656   }
47657   }
47658
47659   return SDValue();
47660 }
47661
47662 /// Recognize a boolean AND/OR of two X86ISD::SETCC nodes that read the same
47663 /// EFLAGS value, optionally wrapped in a compare against zero:
47664 ///   (X86or (X86setcc) (X86setcc))
47665 ///   (X86cmp (and (X86setcc) (X86setcc)), 0)
47666 /// On a match, returns true with both condition codes in \p CC0 / \p CC1,
47667 /// the shared flags in \p Flags and the kind of logic op in \p isAnd.
47668 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
47669                                            X86::CondCode &CC1, SDValue &Flags,
47670                                            bool &isAnd) {
47671   // Look through (X86cmp X, 0) to the value being tested.
47672   if (Cond->getOpcode() == X86ISD::CMP) {
47673     if (!isNullConstant(Cond->getOperand(1)))
47674       return false;
47675     Cond = Cond->getOperand(0);
47676   }
47677
47678   // Classify the logic op; anything other than AND/OR is rejected, with
47679   // isAnd left false in that case.
47680   const unsigned LogicOpc = Cond->getOpcode();
47681   const bool MatchedAnd = LogicOpc == ISD::AND || LogicOpc == X86ISD::AND;
47682   const bool MatchedOr = LogicOpc == ISD::OR || LogicOpc == X86ISD::OR;
47683   isAnd = MatchedAnd;
47684   if (!MatchedAnd && !MatchedOr)
47685     return false;
47686
47687   SDValue LHS = Cond->getOperand(0);
47688   SDValue RHS = Cond->getOperand(1);
47689
47690   // Both operands must be SETCC nodes ...
47691   if (LHS.getOpcode() != X86ISD::SETCC || RHS.getOpcode() != X86ISD::SETCC)
47692     return false;
47693   // ... that consume the very same flags value.
47694   if (LHS->getOperand(1) != RHS->getOperand(1))
47695     return false;
47696
47697   // Operand 0 of the SETCC is read as the condition code, operand 1 as flags.
47698   CC0 = static_cast<X86::CondCode>(LHS->getConstantOperandVal(0));
47699   CC1 = static_cast<X86::CondCode>(RHS->getConstantOperandVal(0));
47700   Flags = LHS->getOperand(1);
47701   return true;
47702 }
47703
47704 // When legalizing carry, a carry is re-created as (X86add X, -1). If X was
47705 // produced from flags by a setcc, reuse those flags directly; a value masked
47706 // to its low bit is instead tested with getBT. Otherwise returns SDValue().
47707 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
47708   if (EFLAGS.getOpcode() != X86ISD::ADD ||
47709       !isAllOnesConstant(EFLAGS.getOperand(1)))
47710     return SDValue();
47711   // Strip trunc / zext / (and X, 1) wrappers, noting any low-bit mask.
47712   bool SawLowBitMask = false;
47713   SDValue Src = EFLAGS.getOperand(0);
47714   for (;;) {
47715     const unsigned Opc = Src.getOpcode();
47716     const bool IsLowBitMask =
47717         Opc == ISD::AND && isOneConstant(Src.getOperand(1));
47718     if (Opc != ISD::TRUNCATE && Opc != ISD::ZERO_EXTEND && !IsLowBitMask)
47719       break;
47720     SawLowBitMask |= IsLowBitMask;
47721     Src = Src.getOperand(0);
47722   }
47723
47724   if (Src.getOpcode() != X86ISD::SETCC &&
47725       Src.getOpcode() != X86ISD::SETCC_CARRY) {
47726     // Not a flag read: only a low-bit test can be salvaged, as a BT that
47727     // looks through a right shift to find the bit index.
47728     if (!SawLowBitMask)
47729       return SDValue();
47730     SDLoc DL(Src);
47731     SDValue BitNo = DAG.getConstant(0, DL, Src.getValueType());
47732     if (Src.getOpcode() == ISD::SRL) {
47733       BitNo = Src.getOperand(1);
47734       Src = Src.getOperand(0);
47735     }
47736     return getBT(Src, BitNo, DL, DAG);
47737   }
47738
47739   // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
47740   const uint64_t SetCCCond = Src.getConstantOperandVal(0);
47741   SDValue SetCCFlags = Src.getOperand(1);
47742   if (SetCCCond == X86::COND_B)
47743     return SetCCFlags;
47744
47745   // Try to convert COND_A into COND_B (by commuting the SUB) to facilitate
47746   // materializing "setb reg". Do not flip "e > c", where "c" is a constant,
47747   // because Cmp cannot take an immediate as its first operand.
47748   if (SetCCCond == X86::COND_A && SetCCFlags.getOpcode() == X86ISD::SUB &&
47749       SetCCFlags.getNode()->hasOneUse() &&
47750       SetCCFlags.getValueType().isInteger() &&
47751       !isa<ConstantSDNode>(SetCCFlags.getOperand(1))) {
47752     SDValue Swapped = DAG.getNode(
47753         X86ISD::SUB, SDLoc(SetCCFlags), SetCCFlags->getVTList(),
47754         SetCCFlags.getOperand(1), SetCCFlags.getOperand(0));
47755     return SDValue(Swapped.getNode(), SetCCFlags.getResNo());
47756   }
47757   // A check of the Z flag of an add with 1 can use the C flag instead.
47758   const bool IsZeroCheckOfInc = SetCCCond == X86::COND_E &&
47759                                 SetCCFlags.getOpcode() == X86ISD::ADD &&
47760                                 isOneConstant(SetCCFlags.getOperand(1));
47761   return IsZeroCheckOfInc ? SetCCFlags : SDValue();
47762 }
47763
47764 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
47765 /// to avoid the inversion.
      ///
      /// Several related PTEST/TESTP simplifications are tried as well; see the
      /// comments in the body. \p EFLAGS is the candidate flags node and \p CC the
      /// condition its user tests. \p CC is overwritten whenever the returned node
      /// has to be tested with a different condition.
      ///
      /// \returns the replacement flags node, or an empty SDValue when \p EFLAGS
      /// is not a PTEST/TESTP or none of the folds applies.
47766 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
47767                               SelectionDAG &DAG,
47768                               const X86Subtarget &Subtarget) {
47769   // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
47770   if (EFLAGS.getOpcode() != X86ISD::PTEST &&
47771       EFLAGS.getOpcode() != X86ISD::TESTP)
47772     return SDValue();
47773
47774   // PTEST/TESTP sets EFLAGS as:
47775   // TESTZ: ZF = (Op0 & Op1) == 0
47776   // TESTC: CF = (~Op0 & Op1) == 0
47777   // TESTNZC: ZF == 0 && CF == 0
47778   MVT VT = EFLAGS.getSimpleValueType();
47779   SDValue Op0 = EFLAGS.getOperand(0);
47780   SDValue Op1 = EFLAGS.getOperand(1);
47781   MVT OpVT = Op0.getSimpleValueType();
47782
47783   // TEST*(~X,Y) == TEST*(X,Y)
        // once the ZF-based and CF-based conditions are exchanged (the switch
        // below). Conditions without such a counterpart map to COND_INVALID and
        // leave the node untouched.
47784   if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
47785     X86::CondCode InvCC;
47786     switch (CC) {
47787     case X86::COND_B:
47788       // testc -> testz.
47789       InvCC = X86::COND_E;
47790       break;
47791     case X86::COND_AE:
47792       // !testc -> !testz.
47793       InvCC = X86::COND_NE;
47794       break;
47795     case X86::COND_E:
47796       // testz -> testc.
47797       InvCC = X86::COND_B;
47798       break;
47799     case X86::COND_NE:
47800       // !testz -> !testc.
47801       InvCC = X86::COND_AE;
47802       break;
47803     case X86::COND_A:
47804     case X86::COND_BE:
47805       // testnzc -> testnzc (no change).
47806       InvCC = CC;
47807       break;
47808     default:
47809       InvCC = X86::COND_INVALID;
47810       break;
47811     }
47812
47813     if (InvCC != X86::COND_INVALID) {
47814       CC = InvCC;
47815       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47816                          DAG.getBitcast(OpVT, NotOp0), Op1);
47817     }
47818   }
47819
47820   if (CC == X86::COND_B || CC == X86::COND_AE) {
47821     // TESTC(X,~X) == TESTC(X,-1)
47822     if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47823       if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
47824         SDLoc DL(EFLAGS);
47825         return DAG.getNode(
47826             EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
47827             DAG.getBitcast(OpVT,
47828                            DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
47829       }
47830     }
47831   }
47832
47833   if (CC == X86::COND_E || CC == X86::COND_NE) {
47834     // TESTZ(X,~Y) == TESTC(Y,X)
47835     if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47836       CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47837       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47838                          DAG.getBitcast(OpVT, NotOp1), Op0);
47839     }
47840
          // Both operands are the same node, so the test is on that single value.
47841     if (Op0 == Op1) {
47842       SDValue BC = peekThroughBitcasts(Op0);
47843       EVT BCVT = BC.getValueType();
47844
47845       // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
47846       if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
47847         return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47848                            DAG.getBitcast(OpVT, BC.getOperand(0)),
47849                            DAG.getBitcast(OpVT, BC.getOperand(1)));
47850       }
47851
47852       // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
47853       if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
47854         CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47855         return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47856                            DAG.getBitcast(OpVT, BC.getOperand(0)),
47857                            DAG.getBitcast(OpVT, BC.getOperand(1)));
47858       }
47859
47860       // If every element is an all-sign value, see if we can use TESTP/MOVMSK
47861       // to more efficiently extract the sign bits and compare that.
47862       // TODO: Handle TESTC with comparison inversion.
47863       // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
47864       // TESTP/MOVMSK combines to make sure it's never worse than PTEST?
47865       if (BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
47866         unsigned EltBits = BCVT.getScalarSizeInBits();
47867         if (DAG.ComputeNumSignBits(BC) == EltBits) {
47868           assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
47869           APInt SignMask = APInt::getSignMask(EltBits);
47870           const TargetLowering &TLI = DAG.getTargetLoweringInfo();
                // Only proceed if demanding just the sign bits yields a simpler
                // source value.
47871           if (SDValue Res =
47872                   TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
47873             // For vXi16 cases we need to use pmovmskb and extract every other
47874             // sign bit.
47875             SDLoc DL(EFLAGS);
47876             if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
                    // 32/64-bit elements with AVX: test the value against itself
                    // with TESTP on the matching floating-point vector type.
47877               MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
47878               MVT FloatVT =
47879                   MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
47880               Res = DAG.getBitcast(FloatVT, Res);
47881               return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
47882             } else if (EltBits == 16) {
47883               MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
47884               Res = DAG.getBitcast(MovmskVT, Res);
47885               Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
                    // 0xAAAAAAAA keeps only the odd bits of the byte mask.
47886               Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
47887                                 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47888             } else {
47889               Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
47890             }
                  // Compare the extracted mask against zero instead of using PTEST.
47891             return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
47892                                DAG.getConstant(0, DL, MVT::i32));
47893           }
47894         }
47895       }
47896     }
47897
47898     // TESTZ(-1,X) == TESTZ(X,X)
47899     if (ISD::isBuildVectorAllOnes(Op0.getNode()))
47900       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
47901
47902     // TESTZ(X,-1) == TESTZ(X,X)
47903     if (ISD::isBuildVectorAllOnes(Op1.getNode()))
47904       return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
47905
47906     // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
47907     // TODO: Add COND_NE handling?
47908     if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
47909       SDValue Src0 = peekThroughBitcasts(Op0);
47910       SDValue Src1 = peekThroughBitcasts(Op1);
47911       if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
47912         Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
47913                                  peekThroughBitcasts(Src0.getOperand(1)), true);
47914         Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
47915                                  peekThroughBitcasts(Src1.getOperand(1)), true);
              // Both ORs must combine the two halves of a single wider source.
47916         if (Src0 && Src1) {
47917           MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
47918           return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47919                              DAG.getBitcast(OpVT2, Src0),
47920                              DAG.getBitcast(OpVT2, Src1));
47921         }
47922       }
47923     }
47924   }
47925
47926   return SDValue();
47927 }
47928
47929 // Attempt to simplify the MOVMSK input based on the comparison type.
      //
      // \p EFLAGS must be an i32 X86ISD::CMP/SUB of a (possibly truncated) MOVMSK
      // against a constant, tested with COND_E/COND_NE. The constant must make it
      // either an any_of test (CMP against zero) or an all_of test (against a
      // mask of the low NumElts bits). Returns the replacement flags node,
      // updating \p CC where the new node needs a different condition, or an
      // empty SDValue if nothing applies.
47930 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
47931                                   SelectionDAG &DAG,
47932                                   const X86Subtarget &Subtarget) {
47933   // Handle eq/ne against zero (any_of).
47934   // Handle eq/ne against -1 (all_of).
47935   if (!(CC == X86::COND_E || CC == X86::COND_NE))
47936     return SDValue();
47937   if (EFLAGS.getValueType() != MVT::i32)
47938     return SDValue();
47939   unsigned CmpOpcode = EFLAGS.getOpcode();
47940   if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
47941     return SDValue();
47942   auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
47943   if (!CmpConstant)
47944     return SDValue();
47945   const APInt &CmpVal = CmpConstant->getAPIntValue();
47946
47947   SDValue CmpOp = EFLAGS.getOperand(0);
47948   unsigned CmpBits = CmpOp.getValueSizeInBits();
47949   assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
47950
47951   // Peek through any truncate.
47952   if (CmpOp.getOpcode() == ISD::TRUNCATE)
47953     CmpOp = CmpOp.getOperand(0);
47954
47955   // Bail if we don't find a MOVMSK.
47956   if (CmpOp.getOpcode() != X86ISD::MOVMSK)
47957     return SDValue();
47958
47959   SDValue Vec = CmpOp.getOperand(0);
47960   MVT VecVT = Vec.getSimpleValueType();
47961   assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
47962          "Unexpected MOVMSK operand");
47963   unsigned NumElts = VecVT.getVectorNumElements();
47964   unsigned NumEltBits = VecVT.getScalarSizeInBits();
47965
        // any_of: mask compared against zero. all_of: mask compared against a
        // constant with exactly the low NumElts bits set.
47966   bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
47967   bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
47968                  NumElts <= CmpBits && CmpVal.isMask(NumElts);
47969   if (!IsAnyOf && !IsAllOf)
47970     return SDValue();
47971
47972   // TODO: Check more combining cases for me.
47973   // Here we check the cmp use number to decide do combining or not.
47974   // Currently we only get 2 tests about combining "MOVMSK(CONCAT(..))"
47975   // and "MOVMSK(PCMPEQ(..))" are fit to use this constraint.
47976   bool IsOneUse = CmpOp.getNode()->hasOneUse();
47977
47978   // See if we can peek through to a vector with a wider element type, if the
47979   // signbits extend down to all the sub-elements as well.
47980   // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
47981   // potential SimplifyDemandedBits/Elts cases.
47982   // If we looked through a truncate that discard bits, we can't do this
47983   // transform.
47984   // FIXME: We could do this transform for truncates that discarded bits by
47985   // inserting an AND mask between the new MOVMSK and the CMP.
47986   if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
47987     SDValue BC = peekThroughBitcasts(Vec);
47988     MVT BCVT = BC.getSimpleValueType();
47989     unsigned BCNumElts = BCVT.getVectorNumElements();
47990     unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
47991     if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
47992         BCNumEltBits > NumEltBits &&
47993         DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
47994       SDLoc DL(EFLAGS);
47995       APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
47996       return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47997                          DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
47998                          DAG.getConstant(CmpMask, DL, MVT::i32));
47999     }
48000   }
48001
48002   // MOVMSK(CONCAT(X,Y)) == 0 ->  MOVMSK(OR(X,Y)).
48003   // MOVMSK(CONCAT(X,Y)) != 0 ->  MOVMSK(OR(X,Y)).
48004   // MOVMSK(CONCAT(X,Y)) == -1 ->  MOVMSK(AND(X,Y)).
48005   // MOVMSK(CONCAT(X,Y)) != -1 ->  MOVMSK(AND(X,Y)).
48006   if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48007     SmallVector<SDValue> Ops;
48008     if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48009         Ops.size() == 2) {
48010       SDLoc DL(EFLAGS);
48011       EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48012       APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48013       SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48014                               DAG.getBitcast(SubVT, Ops[0]),
48015                               DAG.getBitcast(SubVT, Ops[1]));
48016       V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48017       return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48018                          DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48019                          DAG.getConstant(CmpMask, DL, MVT::i32));
48020     }
48021   }
48022
48023   // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48024   // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48025   // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48026   // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
48027   if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48028     MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48029     SDValue BC = peekThroughBitcasts(Vec);
48030     // Ensure MOVMSK was testing every signbit of BC.
48031     if (BC.getValueType().getVectorNumElements() <= NumElts) {
48032       if (BC.getOpcode() == X86ISD::PCMPEQ) {
48033         SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48034                                 BC.getOperand(0), BC.getOperand(1));
48035         V = DAG.getBitcast(TestVT, V);
48036         return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48037       }
48038       // Check for 256-bit split vector cases.
48039       if (BC.getOpcode() == ISD::AND &&
48040           BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
48041           BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
48042         SDValue LHS = BC.getOperand(0);
48043         SDValue RHS = BC.getOperand(1);
48044         LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
48045                           LHS.getOperand(0), LHS.getOperand(1));
48046         RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
48047                           RHS.getOperand(0), RHS.getOperand(1));
48048         LHS = DAG.getBitcast(TestVT, LHS);
48049         RHS = DAG.getBitcast(TestVT, RHS);
48050         SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
48051         return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48052       }
48053     }
48054   }
48055
48056   // See if we can avoid a PACKSS by calling MOVMSK on the sources.
48057   // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
48058   // sign bits prior to the comparison with zero unless we know that
48059   // the vXi16 splats the sign bit down to the lower i8 half.
48060   // TODO: Handle all_of patterns.
48061   if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
48062     SDValue VecOp0 = Vec.getOperand(0);
48063     SDValue VecOp1 = Vec.getOperand(1);
48064     bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
48065     bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
48066     // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
48067     if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
48068       SDLoc DL(EFLAGS);
48069       SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
48070       Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48071       Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
48072       if (!SignExt0) {
48073         Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
48074                              DAG.getConstant(0xAAAA, DL, MVT::i16));
48075       }
48076       return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48077                          DAG.getConstant(0, DL, MVT::i16));
48078     }
48079     // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
48080     // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
48081     if (CmpBits >= 16 && Subtarget.hasInt256() &&
48082         (IsAnyOf || (SignExt0 && SignExt1))) {
48083       if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
48084         SDLoc DL(EFLAGS);
48085         SDValue Result = peekThroughBitcasts(Src);
48086         if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
48087             Result.getValueType().getVectorNumElements() <= NumElts) {
48088           SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
48089                                   Result.getOperand(0), Result.getOperand(1));
48090           V = DAG.getBitcast(MVT::v4i64, V);
48091           return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48092         }
48093         Result = DAG.getBitcast(MVT::v32i8, Result);
48094         Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48095         unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
48096         if (!SignExt0 || !SignExt1) {
48097           assert(IsAnyOf &&
48098                  "Only perform v16i16 signmasks for any_of patterns");
48099           Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
48100                                DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48101         }
48102         return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48103                            DAG.getConstant(CmpMask, DL, MVT::i32));
48104       }
48105     }
48106   }
48107
48108   // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
        // The shuffle is found by peeking through bitcasts, so its elements may be
        // narrower than the MOVMSK elements. MOVMSK only reads the top bit of each
        // of its own elements, so a shuffle that rearranges the narrower pieces
        // inside (or across) MOVMSK elements changes which sign bits are observed
        // even though every piece is still referenced, e.g.
        //   MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))).
        // Only fold when the mask moves whole MOVMSK elements, i.e. it is no finer
        // than NumElts entries or can be widened to exactly NumElts entries.
48109   SmallVector<int, 32> ShuffleMask, WidenedMask;
48110   SmallVector<SDValue, 2> ShuffleInputs;
48111   if (NumElts <= CmpBits &&
48112       getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
48113                              ShuffleMask, DAG) &&
48114       ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
48115       ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
48116     unsigned NumShuffleElts = ShuffleMask.size();
48117     APInt DemandedElts = APInt::getZero(NumShuffleElts);
48118     for (int M : ShuffleMask) {
48119       assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
48120       DemandedElts.setBit(M);
48121     }
          // A mask with at most NumElts entries already permutes whole MOVMSK
          // elements; a finer mask must widen cleanly to NumElts entries.
          bool MovesWholeElts = NumShuffleElts <= NumElts;
          if (!MovesWholeElts && (NumShuffleElts % NumElts) == 0)
            MovesWholeElts = llvm::widenShuffleMaskElts(
                NumShuffleElts / NumElts, ShuffleMask, WidenedMask);
48122     if (MovesWholeElts && DemandedElts.isAllOnes()) {
48123       SDLoc DL(EFLAGS);
48124       SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
48125       Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48126       Result =
48127           DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
48128       return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
48129                          EFLAGS.getOperand(1));
48130     }
48131   }
48132
48133   // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
48134   // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
48135   // MOVMSKPS(V) !=/== -1 -> TESTPS(V,-1), tested on CF (COND_AE/COND_B)
48136   // MOVMSKPD(V) !=/== -1 -> TESTPD(V,-1), tested on CF (COND_AE/COND_B)
48137   // iff every element is referenced.
48138   if (NumElts <= CmpBits && Subtarget.hasAVX() &&
48139       !Subtarget.preferMovmskOverVTest() && IsOneUse &&
48140       (NumEltBits == 32 || NumEltBits == 64)) {
48141     SDLoc DL(EFLAGS);
48142     MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
48143     MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
48144     MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
48145     SDValue LHS = Vec;
48146     SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
48147     CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48148     return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
48149                        DAG.getBitcast(FloatVT, LHS),
48150                        DAG.getBitcast(FloatVT, RHS));
48151   }
48152
48153   return SDValue();
48154 }
48155
48156 /// Simplify the EFLAGS definition \p EFLAGS for a user testing condition
48157 /// \p CC: the folds below are tried in order and the first hit wins. A fold
48158 /// may update \p CC to suit the flags it returns (and may replace chain uses).
48159 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
48160                                   SelectionDAG &DAG,
48161                                   const X86Subtarget &Subtarget) {
48162   SDValue NewFlags;
48163   // The carry-through-ADD fold is only attempted for carry (COND_B) tests.
48164   if (CC == X86::COND_B)
48165     NewFlags = combineCarryThroughADD(EFLAGS, DAG);
48166
48167   if (!NewFlags)
48168     NewFlags = checkBoolTestSetCCCombine(EFLAGS, CC);
48169   if (!NewFlags)
48170     NewFlags = combinePTESTCC(EFLAGS, CC, DAG, Subtarget);
48171   if (!NewFlags)
48172     NewFlags = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget);
48173   if (!NewFlags)
48174     NewFlags = combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
48175   return NewFlags;
48176 }
48177
48178 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
48179 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
48180                            TargetLowering::DAGCombinerInfo &DCI,
48181                            const X86Subtarget &Subtarget) {
48182   SDLoc DL(N);
48183
48184   SDValue FalseOp = N->getOperand(0);
48185   SDValue TrueOp = N->getOperand(1);
48186   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
48187   SDValue Cond = N->getOperand(3);
48188
48189   // cmov X, X, ?, ? --> X
48190   if (TrueOp == FalseOp)
48191     return TrueOp;
48192
48193   // Try to simplify the EFLAGS and condition code operands.
48194   // We can't always do this as FCMOV only supports a subset of X86 cond.
48195   if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
48196     if (!(FalseOp.getValueType() == MVT::f80 ||
48197           (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
48198           (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
48199         !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
48200       SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
48201                        Flags};
48202       return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
48203     }
48204   }
48205
48206   // If this is a select between two integer constants, try to do some
48207   // optimizations.  Note that the operands are ordered the opposite of SELECT
48208   // operands.
48209   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
48210     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
48211       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
48212       // larger than FalseC (the false value).
48213       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
48214         CC = X86::GetOppositeBranchCondition(CC);
48215         std::swap(TrueC, FalseC);
48216         std::swap(TrueOp, FalseOp);
48217       }
48218
48219       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
48220       // This is efficient for any integer data type (including i8/i16) and
48221       // shift amount.
48222       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
48223         Cond = getSETCC(CC, Cond, DL, DAG);
48224
48225         // Zero extend the condition if needed.
48226         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
48227
48228         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
48229         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
48230                            DAG.getConstant(ShAmt, DL, MVT::i8));
48231         return Cond;
48232       }
48233
48234       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
48235       // for any integer data type, including i8/i16.
48236       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
48237         Cond = getSETCC(CC, Cond, DL, DAG);
48238
48239         // Zero extend the condition if needed.
48240         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
48241                            FalseC->getValueType(0), Cond);
48242         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
48243                            SDValue(FalseC, 0));
48244         return Cond;
48245       }
48246
48247       // Optimize cases that will turn into an LEA instruction.  This requires
48248       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
48249       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
48250         APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
48251         assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
48252                "Implicit constant truncation");
48253
48254         bool isFastMultiplier = false;
48255         if (Diff.ult(10)) {
48256           switch (Diff.getZExtValue()) {
48257           default: break;
48258           case 1:  // result = add base, cond
48259           case 2:  // result = lea base(    , cond*2)
48260           case 3:  // result = lea base(cond, cond*2)
48261           case 4:  // result = lea base(    , cond*4)
48262           case 5:  // result = lea base(cond, cond*4)
48263           case 8:  // result = lea base(    , cond*8)
48264           case 9:  // result = lea base(cond, cond*8)
48265             isFastMultiplier = true;
48266             break;
48267           }
48268         }
48269
48270         if (isFastMultiplier) {
48271           Cond = getSETCC(CC, Cond, DL ,DAG);
48272           // Zero extend the condition if needed.
48273           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
48274                              Cond);
48275           // Scale the condition by the difference.
48276           if (Diff != 1)
48277             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
48278                                DAG.getConstant(Diff, DL, Cond.getValueType()));
48279
48280           // Add the base if non-zero.
48281           if (FalseC->getAPIntValue() != 0)
48282             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
48283                                SDValue(FalseC, 0));
48284           return Cond;
48285         }
48286       }
48287     }
48288   }
48289
48290   // Handle these cases:
48291   //   (select (x != c), e, c) -> select (x != c), e, x),
48292   //   (select (x == c), c, e) -> select (x == c), x, e)
48293   // where the c is an integer constant, and the "select" is the combination
48294   // of CMOV and CMP.
48295   //
48296   // The rationale for this change is that the conditional-move from a constant
48297   // needs two instructions, however, conditional-move from a register needs
48298   // only one instruction.
48299   //
48300   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
48301   //  some instruction-combining opportunities. This opt needs to be
48302   //  postponed as late as possible.
48303   //
48304   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
48305     // the DCI.xxxx conditions are provided to postpone the optimization as
48306     // late as possible.
48307
48308     ConstantSDNode *CmpAgainst = nullptr;
48309     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
48310         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
48311         !isa<ConstantSDNode>(Cond.getOperand(0))) {
48312
48313       if (CC == X86::COND_NE &&
48314           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
48315         CC = X86::GetOppositeBranchCondition(CC);
48316         std::swap(TrueOp, FalseOp);
48317       }
48318
48319       if (CC == X86::COND_E &&
48320           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
48321         SDValue Ops[] = {FalseOp, Cond.getOperand(0),
48322                          DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
48323         return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
48324       }
48325     }
48326   }
48327
48328   // Transform:
48329   //
48330   //   (cmov 1 T (uge T 2))
48331   //
48332   // to:
48333   //
48334   //   (adc T 0 (sub T 1))
48335   if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
48336       Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
48337     SDValue Cond0 = Cond.getOperand(0);
48338     if (Cond0.getOpcode() == ISD::TRUNCATE)
48339       Cond0 = Cond0.getOperand(0);
48340     auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
48341     if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
48342       EVT CondVT = Cond->getValueType(0);
48343       EVT OuterVT = N->getValueType(0);
48344       // Subtract 1 and generate a carry.
48345       SDValue NewSub =
48346           DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
48347                       DAG.getConstant(1, DL, CondVT));
48348       SDValue EFLAGS(NewSub.getNode(), 1);
48349       return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
48350                          TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
48351     }
48352   }
48353
48354   // Fold and/or of setcc's to double CMOV:
48355   //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
48356   //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
48357   //
48358   // This combine lets us generate:
48359   //   cmovcc1 (jcc1 if we don't have CMOV)
48360   //   cmovcc2 (same)
48361   // instead of:
48362   //   setcc1
48363   //   setcc2
48364   //   and/or
48365   //   cmovne (jne if we don't have CMOV)
48366   // When we can't use the CMOV instruction, it might increase branch
48367   // mispredicts.
48368   // When we can use CMOV, or when there is no mispredict, this improves
48369   // throughput and reduces register pressure.
48370   //
48371   if (CC == X86::COND_NE) {
48372     SDValue Flags;
48373     X86::CondCode CC0, CC1;
48374     bool isAndSetCC;
48375     if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
48376       if (isAndSetCC) {
48377         std::swap(FalseOp, TrueOp);
48378         CC0 = X86::GetOppositeBranchCondition(CC0);
48379         CC1 = X86::GetOppositeBranchCondition(CC1);
48380       }
48381
48382       SDValue LOps[] = {FalseOp, TrueOp,
48383                         DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
48384       SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
48385       SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
48386                        Flags};
48387       SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
48388       return CMOV;
48389     }
48390   }
48391
48392   // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
48393   //      (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
48394   // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
48395   //    (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
48396   if ((CC == X86::COND_NE || CC == X86::COND_E) &&
48397       Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
48398     SDValue Add = TrueOp;
48399     SDValue Const = FalseOp;
48400     // Canonicalize the condition code for easier matching and output.
48401     if (CC == X86::COND_E)
48402       std::swap(Add, Const);
48403
48404     // We might have replaced the constant in the cmov with the LHS of the
48405     // compare. If so change it to the RHS of the compare.
48406     if (Const == Cond.getOperand(0))
48407       Const = Cond.getOperand(1);
48408
48409     // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
48410     if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
48411         Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
48412         (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
48413          Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
48414         Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
48415       EVT VT = N->getValueType(0);
48416       // This should constant fold.
48417       SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
48418       SDValue CMov =
48419           DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
48420                       DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
48421       return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
48422     }
48423   }
48424
48425   return SDValue();
48426 }
48427
/// Ways a vXi32 multiply can be narrowed, named after the signedness and the
/// element width its operands are known to fit in.
enum class ShrinkMode {
  MULS8,  ///< Both operands fit in a signed 8-bit range.
  MULU8,  ///< Both operands fit in an unsigned 8-bit range.
  MULS16, ///< Both operands fit in a signed 16-bit range.
  MULU16  ///< Both operands fit in an unsigned 16-bit range.
};
48430
48431 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
48432   EVT VT = N->getOperand(0).getValueType();
48433   if (VT.getScalarSizeInBits() != 32)
48434     return false;
48435
48436   assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
48437   unsigned SignBits[2] = {1, 1};
48438   bool IsPositive[2] = {false, false};
48439   for (unsigned i = 0; i < 2; i++) {
48440     SDValue Opd = N->getOperand(i);
48441
48442     SignBits[i] = DAG.ComputeNumSignBits(Opd);
48443     IsPositive[i] = DAG.SignBitIsZero(Opd);
48444   }
48445
48446   bool AllPositive = IsPositive[0] && IsPositive[1];
48447   unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
48448   // When ranges are from -128 ~ 127, use MULS8 mode.
48449   if (MinSignBits >= 25)
48450     Mode = ShrinkMode::MULS8;
48451   // When ranges are from 0 ~ 255, use MULU8 mode.
48452   else if (AllPositive && MinSignBits >= 24)
48453     Mode = ShrinkMode::MULU8;
48454   // When ranges are from -32768 ~ 32767, use MULS16 mode.
48455   else if (MinSignBits >= 17)
48456     Mode = ShrinkMode::MULS16;
48457   // When ranges are from 0 ~ 65535, use MULU16 mode.
48458   else if (AllPositive && MinSignBits >= 16)
48459     Mode = ShrinkMode::MULU16;
48460   else
48461     return false;
48462   return true;
48463 }
48464
48465 /// When the operands of vector mul are extended from smaller size values,
48466 /// like i8 and i16, the type of mul may be shrinked to generate more
48467 /// efficient code. Two typical patterns are handled:
48468 /// Pattern1:
48469 ///     %2 = sext/zext <N x i8> %1 to <N x i32>
48470 ///     %4 = sext/zext <N x i8> %3 to <N x i32>
48471 //   or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
48472 ///     %5 = mul <N x i32> %2, %4
48473 ///
48474 /// Pattern2:
48475 ///     %2 = zext/sext <N x i16> %1 to <N x i32>
48476 ///     %4 = zext/sext <N x i16> %3 to <N x i32>
48477 ///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
48478 ///     %5 = mul <N x i32> %2, %4
48479 ///
48480 /// There are four mul shrinking modes:
48481 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
48482 /// -128 to 128, and the scalar value range of %4 is also -128 to 128,
48483 /// generate pmullw+sext32 for it (MULS8 mode).
48484 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
48485 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
48486 /// generate pmullw+zext32 for it (MULU8 mode).
48487 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
48488 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
48489 /// generate pmullw+pmulhw for it (MULS16 mode).
48490 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
48491 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
48492 /// generate pmullw+pmulhuw for it (MULU16 mode).
48493 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
48494                                const X86Subtarget &Subtarget) {
48495   // Check for legality
48496   // pmullw/pmulhw are not supported by SSE.
48497   if (!Subtarget.hasSSE2())
48498     return SDValue();
48499
48500   // Check for profitability
48501   // pmulld is supported since SSE41. It is better to use pmulld
48502   // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
48503   // the expansion.
48504   bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
48505   if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
48506     return SDValue();
48507
48508   ShrinkMode Mode;
48509   if (!canReduceVMulWidth(N, DAG, Mode))
48510     return SDValue();
48511
48512   SDLoc DL(N);
48513   SDValue N0 = N->getOperand(0);
48514   SDValue N1 = N->getOperand(1);
48515   EVT VT = N->getOperand(0).getValueType();
48516   unsigned NumElts = VT.getVectorNumElements();
48517   if ((NumElts % 2) != 0)
48518     return SDValue();
48519
48520   EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
48521
48522   // Shrink the operands of mul.
48523   SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
48524   SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
48525
48526   // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
48527   // lower part is needed.
48528   SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
48529   if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
48530     return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
48531                                                    : ISD::SIGN_EXTEND,
48532                        DL, VT, MulLo);
48533
48534   EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
48535   // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
48536   // the higher part is also needed.
48537   SDValue MulHi =
48538       DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
48539                   ReducedVT, NewN0, NewN1);
48540
48541   // Repack the lower part and higher part result of mul into a wider
48542   // result.
48543   // Generate shuffle functioning as punpcklwd.
48544   SmallVector<int, 16> ShuffleMask(NumElts);
48545   for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48546     ShuffleMask[2 * i] = i;
48547     ShuffleMask[2 * i + 1] = i + NumElts;
48548   }
48549   SDValue ResLo =
48550       DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48551   ResLo = DAG.getBitcast(ResVT, ResLo);
48552   // Generate shuffle functioning as punpckhwd.
48553   for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48554     ShuffleMask[2 * i] = i + NumElts / 2;
48555     ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
48556   }
48557   SDValue ResHi =
48558       DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48559   ResHi = DAG.getBitcast(ResVT, ResHi);
48560   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
48561 }
48562
48563 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
48564                                  EVT VT, const SDLoc &DL) {
48565
48566   auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
48567     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48568                                  DAG.getConstant(Mult, DL, VT));
48569     Result = DAG.getNode(ISD::SHL, DL, VT, Result,
48570                          DAG.getConstant(Shift, DL, MVT::i8));
48571     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48572                          N->getOperand(0));
48573     return Result;
48574   };
48575
48576   auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
48577     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48578                                  DAG.getConstant(Mul1, DL, VT));
48579     Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
48580                          DAG.getConstant(Mul2, DL, VT));
48581     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48582                          N->getOperand(0));
48583     return Result;
48584   };
48585
48586   switch (MulAmt) {
48587   default:
48588     break;
48589   case 11:
48590     // mul x, 11 => add ((shl (mul x, 5), 1), x)
48591     return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
48592   case 21:
48593     // mul x, 21 => add ((shl (mul x, 5), 2), x)
48594     return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
48595   case 41:
48596     // mul x, 41 => add ((shl (mul x, 5), 3), x)
48597     return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
48598   case 22:
48599     // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
48600     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48601                        combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
48602   case 19:
48603     // mul x, 19 => add ((shl (mul x, 9), 1), x)
48604     return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
48605   case 37:
48606     // mul x, 37 => add ((shl (mul x, 9), 2), x)
48607     return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
48608   case 73:
48609     // mul x, 73 => add ((shl (mul x, 9), 3), x)
48610     return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
48611   case 13:
48612     // mul x, 13 => add ((shl (mul x, 3), 2), x)
48613     return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
48614   case 23:
48615     // mul x, 23 => sub ((shl (mul x, 3), 3), x)
48616     return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
48617   case 26:
48618     // mul x, 26 => add ((mul (mul x, 5), 5), x)
48619     return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
48620   case 28:
48621     // mul x, 28 => add ((mul (mul x, 9), 3), x)
48622     return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
48623   case 29:
48624     // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
48625     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48626                        combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
48627   }
48628
48629   // Another trick. If this is a power 2 + 2/4/8, we can use a shift followed
48630   // by a single LEA.
48631   // First check if this a sum of two power of 2s because that's easy. Then
48632   // count how many zeros are up to the first bit.
48633   // TODO: We can do this even without LEA at a cost of two shifts and an add.
48634   if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
48635     unsigned ScaleShift = llvm::countr_zero(MulAmt);
48636     if (ScaleShift >= 1 && ScaleShift < 4) {
48637       unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
48638       SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48639                                    DAG.getConstant(ShiftAmt, DL, MVT::i8));
48640       SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48641                                    DAG.getConstant(ScaleShift, DL, MVT::i8));
48642       return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
48643     }
48644   }
48645
48646   return SDValue();
48647 }
48648
48649 // If the upper 17 bits of either element are zero and the other element are
48650 // zero/sign bits then we can use PMADDWD, which is always at least as quick as
48651 // PMULLD, except on KNL.
48652 static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
48653                                    const X86Subtarget &Subtarget) {
48654   if (!Subtarget.hasSSE2())
48655     return SDValue();
48656
48657   if (Subtarget.isPMADDWDSlow())
48658     return SDValue();
48659
48660   EVT VT = N->getValueType(0);
48661
48662   // Only support vXi32 vectors.
48663   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
48664     return SDValue();
48665
48666   // Make sure the type is legal or can split/widen to a legal type.
48667   // With AVX512 but without BWI, we would need to split v32i16.
48668   unsigned NumElts = VT.getVectorNumElements();
48669   if (NumElts == 1 || !isPowerOf2_32(NumElts))
48670     return SDValue();
48671
48672   // With AVX512 but without BWI, we would need to split v32i16.
48673   if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
48674     return SDValue();
48675
48676   SDValue N0 = N->getOperand(0);
48677   SDValue N1 = N->getOperand(1);
48678
48679   // If we are zero/sign extending two steps without SSE4.1, its better to
48680   // reduce the vmul width instead.
48681   if (!Subtarget.hasSSE41() &&
48682       (((N0.getOpcode() == ISD::ZERO_EXTEND &&
48683          N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48684         (N1.getOpcode() == ISD::ZERO_EXTEND &&
48685          N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
48686        ((N0.getOpcode() == ISD::SIGN_EXTEND &&
48687          N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48688         (N1.getOpcode() == ISD::SIGN_EXTEND &&
48689          N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
48690     return SDValue();
48691
48692   // If we are sign extending a wide vector without SSE4.1, its better to reduce
48693   // the vmul width instead.
48694   if (!Subtarget.hasSSE41() &&
48695       (N0.getOpcode() == ISD::SIGN_EXTEND &&
48696        N0.getOperand(0).getValueSizeInBits() > 128) &&
48697       (N1.getOpcode() == ISD::SIGN_EXTEND &&
48698        N1.getOperand(0).getValueSizeInBits() > 128))
48699     return SDValue();
48700
48701   // Sign bits must extend down to the lowest i16.
48702   if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
48703       DAG.ComputeMaxSignificantBits(N0) > 16)
48704     return SDValue();
48705
48706   // At least one of the elements must be zero in the upper 17 bits, or can be
48707   // safely made zero without altering the final result.
48708   auto GetZeroableOp = [&](SDValue Op) {
48709     APInt Mask17 = APInt::getHighBitsSet(32, 17);
48710     if (DAG.MaskedValueIsZero(Op, Mask17))
48711       return Op;
48712     // Mask off upper 16-bits of sign-extended constants.
48713     if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
48714       return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
48715                          DAG.getConstant(0xFFFF, SDLoc(N), VT));
48716     if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
48717       SDValue Src = Op.getOperand(0);
48718       // Convert sext(vXi16) to zext(vXi16).
48719       if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
48720         return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48721       // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
48722       // which will expand the extension.
48723       if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
48724         EVT ExtVT = VT.changeVectorElementType(MVT::i16);
48725         Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
48726         return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48727       }
48728     }
48729     // Convert SIGN_EXTEND_VECTOR_INREG to ZEXT_EXTEND_VECTOR_INREG.
48730     if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
48731         N->isOnlyUserOf(Op.getNode())) {
48732       SDValue Src = Op.getOperand(0);
48733       if (Src.getScalarValueSizeInBits() == 16)
48734         return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
48735     }
48736     // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
48737     if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
48738         N->isOnlyUserOf(Op.getNode())) {
48739       return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
48740                          Op.getOperand(1));
48741     }
48742     return SDValue();
48743   };
48744   SDValue ZeroN0 = GetZeroableOp(N0);
48745   SDValue ZeroN1 = GetZeroableOp(N1);
48746   if (!ZeroN0 && !ZeroN1)
48747     return SDValue();
48748   N0 = ZeroN0 ? ZeroN0 : N0;
48749   N1 = ZeroN1 ? ZeroN1 : N1;
48750
48751   // Use SplitOpsAndApply to handle AVX splitting.
48752   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48753                            ArrayRef<SDValue> Ops) {
48754     MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
48755     MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
48756     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
48757                        DAG.getBitcast(OpVT, Ops[0]),
48758                        DAG.getBitcast(OpVT, Ops[1]));
48759   };
48760   return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
48761                           PMADDWDBuilder);
48762 }
48763
48764 static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
48765                                   const X86Subtarget &Subtarget) {
48766   if (!Subtarget.hasSSE2())
48767     return SDValue();
48768
48769   EVT VT = N->getValueType(0);
48770
48771   // Only support vXi64 vectors.
48772   if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
48773       VT.getVectorNumElements() < 2 ||
48774       !isPowerOf2_32(VT.getVectorNumElements()))
48775     return SDValue();
48776
48777   SDValue N0 = N->getOperand(0);
48778   SDValue N1 = N->getOperand(1);
48779
48780   // MULDQ returns the 64-bit result of the signed multiplication of the lower
48781   // 32-bits. We can lower with this if the sign bits stretch that far.
48782   if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
48783       DAG.ComputeNumSignBits(N1) > 32) {
48784     auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48785                             ArrayRef<SDValue> Ops) {
48786       return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
48787     };
48788     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48789                             PMULDQBuilder, /*CheckBWI*/false);
48790   }
48791
48792   // If the upper bits are zero we can use a single pmuludq.
48793   APInt Mask = APInt::getHighBitsSet(64, 32);
48794   if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
48795     auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48796                              ArrayRef<SDValue> Ops) {
48797       return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
48798     };
48799     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48800                             PMULUDQBuilder, /*CheckBWI*/false);
48801   }
48802
48803   return SDValue();
48804 }
48805
48806 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
48807                           TargetLowering::DAGCombinerInfo &DCI,
48808                           const X86Subtarget &Subtarget) {
48809   EVT VT = N->getValueType(0);
48810
48811   if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
48812     return V;
48813
48814   if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
48815     return V;
48816
48817   if (DCI.isBeforeLegalize() && VT.isVector())
48818     return reduceVMULWidth(N, DAG, Subtarget);
48819
48820   // Optimize a single multiply with constant into two operations in order to
48821   // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
48822   if (!MulConstantOptimization)
48823     return SDValue();
48824
48825   // An imul is usually smaller than the alternative sequence.
48826   if (DAG.getMachineFunction().getFunction().hasMinSize())
48827     return SDValue();
48828
48829   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
48830     return SDValue();
48831
48832   if (VT != MVT::i64 && VT != MVT::i32 &&
48833       (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
48834     return SDValue();
48835
48836   ConstantSDNode *CNode = isConstOrConstSplat(
48837       N->getOperand(1), /*AllowUndefs*/ true, /*AllowTrunc*/ false);
48838   const APInt *C = nullptr;
48839   if (!CNode) {
48840     if (VT.isVector())
48841       if (auto *RawC = getTargetConstantFromNode(N->getOperand(1)))
48842         if (auto *SplatC = RawC->getSplatValue())
48843           C = &(SplatC->getUniqueInteger());
48844
48845     if (!C || C->getBitWidth() != VT.getScalarSizeInBits())
48846       return SDValue();
48847   } else {
48848     C = &(CNode->getAPIntValue());
48849   }
48850
48851   if (isPowerOf2_64(C->getZExtValue()))
48852     return SDValue();
48853
48854   int64_t SignMulAmt = C->getSExtValue();
48855   assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
48856   uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
48857
48858   SDLoc DL(N);
48859   SDValue NewMul = SDValue();
48860   if (VT == MVT::i64 || VT == MVT::i32) {
48861     if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
48862       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48863                            DAG.getConstant(AbsMulAmt, DL, VT));
48864       if (SignMulAmt < 0)
48865         NewMul =
48866             DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
48867
48868       return NewMul;
48869     }
48870
48871     uint64_t MulAmt1 = 0;
48872     uint64_t MulAmt2 = 0;
48873     if ((AbsMulAmt % 9) == 0) {
48874       MulAmt1 = 9;
48875       MulAmt2 = AbsMulAmt / 9;
48876     } else if ((AbsMulAmt % 5) == 0) {
48877       MulAmt1 = 5;
48878       MulAmt2 = AbsMulAmt / 5;
48879     } else if ((AbsMulAmt % 3) == 0) {
48880       MulAmt1 = 3;
48881       MulAmt2 = AbsMulAmt / 3;
48882     }
48883
48884     // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
48885     if (MulAmt2 &&
48886         (isPowerOf2_64(MulAmt2) ||
48887          (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
48888
48889       if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
48890                                       N->use_begin()->getOpcode() == ISD::ADD))
48891         // If second multiplifer is pow2, issue it first. We want the multiply
48892         // by 3, 5, or 9 to be folded into the addressing mode unless the lone
48893         // use is an add. Only do this for positive multiply amounts since the
48894         // negate would prevent it from being used as an address mode anyway.
48895         std::swap(MulAmt1, MulAmt2);
48896
48897       if (isPowerOf2_64(MulAmt1))
48898         NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48899                              DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
48900       else
48901         NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48902                              DAG.getConstant(MulAmt1, DL, VT));
48903
48904       if (isPowerOf2_64(MulAmt2))
48905         NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
48906                              DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
48907       else
48908         NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
48909                              DAG.getConstant(MulAmt2, DL, VT));
48910
48911       // Negate the result.
48912       if (SignMulAmt < 0)
48913         NewMul =
48914             DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
48915     } else if (!Subtarget.slowLEA())
48916       NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
48917   }
48918   if (!NewMul) {
48919     EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
48920     assert(C->getZExtValue() != 0 &&
48921            C->getZExtValue() != maxUIntN(VT.getScalarSizeInBits()) &&
48922            "Both cases that could cause potential overflows should have "
48923            "already been handled.");
48924     if (isPowerOf2_64(AbsMulAmt - 1)) {
48925       // (mul x, 2^N + 1) => (add (shl x, N), x)
48926       NewMul = DAG.getNode(
48927           ISD::ADD, DL, VT, N->getOperand(0),
48928           DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48929                       DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
48930       // To negate, subtract the number from zero
48931       if (SignMulAmt < 0)
48932         NewMul =
48933             DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
48934     } else if (isPowerOf2_64(AbsMulAmt + 1)) {
48935       // (mul x, 2^N - 1) => (sub (shl x, N), x)
48936       NewMul =
48937           DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48938                       DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
48939       // To negate, reverse the operands of the subtract.
48940       if (SignMulAmt < 0)
48941         NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
48942       else
48943         NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
48944     } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
48945                (!VT.isVector() || Subtarget.fastImmVectorShift())) {
48946       // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
48947       NewMul =
48948           DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48949                       DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
48950       NewMul = DAG.getNode(
48951           ISD::ADD, DL, VT, NewMul,
48952           DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48953     } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
48954                (!VT.isVector() || Subtarget.fastImmVectorShift())) {
48955       // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
48956       NewMul =
48957           DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48958                       DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
48959       NewMul = DAG.getNode(
48960           ISD::SUB, DL, VT, NewMul,
48961           DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48962     } else if (SignMulAmt >= 0 && VT.isVector() &&
48963                Subtarget.fastImmVectorShift()) {
48964       uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
48965       uint64_t ShiftAmt1;
48966       std::optional<unsigned> Opc;
48967       if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
48968         ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
48969         Opc = ISD::ADD;
48970       } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
48971         ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
48972         Opc = ISD::SUB;
48973       }
48974
48975       if (Opc) {
48976         SDValue Shift1 =
48977             DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48978                         DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
48979         SDValue Shift2 =
48980             DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48981                         DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
48982         NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
48983       }
48984     }
48985   }
48986
48987   return NewMul;
48988 }
48989
48990 // Try to form a MULHU or MULHS node by looking for
48991 // (srl (mul ext, ext), 16)
48992 // TODO: This is X86 specific because we want to be able to handle wide types
48993 // before type legalization. But we can only do it if the vector will be
48994 // legalized via widening/splitting. Type legalization can't handle promotion
48995 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
48996 // combiner.
48997 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
48998                                    const X86Subtarget &Subtarget) {
48999   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49000            "SRL or SRA node is required here!");
49001   SDLoc DL(N);
49002
49003   if (!Subtarget.hasSSE2())
49004     return SDValue();
49005
49006   // The operation feeding into the shift must be a multiply.
49007   SDValue ShiftOperand = N->getOperand(0);
49008   if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
49009     return SDValue();
49010
49011   // Input type should be at least vXi32.
49012   EVT VT = N->getValueType(0);
49013   if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49014     return SDValue();
49015
49016   // Need a shift by 16.
49017   APInt ShiftAmt;
49018   if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
49019       ShiftAmt != 16)
49020     return SDValue();
49021
49022   SDValue LHS = ShiftOperand.getOperand(0);
49023   SDValue RHS = ShiftOperand.getOperand(1);
49024
49025   unsigned ExtOpc = LHS.getOpcode();
49026   if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49027       RHS.getOpcode() != ExtOpc)
49028     return SDValue();
49029
49030   // Peek through the extends.
49031   LHS = LHS.getOperand(0);
49032   RHS = RHS.getOperand(0);
49033
49034   // Ensure the input types match.
49035   EVT MulVT = LHS.getValueType();
49036   if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49037     return SDValue();
49038
49039   unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49040   SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
49041
49042   ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49043   return DAG.getNode(ExtOpc, DL, VT, Mulh);
49044 }
49045
49046 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
49047   SDValue N0 = N->getOperand(0);
49048   SDValue N1 = N->getOperand(1);
49049   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
49050   EVT VT = N0.getValueType();
49051
49052   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
49053   // since the result of setcc_c is all zero's or all ones.
49054   if (VT.isInteger() && !VT.isVector() &&
49055       N1C && N0.getOpcode() == ISD::AND &&
49056       N0.getOperand(1).getOpcode() == ISD::Constant) {
49057     SDValue N00 = N0.getOperand(0);
49058     APInt Mask = N0.getConstantOperandAPInt(1);
49059     Mask <<= N1C->getAPIntValue();
49060     bool MaskOK = false;
49061     // We can handle cases concerning bit-widening nodes containing setcc_c if
49062     // we carefully interrogate the mask to make sure we are semantics
49063     // preserving.
49064     // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
49065     // of the underlying setcc_c operation if the setcc_c was zero extended.
49066     // Consider the following example:
49067     //   zext(setcc_c)                 -> i32 0x0000FFFF
49068     //   c1                            -> i32 0x0000FFFF
49069     //   c2                            -> i32 0x00000001
49070     //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
49071     //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
49072     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
49073       MaskOK = true;
49074     } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
49075                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
49076       MaskOK = true;
49077     } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
49078                 N00.getOpcode() == ISD::ANY_EXTEND) &&
49079                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
49080       MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
49081     }
49082     if (MaskOK && Mask != 0) {
49083       SDLoc DL(N);
49084       return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
49085     }
49086   }
49087
49088   return SDValue();
49089 }
49090
49091 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
49092                                            const X86Subtarget &Subtarget) {
49093   SDValue N0 = N->getOperand(0);
49094   SDValue N1 = N->getOperand(1);
49095   EVT VT = N0.getValueType();
49096   unsigned Size = VT.getSizeInBits();
49097
49098   if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
49099     return V;
49100
49101   // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
49102   // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
49103   // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
49104   // depending on sign of (SarConst - [56,48,32,24,16])
49105
49106   // sexts in X86 are MOVs. The MOVs have the same code size
49107   // as above SHIFTs (only SHIFT on 1 has lower code size).
49108   // However the MOVs have 2 advantages to a SHIFT:
49109   // 1. MOVs can write to a register that differs from source
49110   // 2. MOVs accept memory operands
49111
49112   if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
49113       N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
49114       N0.getOperand(1).getOpcode() != ISD::Constant)
49115     return SDValue();
49116
49117   SDValue N00 = N0.getOperand(0);
49118   SDValue N01 = N0.getOperand(1);
49119   APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
49120   APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
49121   EVT CVT = N1.getValueType();
49122
49123   if (SarConst.isNegative())
49124     return SDValue();
49125
49126   for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
49127     unsigned ShiftSize = SVT.getSizeInBits();
49128     // skipping types without corresponding sext/zext and
49129     // ShlConst that is not one of [56,48,32,24,16]
49130     if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
49131       continue;
49132     SDLoc DL(N);
49133     SDValue NN =
49134         DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
49135     SarConst = SarConst - (Size - ShiftSize);
49136     if (SarConst == 0)
49137       return NN;
49138     if (SarConst.isNegative())
49139       return DAG.getNode(ISD::SHL, DL, VT, NN,
49140                          DAG.getConstant(-SarConst, DL, CVT));
49141     return DAG.getNode(ISD::SRA, DL, VT, NN,
49142                        DAG.getConstant(SarConst, DL, CVT));
49143   }
49144   return SDValue();
49145 }
49146
49147 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
49148                                         TargetLowering::DAGCombinerInfo &DCI,
49149                                         const X86Subtarget &Subtarget) {
49150   SDValue N0 = N->getOperand(0);
49151   SDValue N1 = N->getOperand(1);
49152   EVT VT = N0.getValueType();
49153
49154   if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
49155     return V;
49156
49157   // Only do this on the last DAG combine as it can interfere with other
49158   // combines.
49159   if (!DCI.isAfterLegalizeDAG())
49160     return SDValue();
49161
49162   // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
49163   // TODO: This is a generic DAG combine that became an x86-only combine to
49164   // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
49165   // and-not ('andn').
49166   if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
49167     return SDValue();
49168
49169   auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
49170   auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
49171   if (!ShiftC || !AndC)
49172     return SDValue();
49173
49174   // If we can shrink the constant mask below 8-bits or 32-bits, then this
49175   // transform should reduce code size. It may also enable secondary transforms
49176   // from improved known-bits analysis or instruction selection.
49177   APInt MaskVal = AndC->getAPIntValue();
49178
49179   // If this can be matched by a zero extend, don't optimize.
49180   if (MaskVal.isMask()) {
49181     unsigned TO = MaskVal.countr_one();
49182     if (TO >= 8 && isPowerOf2_32(TO))
49183       return SDValue();
49184   }
49185
49186   APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
49187   unsigned OldMaskSize = MaskVal.getSignificantBits();
49188   unsigned NewMaskSize = NewMaskVal.getSignificantBits();
49189   if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
49190       (OldMaskSize > 32 && NewMaskSize <= 32)) {
49191     // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
49192     SDLoc DL(N);
49193     SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
49194     SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
49195     return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
49196   }
49197   return SDValue();
49198 }
49199
49200 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
49201                                          const X86Subtarget &Subtarget) {
49202   unsigned Opcode = N->getOpcode();
49203   assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
49204
49205   SDLoc DL(N);
49206   EVT VT = N->getValueType(0);
49207   SDValue N0 = N->getOperand(0);
49208   SDValue N1 = N->getOperand(1);
49209   EVT SrcVT = N0.getValueType();
49210
49211   SDValue BC0 =
49212       N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
49213   SDValue BC1 =
49214       N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
49215
49216   // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
49217   // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
49218   // truncation trees that help us avoid lane crossing shuffles.
49219   // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
49220   // TODO: We don't handle vXf64 shuffles yet.
49221   if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
49222     if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
49223       SmallVector<SDValue> ShuffleOps;
49224       SmallVector<int> ShuffleMask, ScaledMask;
49225       SDValue Vec = peekThroughBitcasts(BCSrc);
49226       if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
49227         resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
49228         // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
49229         // shuffle to a v4X64 width - we can probably relax this in the future.
49230         if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
49231             ShuffleOps[0].getValueType().is256BitVector() &&
49232             scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
49233           SDValue Lo, Hi;
49234           MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
49235           std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
49236           Lo = DAG.getBitcast(SrcVT, Lo);
49237           Hi = DAG.getBitcast(SrcVT, Hi);
49238           SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
49239           Res = DAG.getBitcast(ShufVT, Res);
49240           Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
49241           return DAG.getBitcast(VT, Res);
49242         }
49243       }
49244     }
49245   }
49246
49247   // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
49248   if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
49249     // If either/both ops are a shuffle that can scale to v2x64,
49250     // then see if we can perform this as a v4x32 post shuffle.
49251     SmallVector<SDValue> Ops0, Ops1;
49252     SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
49253     bool IsShuf0 =
49254         getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
49255         scaleShuffleElements(Mask0, 2, ScaledMask0) &&
49256         all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
49257     bool IsShuf1 =
49258         getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
49259         scaleShuffleElements(Mask1, 2, ScaledMask1) &&
49260         all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
49261     if (IsShuf0 || IsShuf1) {
49262       if (!IsShuf0) {
49263         Ops0.assign({BC0});
49264         ScaledMask0.assign({0, 1});
49265       }
49266       if (!IsShuf1) {
49267         Ops1.assign({BC1});
49268         ScaledMask1.assign({0, 1});
49269       }
49270
49271       SDValue LHS, RHS;
49272       int PostShuffle[4] = {-1, -1, -1, -1};
49273       auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
49274         if (M < 0)
49275           return true;
49276         Idx = M % 2;
49277         SDValue Src = Ops[M / 2];
49278         if (!LHS || LHS == Src) {
49279           LHS = Src;
49280           return true;
49281         }
49282         if (!RHS || RHS == Src) {
49283           Idx += 2;
49284           RHS = Src;
49285           return true;
49286         }
49287         return false;
49288       };
49289       if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
49290           FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
49291           FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
49292           FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
49293         LHS = DAG.getBitcast(SrcVT, LHS);
49294         RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
49295         MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
49296         SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
49297         Res = DAG.getBitcast(ShufVT, Res);
49298         Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
49299         return DAG.getBitcast(VT, Res);
49300       }
49301     }
49302   }
49303
49304   // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
49305   if (VT.is256BitVector() && Subtarget.hasInt256()) {
49306     SmallVector<int> Mask0, Mask1;
49307     SmallVector<SDValue> Ops0, Ops1;
49308     SmallVector<int, 2> ScaledMask0, ScaledMask1;
49309     if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
49310         getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
49311         !Ops0.empty() && !Ops1.empty() &&
49312         all_of(Ops0,
49313                [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
49314         all_of(Ops1,
49315                [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
49316         scaleShuffleElements(Mask0, 2, ScaledMask0) &&
49317         scaleShuffleElements(Mask1, 2, ScaledMask1)) {
49318       SDValue Op00 = peekThroughBitcasts(Ops0.front());
49319       SDValue Op10 = peekThroughBitcasts(Ops1.front());
49320       SDValue Op01 = peekThroughBitcasts(Ops0.back());
49321       SDValue Op11 = peekThroughBitcasts(Ops1.back());
49322       if ((Op00 == Op11) && (Op01 == Op10)) {
49323         std::swap(Op10, Op11);
49324         ShuffleVectorSDNode::commuteMask(ScaledMask1);
49325       }
49326       if ((Op00 == Op10) && (Op01 == Op11)) {
49327         const int Map[4] = {0, 2, 1, 3};
49328         SmallVector<int, 4> ShuffleMask(
49329             {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
49330              Map[ScaledMask1[1]]});
49331         MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
49332         SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
49333                                   DAG.getBitcast(SrcVT, Op01));
49334         Res = DAG.getBitcast(ShufVT, Res);
49335         Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
49336         return DAG.getBitcast(VT, Res);
49337       }
49338     }
49339   }
49340
49341   return SDValue();
49342 }
49343
49344 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
49345                                  TargetLowering::DAGCombinerInfo &DCI,
49346                                  const X86Subtarget &Subtarget) {
49347   unsigned Opcode = N->getOpcode();
49348   assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
49349          "Unexpected pack opcode");
49350
49351   EVT VT = N->getValueType(0);
49352   SDValue N0 = N->getOperand(0);
49353   SDValue N1 = N->getOperand(1);
49354   unsigned NumDstElts = VT.getVectorNumElements();
49355   unsigned DstBitsPerElt = VT.getScalarSizeInBits();
49356   unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
49357   assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
49358          N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
49359          "Unexpected PACKSS/PACKUS input type");
49360
49361   bool IsSigned = (X86ISD::PACKSS == Opcode);
49362
49363   // Constant Folding.
49364   APInt UndefElts0, UndefElts1;
49365   SmallVector<APInt, 32> EltBits0, EltBits1;
49366   if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
49367       (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
49368       getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
49369       getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
49370     unsigned NumLanes = VT.getSizeInBits() / 128;
49371     unsigned NumSrcElts = NumDstElts / 2;
49372     unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
49373     unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
49374
49375     APInt Undefs(NumDstElts, 0);
49376     SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
49377     for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
49378       for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
49379         unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
49380         auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
49381         auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
49382
49383         if (UndefElts[SrcIdx]) {
49384           Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
49385           continue;
49386         }
49387
49388         APInt &Val = EltBits[SrcIdx];
49389         if (IsSigned) {
49390           // PACKSS: Truncate signed value with signed saturation.
49391           // Source values less than dst minint are saturated to minint.
49392           // Source values greater than dst maxint are saturated to maxint.
49393           if (Val.isSignedIntN(DstBitsPerElt))
49394             Val = Val.trunc(DstBitsPerElt);
49395           else if (Val.isNegative())
49396             Val = APInt::getSignedMinValue(DstBitsPerElt);
49397           else
49398             Val = APInt::getSignedMaxValue(DstBitsPerElt);
49399         } else {
49400           // PACKUS: Truncate signed value with unsigned saturation.
49401           // Source values less than zero are saturated to zero.
49402           // Source values greater than dst maxuint are saturated to maxuint.
49403           if (Val.isIntN(DstBitsPerElt))
49404             Val = Val.trunc(DstBitsPerElt);
49405           else if (Val.isNegative())
49406             Val = APInt::getZero(DstBitsPerElt);
49407           else
49408             Val = APInt::getAllOnes(DstBitsPerElt);
49409         }
49410         Bits[Lane * NumDstEltsPerLane + Elt] = Val;
49411       }
49412     }
49413
49414     return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
49415   }
49416
49417   // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
49418   if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49419     return V;
49420
49421   // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
49422   // truncate to create a larger truncate.
49423   if (Subtarget.hasAVX512() &&
49424       N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
49425       N0.getOperand(0).getValueType() == MVT::v8i32) {
49426     if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
49427         (!IsSigned &&
49428          DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
49429       if (Subtarget.hasVLX())
49430         return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
49431
49432       // Widen input to v16i32 so we can truncate that.
49433       SDLoc dl(N);
49434       SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
49435                                    N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
49436       return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
49437     }
49438   }
49439
49440   // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
49441   if (VT.is128BitVector()) {
49442     unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49443     SDValue Src0, Src1;
49444     if (N0.getOpcode() == ExtOpc &&
49445         N0.getOperand(0).getValueType().is64BitVector() &&
49446         N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
49447       Src0 = N0.getOperand(0);
49448     }
49449     if (N1.getOpcode() == ExtOpc &&
49450         N1.getOperand(0).getValueType().is64BitVector() &&
49451         N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
49452       Src1 = N1.getOperand(0);
49453     }
49454     if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
49455       assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
49456       Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
49457       Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
49458       return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
49459     }
49460
49461     // Try again with pack(*_extend_vector_inreg, undef).
49462     unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
49463                                     : ISD::ZERO_EXTEND_VECTOR_INREG;
49464     if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
49465         N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
49466       return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
49467                                     DAG);
49468   }
49469
49470   // Attempt to combine as shuffle.
49471   SDValue Op(N, 0);
49472   if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49473     return Res;
49474
49475   return SDValue();
49476 }
49477
49478 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
49479                                     TargetLowering::DAGCombinerInfo &DCI,
49480                                     const X86Subtarget &Subtarget) {
49481   assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
49482           X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
49483          "Unexpected horizontal add/sub opcode");
49484
49485   if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
49486     MVT VT = N->getSimpleValueType(0);
49487     SDValue LHS = N->getOperand(0);
49488     SDValue RHS = N->getOperand(1);
49489
49490     // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
49491     if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
49492         LHS.getOpcode() == RHS.getOpcode() &&
49493         LHS.getValueType() == RHS.getValueType() &&
49494         N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
49495       SDValue LHS0 = LHS.getOperand(0);
49496       SDValue LHS1 = LHS.getOperand(1);
49497       SDValue RHS0 = RHS.getOperand(0);
49498       SDValue RHS1 = RHS.getOperand(1);
49499       if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
49500           (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
49501         SDLoc DL(N);
49502         SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
49503                                   LHS0.isUndef() ? LHS1 : LHS0,
49504                                   RHS0.isUndef() ? RHS1 : RHS0);
49505         MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
49506         Res = DAG.getBitcast(ShufVT, Res);
49507         SDValue NewLHS =
49508             DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
49509                         getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
49510         SDValue NewRHS =
49511             DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
49512                         getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
49513         return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
49514                            DAG.getBitcast(VT, NewRHS));
49515       }
49516     }
49517   }
49518
49519   // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
49520   if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49521     return V;
49522
49523   return SDValue();
49524 }
49525
49526 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
49527                                      TargetLowering::DAGCombinerInfo &DCI,
49528                                      const X86Subtarget &Subtarget) {
49529   assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
49530           X86ISD::VSRL == N->getOpcode()) &&
49531          "Unexpected shift opcode");
49532   EVT VT = N->getValueType(0);
49533   SDValue N0 = N->getOperand(0);
49534   SDValue N1 = N->getOperand(1);
49535
49536   // Shift zero -> zero.
49537   if (ISD::isBuildVectorAllZeros(N0.getNode()))
49538     return DAG.getConstant(0, SDLoc(N), VT);
49539
49540   // Detect constant shift amounts.
49541   APInt UndefElts;
49542   SmallVector<APInt, 32> EltBits;
49543   if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
49544     unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
49545     return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
49546                                       EltBits[0].getZExtValue(), DAG);
49547   }
49548
49549   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49550   APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
49551   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
49552     return SDValue(N, 0);
49553
49554   return SDValue();
49555 }
49556
49557 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
49558                                      TargetLowering::DAGCombinerInfo &DCI,
49559                                      const X86Subtarget &Subtarget) {
49560   unsigned Opcode = N->getOpcode();
49561   assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
49562           X86ISD::VSRLI == Opcode) &&
49563          "Unexpected shift opcode");
49564   bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
49565   EVT VT = N->getValueType(0);
49566   SDValue N0 = N->getOperand(0);
49567   SDValue N1 = N->getOperand(1);
49568   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49569   assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
49570          "Unexpected value type");
49571   assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
49572
49573   // (shift undef, X) -> 0
49574   if (N0.isUndef())
49575     return DAG.getConstant(0, SDLoc(N), VT);
49576
49577   // Out of range logical bit shifts are guaranteed to be zero.
49578   // Out of range arithmetic bit shifts splat the sign bit.
49579   unsigned ShiftVal = N->getConstantOperandVal(1);
49580   if (ShiftVal >= NumBitsPerElt) {
49581     if (LogicalShift)
49582       return DAG.getConstant(0, SDLoc(N), VT);
49583     ShiftVal = NumBitsPerElt - 1;
49584   }
49585
49586   // (shift X, 0) -> X
49587   if (!ShiftVal)
49588     return N0;
49589
49590   // (shift 0, C) -> 0
49591   if (ISD::isBuildVectorAllZeros(N0.getNode()))
49592     // N0 is all zeros or undef. We guarantee that the bits shifted into the
49593     // result are all zeros, not undef.
49594     return DAG.getConstant(0, SDLoc(N), VT);
49595
49596   // (VSRAI -1, C) -> -1
49597   if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
49598     // N0 is all ones or undef. We guarantee that the bits shifted into the
49599     // result are all ones, not undef.
49600     return DAG.getConstant(-1, SDLoc(N), VT);
49601
49602   auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
49603     unsigned NewShiftVal = Amt0 + Amt1;
49604     if (NewShiftVal >= NumBitsPerElt) {
49605       // Out of range logical bit shifts are guaranteed to be zero.
49606       // Out of range arithmetic bit shifts splat the sign bit.
49607       if (LogicalShift)
49608         return DAG.getConstant(0, SDLoc(N), VT);
49609       NewShiftVal = NumBitsPerElt - 1;
49610     }
49611     return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
49612                        DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
49613   };
49614
49615   // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
49616   if (Opcode == N0.getOpcode())
49617     return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
49618
49619   // (shl (add X, X), C) -> (shl X, (C + 1))
49620   if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
49621       N0.getOperand(0) == N0.getOperand(1))
49622     return MergeShifts(N0.getOperand(0), ShiftVal, 1);
49623
49624   // We can decode 'whole byte' logical bit shifts as shuffles.
49625   if (LogicalShift && (ShiftVal % 8) == 0) {
49626     SDValue Op(N, 0);
49627     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49628       return Res;
49629   }
49630
49631   auto TryConstantFold = [&](SDValue V) {
49632     APInt UndefElts;
49633     SmallVector<APInt, 32> EltBits;
49634     if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits))
49635       return SDValue();
49636     assert(EltBits.size() == VT.getVectorNumElements() &&
49637            "Unexpected shift value type");
49638     // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
49639     // created an undef input due to no input bits being demanded, but user
49640     // still expects 0 in other bits.
49641     for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
49642       APInt &Elt = EltBits[i];
49643       if (UndefElts[i])
49644         Elt = 0;
49645       else if (X86ISD::VSHLI == Opcode)
49646         Elt <<= ShiftVal;
49647       else if (X86ISD::VSRAI == Opcode)
49648         Elt.ashrInPlace(ShiftVal);
49649       else
49650         Elt.lshrInPlace(ShiftVal);
49651     }
49652     // Reset undef elements since they were zeroed above.
49653     UndefElts = 0;
49654     return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
49655   };
49656
49657   // Constant Folding.
49658   if (N->isOnlyUserOf(N0.getNode())) {
49659     if (SDValue C = TryConstantFold(N0))
49660       return C;
49661
49662     // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
49663     // Don't break NOT patterns.
49664     SDValue BC = peekThroughOneUseBitcasts(N0);
49665     if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
49666         BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
49667         !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
49668       if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
49669         SDLoc DL(N);
49670         SDValue LHS = DAG.getNode(Opcode, DL, VT,
49671                                   DAG.getBitcast(VT, BC.getOperand(0)), N1);
49672         return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
49673       }
49674     }
49675   }
49676
49677   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49678   if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
49679                                DCI))
49680     return SDValue(N, 0);
49681
49682   return SDValue();
49683 }
49684
49685 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
49686                                    TargetLowering::DAGCombinerInfo &DCI,
49687                                    const X86Subtarget &Subtarget) {
49688   EVT VT = N->getValueType(0);
49689   unsigned Opcode = N->getOpcode();
49690   assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
49691           (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
49692           Opcode == ISD::INSERT_VECTOR_ELT) &&
49693          "Unexpected vector insertion");
49694
49695   SDValue Vec = N->getOperand(0);
49696   SDValue Scl = N->getOperand(1);
49697   SDValue Idx = N->getOperand(2);
49698
49699   // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
49700   if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
49701     return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
49702
49703   if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
49704     unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49705     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49706     if (TLI.SimplifyDemandedBits(SDValue(N, 0),
49707                                  APInt::getAllOnes(NumBitsPerElt), DCI))
49708       return SDValue(N, 0);
49709   }
49710
49711   // Attempt to combine insertion patterns to a shuffle.
49712   if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
49713     SDValue Op(N, 0);
49714     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49715       return Res;
49716   }
49717
49718   return SDValue();
49719 }
49720
49721 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
49722 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
49723 /// OR -> CMPNEQSS.
49724 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
49725                                    TargetLowering::DAGCombinerInfo &DCI,
49726                                    const X86Subtarget &Subtarget) {
49727   unsigned opcode;
49728
49729   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
49730   // we're requiring SSE2 for both.
49731   if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
49732     SDValue N0 = N->getOperand(0);
49733     SDValue N1 = N->getOperand(1);
49734     SDValue CMP0 = N0.getOperand(1);
49735     SDValue CMP1 = N1.getOperand(1);
49736     SDLoc DL(N);
49737
49738     // The SETCCs should both refer to the same CMP.
49739     if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
49740       return SDValue();
49741
49742     SDValue CMP00 = CMP0->getOperand(0);
49743     SDValue CMP01 = CMP0->getOperand(1);
49744     EVT     VT    = CMP00.getValueType();
49745
49746     if (VT == MVT::f32 || VT == MVT::f64 ||
49747         (VT == MVT::f16 && Subtarget.hasFP16())) {
49748       bool ExpectingFlags = false;
49749       // Check for any users that want flags:
49750       for (const SDNode *U : N->uses()) {
49751         if (ExpectingFlags)
49752           break;
49753
49754         switch (U->getOpcode()) {
49755         default:
49756         case ISD::BR_CC:
49757         case ISD::BRCOND:
49758         case ISD::SELECT:
49759           ExpectingFlags = true;
49760           break;
49761         case ISD::CopyToReg:
49762         case ISD::SIGN_EXTEND:
49763         case ISD::ZERO_EXTEND:
49764         case ISD::ANY_EXTEND:
49765           break;
49766         }
49767       }
49768
49769       if (!ExpectingFlags) {
49770         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
49771         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
49772
49773         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
49774           X86::CondCode tmp = cc0;
49775           cc0 = cc1;
49776           cc1 = tmp;
49777         }
49778
49779         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
49780             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
49781           // FIXME: need symbolic constants for these magic numbers.
49782           // See X86ATTInstPrinter.cpp:printSSECC().
49783           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
49784           if (Subtarget.hasAVX512()) {
49785             SDValue FSetCC =
49786                 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
49787                             DAG.getTargetConstant(x86cc, DL, MVT::i8));
49788             // Need to fill with zeros to ensure the bitcast will produce zeroes
49789             // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
49790             SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
49791                                       DAG.getConstant(0, DL, MVT::v16i1),
49792                                       FSetCC, DAG.getIntPtrConstant(0, DL));
49793             return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
49794                                       N->getSimpleValueType(0));
49795           }
49796           SDValue OnesOrZeroesF =
49797               DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
49798                           CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
49799
49800           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
49801           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
49802
49803           if (is64BitFP && !Subtarget.is64Bit()) {
49804             // On a 32-bit target, we cannot bitcast the 64-bit float to a
49805             // 64-bit integer, since that's not a legal type. Since
49806             // OnesOrZeroesF is all ones or all zeroes, we don't need all the
49807             // bits, but can do this little dance to extract the lowest 32 bits
49808             // and work with those going forward.
49809             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
49810                                            OnesOrZeroesF);
49811             SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
49812             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
49813                                         Vector32, DAG.getIntPtrConstant(0, DL));
49814             IntVT = MVT::i32;
49815           }
49816
49817           SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
49818           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
49819                                       DAG.getConstant(1, DL, IntVT));
49820           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
49821                                               ANDed);
49822           return OneBitOfTruth;
49823         }
49824       }
49825     }
49826   }
49827   return SDValue();
49828 }
49829
49830 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
49831 static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
49832   assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49833
49834   MVT VT = N->getSimpleValueType(0);
49835   if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
49836     return SDValue();
49837
49838   SDValue X, Y;
49839   SDValue N0 = N->getOperand(0);
49840   SDValue N1 = N->getOperand(1);
49841
49842   if (SDValue Not = IsNOT(N0, DAG)) {
49843     X = Not;
49844     Y = N1;
49845   } else if (SDValue Not = IsNOT(N1, DAG)) {
49846     X = Not;
49847     Y = N0;
49848   } else
49849     return SDValue();
49850
49851   X = DAG.getBitcast(VT, X);
49852   Y = DAG.getBitcast(VT, Y);
49853   return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
49854 }
49855
49856 /// Try to fold:
49857 ///   and (vector_shuffle<Z,...,Z>
49858 ///            (insert_vector_elt undef, (xor X, -1), Z), undef), Y
49859 ///   ->
49860 ///   andnp (vector_shuffle<Z,...,Z>
49861 ///              (insert_vector_elt undef, X, Z), undef), Y
49862 static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
49863                                     const X86Subtarget &Subtarget) {
49864   assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49865
49866   EVT VT = N->getValueType(0);
49867   // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
49868   // value and require extra moves.
49869   if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49870         ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
49871     return SDValue();
49872
49873   auto GetNot = [&DAG](SDValue V) {
49874     auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
49875     // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
49876     // end-users are ISD::AND including cases
49877     // (and(extract_vector_element(SVN), Y)).
49878     if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
49879         !SVN->getOperand(1).isUndef()) {
49880       return SDValue();
49881     }
49882     SDValue IVEN = SVN->getOperand(0);
49883     if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
49884         !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
49885       return SDValue();
49886     if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
49887         IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
49888       return SDValue();
49889     SDValue Src = IVEN.getOperand(1);
49890     if (SDValue Not = IsNOT(Src, DAG)) {
49891       SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
49892       SDValue NotIVEN =
49893           DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
49894                       IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
49895       return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
49896                                   SVN->getOperand(1), SVN->getMask());
49897     }
49898     return SDValue();
49899   };
49900
49901   SDValue X, Y;
49902   SDValue N0 = N->getOperand(0);
49903   SDValue N1 = N->getOperand(1);
49904
49905   if (SDValue Not = GetNot(N0)) {
49906     X = Not;
49907     Y = N1;
49908   } else if (SDValue Not = GetNot(N1)) {
49909     X = Not;
49910     Y = N0;
49911   } else
49912     return SDValue();
49913
49914   X = DAG.getBitcast(VT, X);
49915   Y = DAG.getBitcast(VT, Y);
49916   SDLoc DL(N);
49917   // We do not split for SSE at all, but we need to split vectors for AVX1 and
49918   // AVX2.
49919   if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
49920     SDValue LoX, HiX;
49921     std::tie(LoX, HiX) = splitVector(X, DAG, DL);
49922     SDValue LoY, HiY;
49923     std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
49924     EVT SplitVT = LoX.getValueType();
49925     SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
49926     SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
49927     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
49928   }
49929   return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
49930 }
49931
49932 // Try to widen AND, OR and XOR nodes to VT in order to remove casts around
49933 // logical operations, like in the example below.
49934 //   or (and (truncate x, truncate y)),
49935 //      (xor (truncate z, build_vector (constants)))
49936 // Given a target type \p VT, we generate
49937 //   or (and x, y), (xor z, zext(build_vector (constants)))
49938 // given x, y and z are of type \p VT. We can do so, if operands are either
49939 // truncates from VT types, the second operand is a vector of constants or can
49940 // be recursively promoted.
49941 static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
49942                                      unsigned Depth) {
49943   // Limit recursion to avoid excessive compile times.
49944   if (Depth >= SelectionDAG::MaxRecursionDepth)
49945     return SDValue();
49946
49947   if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
49948       N->getOpcode() != ISD::OR)
49949     return SDValue();
49950
49951   SDValue N0 = N->getOperand(0);
49952   SDValue N1 = N->getOperand(1);
49953   SDLoc DL(N);
49954
49955   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49956   if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
49957     return SDValue();
49958
49959   if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
49960     N0 = NN0;
49961   else {
49962     // The Left side has to be a trunc.
49963     if (N0.getOpcode() != ISD::TRUNCATE)
49964       return SDValue();
49965
49966     // The type of the truncated inputs.
49967     if (N0.getOperand(0).getValueType() != VT)
49968       return SDValue();
49969
49970     N0 = N0.getOperand(0);
49971   }
49972
49973   if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
49974     N1 = NN1;
49975   else {
49976     // The right side has to be a 'trunc' or a constant vector.
49977     bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
49978                     N1.getOperand(0).getValueType() == VT;
49979     if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
49980       return SDValue();
49981
49982     if (RHSTrunc)
49983       N1 = N1.getOperand(0);
49984     else
49985       N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
49986   }
49987
49988   return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
49989 }
49990
49991 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
49992 // register. In most cases we actually compare or select YMM-sized registers
49993 // and mixing the two types creates horrible code. This method optimizes
49994 // some of the transition sequences.
49995 // Even with AVX-512 this is still useful for removing casts around logical
49996 // operations on vXi1 mask types.
49997 static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
49998                                      const X86Subtarget &Subtarget) {
49999   EVT VT = N->getValueType(0);
50000   assert(VT.isVector() && "Expected vector type");
50001
50002   SDLoc DL(N);
50003   assert((N->getOpcode() == ISD::ANY_EXTEND ||
50004           N->getOpcode() == ISD::ZERO_EXTEND ||
50005           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
50006
50007   SDValue Narrow = N->getOperand(0);
50008   EVT NarrowVT = Narrow.getValueType();
50009
50010   // Generate the wide operation.
50011   SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
50012   if (!Op)
50013     return SDValue();
50014   switch (N->getOpcode()) {
50015   default: llvm_unreachable("Unexpected opcode");
50016   case ISD::ANY_EXTEND:
50017     return Op;
50018   case ISD::ZERO_EXTEND:
50019     return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
50020   case ISD::SIGN_EXTEND:
50021     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
50022                        Op, DAG.getValueType(NarrowVT));
50023   }
50024 }
50025
50026 static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
50027   unsigned FPOpcode;
50028   switch (Opcode) {
50029   default: llvm_unreachable("Unexpected input node for FP logic conversion");
50030   case ISD::AND: FPOpcode = X86ISD::FAND; break;
50031   case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
50032   case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
50033   }
50034   return FPOpcode;
50035 }
50036
50037 /// If both input operands of a logic op are being cast from floating-point
50038 /// types or FP compares, try to convert this into a floating-point logic node
50039 /// to avoid unnecessary moves from SSE to integer registers.
50040 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
50041                                         TargetLowering::DAGCombinerInfo &DCI,
50042                                         const X86Subtarget &Subtarget) {
50043   EVT VT = N->getValueType(0);
50044   SDValue N0 = N->getOperand(0);
50045   SDValue N1 = N->getOperand(1);
50046   SDLoc DL(N);
50047
50048   if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
50049         (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
50050     return SDValue();
50051
50052   SDValue N00 = N0.getOperand(0);
50053   SDValue N10 = N1.getOperand(0);
50054   EVT N00Type = N00.getValueType();
50055   EVT N10Type = N10.getValueType();
50056
50057   // Ensure that both types are the same and are legal scalar fp types.
50058   if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
50059                               (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
50060                               (Subtarget.hasFP16() && N00Type == MVT::f16)))
50061     return SDValue();
50062
50063   if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
50064     unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
50065     SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
50066     return DAG.getBitcast(VT, FPLogic);
50067   }
50068
50069   if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
50070       !N1.hasOneUse())
50071     return SDValue();
50072
50073   ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
50074   ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
50075
50076   // The vector ISA for FP predicates is incomplete before AVX, so converting
50077   // COMIS* to CMPS* may not be a win before AVX.
50078   if (!Subtarget.hasAVX() &&
50079       !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
50080     return SDValue();
50081
50082   // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
50083   // and vector logic:
50084   // logic (setcc N00, N01), (setcc N10, N11) -->
50085   // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
50086   unsigned NumElts = 128 / N00Type.getSizeInBits();
50087   EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
50088   EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
50089   SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
50090   SDValue N01 = N0.getOperand(1);
50091   SDValue N11 = N1.getOperand(1);
50092   SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
50093   SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
50094   SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
50095   SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
50096   SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
50097   SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
50098   SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
50099   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
50100 }
50101
50102 // Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
50103 // to reduce XMM->GPR traffic.
50104 static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
50105   unsigned Opc = N->getOpcode();
50106   assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
50107          "Unexpected bit opcode");
50108
50109   SDValue N0 = N->getOperand(0);
50110   SDValue N1 = N->getOperand(1);
50111
50112   // Both operands must be single use MOVMSK.
50113   if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
50114       N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
50115     return SDValue();
50116
50117   SDValue Vec0 = N0.getOperand(0);
50118   SDValue Vec1 = N1.getOperand(0);
50119   EVT VecVT0 = Vec0.getValueType();
50120   EVT VecVT1 = Vec1.getValueType();
50121
50122   // Both MOVMSK operands must be from vectors of the same size and same element
50123   // size, but its OK for a fp/int diff.
50124   if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
50125       VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
50126     return SDValue();
50127
50128   SDLoc DL(N);
50129   unsigned VecOpc =
50130       VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
50131   SDValue Result =
50132       DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
50133   return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
50134 }
50135
50136 // Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
50137 // NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
50138 // handles in InstCombine.
50139 static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
50140   unsigned Opc = N->getOpcode();
50141   assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
50142          "Unexpected bit opcode");
50143
50144   SDValue N0 = N->getOperand(0);
50145   SDValue N1 = N->getOperand(1);
50146   EVT VT = N->getValueType(0);
50147
50148   // Both operands must be single use.
50149   if (!N0.hasOneUse() || !N1.hasOneUse())
50150     return SDValue();
50151
50152   // Search for matching shifts.
50153   SDValue BC0 = peekThroughOneUseBitcasts(N0);
50154   SDValue BC1 = peekThroughOneUseBitcasts(N1);
50155
50156   unsigned BCOpc = BC0.getOpcode();
50157   EVT BCVT = BC0.getValueType();
50158   if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
50159     return SDValue();
50160
50161   switch (BCOpc) {
50162   case X86ISD::VSHLI:
50163   case X86ISD::VSRLI:
50164   case X86ISD::VSRAI: {
50165     if (BC0.getOperand(1) != BC1.getOperand(1))
50166       return SDValue();
50167
50168     SDLoc DL(N);
50169     SDValue BitOp =
50170         DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
50171     SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
50172     return DAG.getBitcast(VT, Shift);
50173   }
50174   }
50175
50176   return SDValue();
50177 }
50178
50179 // Attempt to fold:
50180 // BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
50181 // TODO: Handle PACKUS handling.
50182 static SDValue combineBitOpWithPACK(SDNode *N, SelectionDAG &DAG) {
50183   unsigned Opc = N->getOpcode();
50184   assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
50185          "Unexpected bit opcode");
50186
50187   SDValue N0 = N->getOperand(0);
50188   SDValue N1 = N->getOperand(1);
50189   EVT VT = N->getValueType(0);
50190
50191   // Both operands must be single use.
50192   if (!N0.hasOneUse() || !N1.hasOneUse())
50193     return SDValue();
50194
50195   // Search for matching packs.
50196   N0 = peekThroughOneUseBitcasts(N0);
50197   N1 = peekThroughOneUseBitcasts(N1);
50198
50199   if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
50200     return SDValue();
50201
50202   MVT DstVT = N0.getSimpleValueType();
50203   if (DstVT != N1.getSimpleValueType())
50204     return SDValue();
50205
50206   MVT SrcVT = N0.getOperand(0).getSimpleValueType();
50207   unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
50208
50209   // Limit to allsignbits packing.
50210   if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
50211       DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
50212       DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
50213       DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
50214     return SDValue();
50215
50216   SDLoc DL(N);
50217   SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
50218   SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
50219   return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
50220 }
50221
50222 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
50223 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
50224 /// with a shift-right to eliminate loading the vector constant mask value.
50225 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
50226                                      const X86Subtarget &Subtarget) {
50227   SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
50228   SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
50229   EVT VT = Op0.getValueType();
50230   if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
50231     return SDValue();
50232
50233   // Try to convert an "is positive" signbit masking operation into arithmetic
50234   // shift and "andn". This saves a materialization of a -1 vector constant.
50235   // The "is negative" variant should be handled more generally because it only
50236   // requires "and" rather than "andn":
50237   // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
50238   //
50239   // This is limited to the original type to avoid producing even more bitcasts.
50240   // If the bitcasts can't be eliminated, then it is unlikely that this fold
50241   // will be profitable.
50242   if (N->getValueType(0) == VT &&
50243       supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
50244     SDValue X, Y;
50245     if (Op1.getOpcode() == X86ISD::PCMPGT &&
50246         isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
50247       X = Op1.getOperand(0);
50248       Y = Op0;
50249     } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
50250                isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
50251       X = Op0.getOperand(0);
50252       Y = Op1;
50253     }
50254     if (X && Y) {
50255       SDLoc DL(N);
50256       SDValue Sra =
50257           getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
50258                                      VT.getScalarSizeInBits() - 1, DAG);
50259       return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
50260     }
50261   }
50262
50263   APInt SplatVal;
50264   if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
50265       !SplatVal.isMask())
50266     return SDValue();
50267
50268   // Don't prevent creation of ANDN.
50269   if (isBitwiseNot(Op0))
50270     return SDValue();
50271
50272   if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
50273     return SDValue();
50274
50275   unsigned EltBitWidth = VT.getScalarSizeInBits();
50276   if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
50277     return SDValue();
50278
50279   SDLoc DL(N);
50280   unsigned ShiftVal = SplatVal.countr_one();
50281   SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
50282   SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
50283   return DAG.getBitcast(N->getValueType(0), Shift);
50284 }
50285
50286 // Get the index node from the lowered DAG of a GEP IR instruction with one
50287 // indexing dimension.
50288 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
50289   if (Ld->isIndexed())
50290     return SDValue();
50291
50292   SDValue Base = Ld->getBasePtr();
50293
50294   if (Base.getOpcode() != ISD::ADD)
50295     return SDValue();
50296
50297   SDValue ShiftedIndex = Base.getOperand(0);
50298
50299   if (ShiftedIndex.getOpcode() != ISD::SHL)
50300     return SDValue();
50301
50302   return ShiftedIndex.getOperand(0);
50303
50304 }
50305
50306 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
50307   if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
50308     switch (VT.getSizeInBits()) {
50309     default: return false;
50310     case 64: return Subtarget.is64Bit() ? true : false;
50311     case 32: return true;
50312     }
50313   }
50314   return false;
50315 }
50316
50317 // This function recognizes cases where X86 bzhi instruction can replace and
50318 // 'and-load' sequence.
50319 // In case of loading integer value from an array of constants which is defined
50320 // as follows:
50321 //
50322 //   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
50323 //
50324 // then applying a bitwise and on the result with another input.
50325 // It's equivalent to performing bzhi (zero high bits) on the input, with the
50326 // same index of the load.
50327 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
50328                                     const X86Subtarget &Subtarget) {
50329   MVT VT = Node->getSimpleValueType(0);
50330   SDLoc dl(Node);
50331
50332   // Check if subtarget has BZHI instruction for the node's type
50333   if (!hasBZHI(Subtarget, VT))
50334     return SDValue();
50335
50336   // Try matching the pattern for both operands.
50337   for (unsigned i = 0; i < 2; i++) {
50338     SDValue N = Node->getOperand(i);
50339     LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
50340
50341      // continue if the operand is not a load instruction
50342     if (!Ld)
50343       return SDValue();
50344
50345     const Value *MemOp = Ld->getMemOperand()->getValue();
50346
50347     if (!MemOp)
50348       return SDValue();
50349
50350     if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
50351       if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
50352         if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
50353
50354           Constant *Init = GV->getInitializer();
50355           Type *Ty = Init->getType();
50356           if (!isa<ConstantDataArray>(Init) ||
50357               !Ty->getArrayElementType()->isIntegerTy() ||
50358               Ty->getArrayElementType()->getScalarSizeInBits() !=
50359                   VT.getSizeInBits() ||
50360               Ty->getArrayNumElements() >
50361                   Ty->getArrayElementType()->getScalarSizeInBits())
50362             continue;
50363
50364           // Check if the array's constant elements are suitable to our case.
50365           uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
50366           bool ConstantsMatch = true;
50367           for (uint64_t j = 0; j < ArrayElementCount; j++) {
50368             auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
50369             if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
50370               ConstantsMatch = false;
50371               break;
50372             }
50373           }
50374           if (!ConstantsMatch)
50375             continue;
50376
50377           // Do the transformation (For 32-bit type):
50378           // -> (and (load arr[idx]), inp)
50379           // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
50380           //    that will be replaced with one bzhi instruction.
50381           SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
50382           SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
50383
50384           // Get the Node which indexes into the array.
50385           SDValue Index = getIndexFromUnindexedLoad(Ld);
50386           if (!Index)
50387             return SDValue();
50388           Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
50389
50390           SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
50391           Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
50392
50393           SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
50394           SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
50395
50396           return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
50397         }
50398       }
50399     }
50400   }
50401   return SDValue();
50402 }
50403
50404 // Fold (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef, ...))), C)
50405 // when C is a low-bit mask with as many bits as the setcc has elements and
50406 // the setcc freely zeroes the upper bits of the k-register. The undef
50407 // concat operands are replaced with zeroes, which makes the AND redundant.
50408 // This mainly helps v2i1/v4i1 setcc results that are cast to a scalar.
50409 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
50410                                              const X86Subtarget &Subtarget) {
50411   assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
50412
50413   // The RHS must be a constant; its value is validated further down.
50414   auto *MaskC = dyn_cast<ConstantSDNode>(N->getOperand(1));
50415   if (!MaskC)
50416     return SDValue();
50417
50418   // A ConstantSDNode operand implies a scalar result type.
50419   EVT VT = N->getValueType(0);
50420   assert(!VT.isVector() && "Expected scalar VT!");
50421
50422   SDValue Src = N->getOperand(0);
50423   if (!Src.hasOneUse())
50424     return SDValue();
50425
50426   // Optionally look through a one-use any_extend.
50427   if (Src.getOpcode() == ISD::ANY_EXTEND) {
50428     Src = Src.getOperand(0);
50429     if (!Src.hasOneUse())
50430       return SDValue();
50431   }
50432
50433   // Require a bitcast whose source has a single use.
50434   if (Src.getOpcode() != ISD::BITCAST)
50435     return SDValue();
50436   Src = Src.getOperand(0);
50437   if (!Src.hasOneUse())
50438     return SDValue();
50439
50440   // The bitcast source must be a legal vXi1 concat_vectors.
50441   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50442   EVT SrcVT = Src.getValueType();
50443   if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
50444       !TLI.isTypeLegal(SrcVT) || Src.getOpcode() != ISD::CONCAT_VECTORS)
50445     return SDValue();
50446
50447   // Only the first concat operand matters; after the rewrite the remaining
50448   // operands are expected to be ignored because of the AND.
50449   SDValue SubVec = Src.getOperand(0);
50450   EVT SubVecVT = SubVec.getValueType();
50451
50452   // The AND constant must be a mask with one bit per SubVec element.
50453   if (!TLI.isTypeLegal(SubVecVT) ||
50454       !MaskC->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
50455     return SDValue();
50456
50457   // Accept a setcc whose compared type is legal, provided the subtarget has
50458   // VLX or the type is 512 bits wide, and has BWI or >= 32-bit elements.
50459   auto IsLegalSetCC = [&](SDValue V) {
50460     if (V.getOpcode() != ISD::SETCC)
50461       return false;
50462     EVT CmpVT = V.getOperand(0).getValueType();
50463     if (!TLI.isTypeLegal(CmpVT))
50464       return false;
50465     return (Subtarget.hasVLX() || CmpVT.is512BitVector()) &&
50466            (Subtarget.hasBWI() || CmpVT.getScalarSizeInBits() >= 32);
50467   };
50468
50469   // SubVec must be such a setcc, or an AND with at least one such operand.
50470   if (!IsLegalSetCC(SubVec)) {
50471     if (SubVec.getOpcode() != ISD::AND)
50472       return SDValue();
50473     if (!IsLegalSetCC(SubVec.getOperand(0)) &&
50474         !IsLegalSetCC(SubVec.getOperand(1)))
50475       return SDValue();
50476   }
50477
50478   // All checks passed: rebuild the concat_vectors with zeroes in place of
50479   // the other operands and cast the result back to VT.
50480   SDLoc dl(N);
50481   SDValue Zero = DAG.getConstant(0, dl, SubVecVT);
50482   SmallVector<SDValue, 4> Ops(Src.getNumOperands(), Zero);
50483   Ops[0] = SubVec;
50484   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, Ops);
50485   EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
50486   return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
50487 }
50488
50489 // Looks through up to kMaxDepth one-use nodes of opcode Opc below Op for a
50490 // sub/add that forms a BMI pattern (BLSI/BLSR/BLSMSK) with OpMustEq. Returns
50491 // the re-associated expression on a match and an empty SDValue otherwise.
50492 static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
50493                                 SDValue OpMustEq, SDValue Op, unsigned Depth) {
50494   // Keep the recursion shallow; this is not a critical optimization.
50495   static constexpr unsigned kMaxDepth = 2;
50496
50497   // Re-ordering is only done when Op has a single use.
50498   if (!Op.hasOneUse())
50499     return SDValue();
50500
50501   SDLoc DL(Op);
50502   EVT OpVT = Op.getValueType();
50503   unsigned OpOpc = Op.getOpcode();
50504   // Another associative op of the same kind: recurse into both operands.
50505   if (OpOpc == Opc) {
50506     if (Depth >= kMaxDepth)
50507       return SDValue();
50508     for (unsigned Idx = 0; Idx != 2; ++Idx) {
50509       SDValue Inner = getBMIMatchingOp(Opc, DAG, OpMustEq,
50510                                        Op.getOperand(Idx), Depth + 1);
50511       if (Inner)
50512         return DAG.getNode(OpOpc, DL, OpVT, Inner, Op.getOperand(1 - Idx));
50513     }
50514     return SDValue();
50515   }
50516
50517   // Opc must be ISD::AND or ISD::XOR for the patterns below.
50518   bool IsMatch = false;
50519   if (OpOpc == ISD::SUB) {
50520     // BLSI: (and x, (sub 0, x))
50521     IsMatch = Opc == ISD::AND && isNullConstant(Op.getOperand(0)) &&
50522               Op.getOperand(1) == OpMustEq;
50523     // BLSR: (and x, (sub x, 1)); BLSMSK: (xor x, (sub x, 1))
50524     IsMatch = IsMatch || (isOneConstant(Op.getOperand(1)) &&
50525                           Op.getOperand(0) == OpMustEq);
50526   } else if (OpOpc == ISD::ADD) {
50527     // BLSR: (and x, (add x, -1)); BLSMSK: (xor x, (add x, -1))
50528     IsMatch = isAllOnesConstant(Op.getOperand(1)) &&
50529               Op.getOperand(0) == OpMustEq;
50530   }
50531   return IsMatch ? DAG.getNode(Opc, DL, OpVT, OpMustEq, Op) : SDValue();
50532 }
50533
50534 // Try to expose a BMI pattern (see getBMIMatchingOp) from the AND/XOR at N.
50535 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
50536                                  const X86Subtarget &Subtarget) {
50537   // Only scalar i32/i64 nodes on BMI-capable subtargets are candidates.
50538   EVT VT = N->getValueType(0);
50539   bool IsCandidate = Subtarget.hasBMI() && VT.isScalarInteger() &&
50540                      (VT == MVT::i32 || VT == MVT::i64);
50541   if (!IsCandidate)
50542     return SDValue();
50543
50544   unsigned Opc = N->getOpcode();
50545   assert(Opc == ISD::AND || Opc == ISD::XOR);
50546   // Try each operand in turn as the value the other side must pair with.
50547   SDValue Op0 = N->getOperand(0), Op1 = N->getOperand(1);
50548   if (SDValue Match = getBMIMatchingOp(Opc, DAG, Op0, Op1, 0))
50549     return Match;
50550   return getBMIMatchingOp(Opc, DAG, Op1, Op0, 0);
50551 }
50552
50553 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
50554                           TargetLowering::DAGCombinerInfo &DCI,
50555                           const X86Subtarget &Subtarget) {
        // Combine for an AND node N (combineScalarAndWithMaskSetcc, called below,
        // asserts that the opcode is ISD::AND). The folds are tried in the order
        // written and the first one that produces a value wins; an empty SDValue
        // is returned when none applies.
50556   SDValue N0 = N->getOperand(0);
50557   SDValue N1 = N->getOperand(1);
50558   EVT VT = N->getValueType(0);
50559   SDLoc dl(N);
50560   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50561
50562   // If this is SSE1 only convert to FAND to avoid scalarization.
50563   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50564     return DAG.getBitcast(MVT::v4i32,
50565                           DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
50566                                       DAG.getBitcast(MVT::v4f32, N0),
50567                                       DAG.getBitcast(MVT::v4f32, N1)));
50568   }
50569
50570   // Use a 32-bit and+zext if upper bits known zero.
50571   if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
50572     APInt HiMask = APInt::getHighBitsSet(64, 32);
50573     if (DAG.MaskedValueIsZero(N1, HiMask) ||
50574         DAG.MaskedValueIsZero(N0, HiMask)) {
50575       SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
50576       SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
50577       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
50578                          DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
50579     }
50580   }
50581
50582   // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
50583   // TODO: Support multiple SrcOps.
50584   if (VT == MVT::i1) {
50585     SmallVector<SDValue, 2> SrcOps;
50586     SmallVector<APInt, 2> SrcPartials;
50587     if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
50588         SrcOps.size() == 1) {
50589       unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
50590       EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50591       SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
50592       if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
50593         Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
50594       if (Mask) {
50595         assert(SrcPartials[0].getBitWidth() == NumElts &&
50596                "Unexpected partial reduction mask");
50597         SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
50598         Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
50599         return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
50600       }
50601     }
50602   }
50603
50604   // InstCombine converts:
50605   //    `(-x << C0) & C1`
50606   // to
50607   //    `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
50608   // This saves an IR instruction but on x86 the neg/shift version is preferable
50609   // so undo the transform.
50610
50611   if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
50612     // TODO: We don't actually need a splat for this, we just need the checks to
50613     // hold for each element.
50614     ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
50615                                               /*AllowTruncation*/ false);
50616     ConstantSDNode *N01C =
50617         isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
50618                             /*AllowTruncation*/ false);
50619     if (N1C && N01C) {
50620       const APInt &MulC = N01C->getAPIntValue();
50621       const APInt &AndC = N1C->getAPIntValue();
50622       APInt MulCLowBit = MulC & (-MulC);
50623       if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
50624           (MulCLowBit + MulC).isPowerOf2()) {
50625         SDValue Neg = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT),
50626                                   N0.getOperand(0));
50627         int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
50628         assert(MulCLowBitLog != -1 &&
50629                "Isolated lowbit is somehow not a power of 2!");
50630         SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
50631                                     DAG.getConstant(MulCLowBitLog, dl, VT));
50632         return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
50633       }
50634     }
50635   }
50636
50637   if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
50638     return V;
50639
50640   if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50641     return R;
50642
50643   if (SDValue R = combineBitOpWithShift(N, DAG))
50644     return R;
50645
50646   if (SDValue R = combineBitOpWithPACK(N, DAG))
50647     return R;
50648
50649   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50650     return FPLogic;
50651
50652   if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
50653     return R;
50654
        // Everything below this point is skipped while DCI reports that we are
        // still before op legalization.
50655   if (DCI.isBeforeLegalizeOps())
50656     return SDValue();
50657
50658   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50659     return R;
50660
50661   if (SDValue R = combineAndNotIntoANDNP(N, DAG))
50662     return R;
50663
50664   if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
50665     return ShiftRight;
50666
50667   if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
50668     return R;
50669
50670   // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
50671   // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
50672   // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
50673   if (VT.isVector() && getTargetConstantFromNode(N1)) {
50674     unsigned Opc0 = N0.getOpcode();
50675     if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
50676         getTargetConstantFromNode(N0.getOperand(1)) &&
50677         DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
50678         N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
50679       SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
50680       return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
50681     }
50682   }
50683
50684   // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
50685   // avoids slow variable shift (moving shift amount to ECX etc.)
50686   if (isOneConstant(N1) && N0->hasOneUse()) {
50687     SDValue Src = N0;
50688     while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
50689             Src.getOpcode() == ISD::TRUNCATE) &&
50690            Src.getOperand(0)->hasOneUse())
50691       Src = Src.getOperand(0);
50692     bool ContainsNOT = false;
50693     X86::CondCode X86CC = X86::COND_B;
50694     // Peek through AND(NOT(SRL(X,Y)),1).
50695     if (isBitwiseNot(Src)) {
50696       Src = Src.getOperand(0);
50697       X86CC = X86::COND_AE;
50698       ContainsNOT = true;
50699     }
50700     if (Src.getOpcode() == ISD::SRL &&
50701         !isa<ConstantSDNode>(Src.getOperand(1))) {
50702       SDValue BitNo = Src.getOperand(1);
50703       Src = Src.getOperand(0);
50704       // Peek through AND(SRL(NOT(X),Y),1).
50705       if (isBitwiseNot(Src)) {
50706         Src = Src.getOperand(0);
50707         X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
50708         ContainsNOT = true;
50709       }
50710       // If we have BMI2 then SHRX should be faster for i32/i64 cases.
50711       if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
50712         if (SDValue BT = getBT(Src, BitNo, dl, DAG))
50713           return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
50714     }
50715   }
50716
50717   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50718     // Attempt to recursively combine a bitmask AND with shuffles.
50719     SDValue Op(N, 0);
50720     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50721       return Res;
50722
50723     // If either operand is a constant mask, then only the elements that aren't
50724     // zero are actually demanded by the other operand.
50725     auto GetDemandedMasks = [&](SDValue Op) {
50726       APInt UndefElts;
50727       SmallVector<APInt> EltBits;
50728       int NumElts = VT.getVectorNumElements();
50729       int EltSizeInBits = VT.getScalarSizeInBits();
50730       APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
50731       APInt DemandedElts = APInt::getAllOnes(NumElts);
50732       if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
50733                                         EltBits)) {
50734         DemandedBits.clearAllBits();
50735         DemandedElts.clearAllBits();
50736         for (int I = 0; I != NumElts; ++I) {
50737           if (UndefElts[I]) {
50738             // We can't assume an undef src element gives an undef dst - the
50739             // other src might be zero.
50740             DemandedBits.setAllBits();
50741             DemandedElts.setBit(I);
50742           } else if (!EltBits[I].isZero()) {
50743             DemandedBits |= EltBits[I];
50744             DemandedElts.setBit(I);
50745           }
50746         }
50747       }
50748       return std::make_pair(DemandedBits, DemandedElts);
50749     };
50750     APInt Bits0, Elts0;
50751     APInt Bits1, Elts1;
          // Note the cross-over: the masks applied to N0 are derived from N1's
          // constant bits, and the masks applied to N1 from N0's.
50752     std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
50753     std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
50754
50755     if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
50756         TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
50757         TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
50758         TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
            // One of the Simplify* calls returned true: put N back on the
            // worklist unless it has been deleted, and return N itself.
50759       if (N->getOpcode() != ISD::DELETED_NODE)
50760         DCI.AddToWorklist(N);
50761       return SDValue(N, 0);
50762     }
50763
50764     SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
50765     SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
50766     if (NewN0 || NewN1)
50767       return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
50768                          NewN1 ? NewN1 : N1);
50769   }
50770
50771   // Attempt to combine a scalar bitmask AND with an extracted shuffle.
50772   if ((VT.getScalarSizeInBits() % 8) == 0 &&
50773       N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50774       isa<ConstantSDNode>(N0.getOperand(1))) {
50775     SDValue BitMask = N1;
50776     SDValue SrcVec = N0.getOperand(0);
50777     EVT SrcVecVT = SrcVec.getValueType();
50778
50779     // Check that the constant bitmask masks whole bytes.
50780     APInt UndefElts;
50781     SmallVector<APInt, 64> EltBits;
50782     if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
50783         getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
50784         llvm::all_of(EltBits, [](const APInt &M) {
50785           return M.isZero() || M.isAllOnes();
50786         })) {
50787       unsigned NumElts = SrcVecVT.getVectorNumElements();
50788       unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
50789       unsigned Idx = N0.getConstantOperandVal(1);
50790
50791       // Create a root shuffle mask from the byte mask and the extracted index.
50792       SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
50793       for (unsigned i = 0; i != Scale; ++i) {
50794         if (UndefElts[i])
50795           continue;
50796         int VecIdx = Scale * Idx + i;
50797         ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
50798       }
50799
50800       if (SDValue Shuffle = combineX86ShufflesRecursively(
50801               {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
50802               X86::MaxShuffleCombineDepth,
50803               /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
50804               /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
50805         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
50806                            N0.getOperand(1));
50807     }
50808   }
50809
50810   if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
50811     return R;
50812
50813   return SDValue();
50814 }
50815
50816 // Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y)), or emit a
50817 // VPTERNLOG bit-select directly when useVPTERNLOG() allows it for VT.
50818 static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
50819                                      const X86Subtarget &Subtarget) {
50820   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50821
50822   // Only vectors whose elements are a whole number of bytes are handled.
50823   MVT VT = N->getSimpleValueType(0);
50824   unsigned EltSizeInBits = VT.getScalarSizeInBits();
50825   if (!VT.isVector() || (EltSizeInBits % 8) != 0)
50826     return SDValue();
50827
50828   // Both OR operands must be ANDs (looking through bitcasts).
50829   SDValue And0 = peekThroughBitcasts(N->getOperand(0));
50830   SDValue And1 = peekThroughBitcasts(N->getOperand(1));
50831   if (And0.getOpcode() != ISD::AND || And1.getOpcode() != ISD::AND)
50832     return SDValue();
50833   SDValue Mask0 = And0.getOperand(1);
50834   SDValue Mask1 = And1.getOperand(1);
50835
50836   // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
50837   // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
50838   if (!Subtarget.hasXOP() && !useVPTERNLOG(Subtarget, VT) &&
50839       Mask0.hasOneUse() && Mask1.hasOneUse())
50840     return SDValue();
50841
50842   // Both masks must be constants; decode them as per-byte values.
50843   APInt Undefs0, Undefs1;
50844   SmallVector<APInt, 32> Bytes0, Bytes1;
50845   if (!getTargetConstantBitsFromNode(Mask0, 8, Undefs0, Bytes0, false, false))
50846     return SDValue();
50847   if (!getTargetConstantBitsFromNode(Mask1, 8, Undefs1, Bytes1, false, false))
50848     return SDValue();
50849
50850   // Every mask byte must be defined and the complement of its counterpart.
50851   // TODO - add UNDEF elts support.
50852   for (unsigned I = 0, E = Bytes0.size(); I != E; ++I)
50853     if (Undefs0[I] || Undefs1[I] || Bytes0[I] != ~Bytes1[I])
50854       return SDValue();
50855
50856   SDLoc DL(N);
50857   if (useVPTERNLOG(Subtarget, VT)) {
50858     // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
50859     // VPTERNLOG is only available as vXi32/64-bit types.
50860     MVT OpSVT = (EltSizeInBits == 32) ? MVT::i32 : MVT::i64;
50861     MVT OpVT =
50862         MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
50863     SDValue A = DAG.getBitcast(OpVT, Mask0);
50864     SDValue B = DAG.getBitcast(OpVT, And0.getOperand(0));
50865     SDValue C = DAG.getBitcast(OpVT, And1.getOperand(0));
50866     SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
50867     SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
50868                                 DAG, Subtarget);
50869     return DAG.getBitcast(VT, Res);
50870   }
50871
50872   // Keep the first AND as-is and express the second one as ANDNP(C, Y).
50873   SDValue AndNot =
50874       DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, Mask0),
50875                   DAG.getBitcast(VT, And1.getOperand(0)));
50876   return DAG.getNode(ISD::OR, DL, VT, N->getOperand(0), AndNot);
50877 }
50878
50879 // Try to match the OR(AND(~MASK,X),AND(MASK,Y)) logic pattern, expressed as
50880 // OR(AND(MASK,Y),ANDNP(MASK,X)) with the OR operands in either order.
50881 // Returns true and sets X, Y and Mask on success.
50882 static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
50883   if (N->getOpcode() != ISD::OR)
50884     return false;
50885
50886   // Canonicalize AND to the first slot.
50887   SDValue AndOp = N->getOperand(0);
50888   SDValue AndnOp = N->getOperand(1);
50889   if (AndnOp.getOpcode() == ISD::AND)
50890     std::swap(AndOp, AndnOp);
50891
50892   // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
50893   if (AndOp.getOpcode() != ISD::AND || AndnOp.getOpcode() != X86ISD::ANDNP)
50894     return false;
50895
50896   // Note: Mask and X are written even if the match fails further down.
50897   Mask = AndnOp.getOperand(0);
50898   X = AndnOp.getOperand(1);
50899
50900   // The mask must also feed the AND; its other operand is Y.
50901   // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
50902   // ANDNP combine allows other combines to happen that prevent matching.
50903   SDValue A = AndOp.getOperand(0);
50904   SDValue B = AndOp.getOperand(1);
50905   if (A != Mask && B != Mask)
50906     return false;
50907   Y = (A == Mask) ? B : A;
50908   return true;
50909 }
50910
50911 // Try to fold:
50912 //   (or (and (m, y), (pandn m, x)))
50913 // into:
50914 //   (vselect m, y, x)
50915 // As a special case, try to fold:
50916 //   (or (and (m, (sub 0, x)), (pandn m, x)))
50917 // into:
50918 //   (sub (xor X, M), M)
50919 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
50920                                             const X86Subtarget &Subtarget) {
50921   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50922
50923   // Handle 128-bit vectors with SSE2 and 256-bit vectors with hasInt256().
50924   EVT VT = N->getValueType(0);
50925   bool Is128 = VT.is128BitVector() && Subtarget.hasSSE2();
50926   bool Is256 = VT.is256BitVector() && Subtarget.hasInt256();
50927   if (!Is128 && !Is256)
50928     return SDValue();
50929
50930   SDValue X, Y, Mask;
50931   if (!matchLogicBlend(N, X, Y, Mask))
50932     return SDValue();
50933
50934   // Look through any bitcasts on the mask and on both blend inputs.
50935   Mask = peekThroughBitcasts(Mask);
50936   X = peekThroughBitcasts(X);
50937   Y = peekThroughBitcasts(Y);
50938
50939   // The mask must be an integer type whose element bits are all sign bits.
50940   // TODO: Attempt to handle floating point cases as well?
50941   EVT MaskVT = Mask.getValueType();
50942   if (!MaskVT.isInteger())
50943     return SDValue();
50944   if (DAG.ComputeNumSignBits(Mask) != MaskVT.getScalarSizeInBits())
50945     return SDValue();
50946
50947   SDLoc DL(N);
50948
50949   // Attempt to combine to conditional negate: (sub (xor X, M), M)
50950   SDValue Negate = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
50951                                                           DAG, Subtarget);
50952   if (Negate)
50953     return Negate;
50954
50955   // PBLENDVB is only available on SSE 4.1. If we have VPTERNLOG we should
50956   // prefer that since PBLENDVB is multiple uops.
50957   if (!Subtarget.hasSSE41() || Subtarget.hasVLX())
50958     return SDValue();
50959
50960   MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
50961   X = DAG.getBitcast(BlendVT, X);
50962   Y = DAG.getBitcast(BlendVT, Y);
50963   Mask = DAG.getBitcast(BlendVT, Mask);
50964   SDValue Blend = DAG.getSelect(DL, BlendVT, Mask, Y, X);
50965   return DAG.getBitcast(VT, Blend);
50966 }
50967
50968 // Helper function for combineOrCmpEqZeroToCtlzSrl
50969 // Transforms:
50970 //   seteq(cmp x, 0)
50971 //   into:
50972 //   srl(ctlz x), log2(bitsize(x))
50973 // Op is the setcc; the input pattern is checked by the caller.
50974 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
50975   SDLoc dl(Op);
50976   // Operand 1 of the setcc is the cmp; its operand 0 is the tested value.
50977   SDValue X = Op.getOperand(1).getOperand(0);
50978   EVT XVT = X.getValueType();
50979
50980   SDValue Clz = DAG.getNode(ISD::CTLZ, dl, XVT, X);
50981   // The result of the shift is true or false, and on X86, the 32-bit
50982   // encoding of shr and lzcnt is more desirable.
50983   SDValue Clz32 = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
50984   SDValue ShAmt = DAG.getConstant(Log2_32(XVT.getSizeInBits()), dl, MVT::i8);
50985   return DAG.getNode(ISD::SRL, dl, MVT::i32, Clz32, ShAmt);
50986 }
50987
50988 // Try to transform:
50989 //   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
50990 //   into:
50991 //   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
50992 // Will also attempt to match more generic cases, eg:
50993 //   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
50994 // Only applies once DCI is past legalization and isCtlzFast() holds.
50995 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
50996                                            TargetLowering::DAGCombinerInfo &DCI,
50997                                            const X86Subtarget &Subtarget) {
50998   if (DCI.isBeforeLegalize())
50999     return SDValue();
51000   if (!Subtarget.getTargetLowering()->isCtlzFast())
51001     return SDValue();
51002
51003   // Matches an OR node that has a single use.
51004   auto IsOneUseOr = [](SDValue V) {
51005     return V->getOpcode() == ISD::OR && V->hasOneUse();
51006   };
51007   // Matches a one-use setcc(eq, cmp(x, 0)) whose cmp type is at least i32.
51008   auto IsEqZeroSetCC = [](SDValue V) {
51009     if (V->getOpcode() != X86ISD::SETCC || !V->hasOneUse())
51010       return false;
51011     if (X86::CondCode(V->getConstantOperandVal(0)) != X86::COND_E)
51012       return false;
51013     SDValue Cmp = V->getOperand(1);
51014     return Cmp.getOpcode() == X86ISD::CMP &&
51015            isNullConstant(Cmp.getOperand(1)) &&
51016            Cmp.getValueType().bitsGE(MVT::i32);
51017   };
51018   // Check the zero extend is extending to 32-bit or more. The code generated by
51019   // srl(ctlz) for 16-bit or less variants of the pattern would require extra
51020   // instructions to clear the upper bits.
51021   if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
51022       !IsOneUseOr(N->getOperand(0)))
51023     return SDValue();
51024
51025   SDNode *OR = N->getOperand(0).getNode();
51026   SDValue LHS = OR->getOperand(0);
51027   SDValue RHS = OR->getOperand(1);
51028
51029   // Walk down the OR chain, saving each or(or, setcc(eq, cmp 0)) node.
51030   SmallVector<SDNode *, 2> ORNodes;
51031   auto IsChainLink = [&](SDValue Inner, SDValue Leaf) {
51032     return IsOneUseOr(Inner) && IsEqZeroSetCC(Leaf);
51033   };
51034   while (IsChainLink(LHS, RHS) || IsChainLink(RHS, LHS)) {
51035     ORNodes.push_back(OR);
51036     OR = (LHS->getOpcode() == ISD::OR ? LHS : RHS).getNode();
51037     LHS = OR->getOperand(0);
51038     RHS = OR->getOperand(1);
51039   }
51040
51041   // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
51042   if (!IsEqZeroSetCC(LHS) || !IsEqZeroSetCC(RHS) ||
51043       !IsOneUseOr(SDValue(OR, 0)))
51044     return SDValue();
51045
51046   // Lower the innermost pair to or(srl(ctlz), srl(ctlz)); the dag combiner
51047   // can then fold it into srl(or(ctlz, ctlz)).
51048   SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
51049   if (!NewLHS)
51050     return SDValue();
51051   SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
51052   if (!NewRHS)
51053     return SDValue();
51054   SDValue Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
51055
51056   // Fold in the setcc operand of each saved OR node, innermost first.
51057   while (!ORNodes.empty()) {
51058     OR = ORNodes.pop_back_val();
51059     SDValue SetCC = OR->getOperand(1);
51060     if (SetCC->getOpcode() == ISD::OR)
51061       SetCC = OR->getOperand(0);
51062     NewRHS = lowerX86CmpEqZeroToCtlzSrl(SetCC, DAG);
51063     if (!NewRHS)
51064       return SDValue();
51065     Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
51066   }
51067   return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
51068 }
51069
51070 // One operand permutation of foldMaskedMerge: And0_L must be a one-use NOT.
51071 static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
51072                                    SDValue And1_L, SDValue And1_R,
51073                                    const SDLoc &DL, SelectionDAG &DAG) {
51074   if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
51075     return SDValue();
51076   // The inverted value must be an operand of the other AND; move it left.
51077   SDValue Mask = And0_L->getOperand(0);
51078   if (And1_R == Mask)
51079     std::swap(And1_L, And1_R);
51080   if (And1_L != Mask)
51081     return SDValue();
51082   // (~Mask & And0_R) | (Mask & And1_R)
51083   // --> ((And1_R ^ And0_R) & Mask) ^ And0_R
51084   EVT VT = Mask->getValueType(0);
51085   SDValue Frozen = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
51086   SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, And1_R, Frozen);
51087   SDValue Masked = DAG.getNode(ISD::AND, DL, VT, Diff, Mask);
51088   return DAG.getNode(ISD::XOR, DL, VT, Masked, Frozen);
51089 }
51090
51091 /// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
51092 /// equivalent `((x ^ y) & m) ^ y` pattern.
51093 /// This is typically a better representation for targets without a fused
51094 /// "and-not" operation. This function is intended to be called from a
51095 /// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
51096 static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
51097   // Note that masked-merge variants using XOR or ADD expressions are
51098   // normalized to OR by InstCombine so we only check for OR.
51099   assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
51100
51101   // Both OR operands have to be single-use ANDs.
51102   SDValue Ands[2] = {Node->getOperand(0), Node->getOperand(1)};
51103   for (SDValue And : Ands)
51104     if (And->getOpcode() != ISD::AND || !And->hasOneUse())
51105       return SDValue();
51106
51107   // Try each AND, and each of its operands, as the inverted-mask position.
51108   SDLoc DL(Node);
51109   for (unsigned I = 0; I != 2; ++I) {
51110     SDValue NotAnd = Ands[I];
51111     SDValue PlainAnd = Ands[1 - I];
51112     for (unsigned J = 0; J != 2; ++J) {
51113       SDValue Result = foldMaskedMergeImpl(
51114           NotAnd->getOperand(J), NotAnd->getOperand(1 - J),
51115           PlainAnd->getOperand(0), PlainAnd->getOperand(1), DL, DAG);
51116       if (Result)
51117         return Result;
51118     }
51119   }
51120   return SDValue();
51121 }
51122
51123 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
51124 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
51125 /// with CMP+{ADC, SBB}.
51126 /// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
51127 static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
51128                                          SDValue X, SDValue Y,
51129                                          SelectionDAG &DAG,
51130                                          bool ZeroSecondOpOnly = false) {
51131   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
51132     return SDValue();
51133
51134   // Look through a one-use zext.
51135   if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
51136     Y = Y.getOperand(0);
51137
51138   X86::CondCode CC;
51139   SDValue EFLAGS;
51140   if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
51141     CC = (X86::CondCode)Y.getConstantOperandVal(0);
51142     EFLAGS = Y.getOperand(1);
51143   } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
51144              Y.hasOneUse()) {
51145     EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
51146   }
51147
51148   if (!EFLAGS)
51149     return SDValue();
51150
51151   // If X is -1 or 0, then we have an opportunity to avoid constants required in
51152   // the general case below.
51153   auto *ConstantX = dyn_cast<ConstantSDNode>(X);
51154   if (ConstantX && !ZeroSecondOpOnly) {
51155     if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
51156         (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
51157       // This is a complicated way to get -1 or 0 from the carry flag:
51158       // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
51159       //  0 - SETB  -->  0 -  (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
51160       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
51161                          DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
51162                          EFLAGS);
51163     }
51164
51165     if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
51166         (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
51167       if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
51168           EFLAGS.getValueType().isInteger() &&
51169           !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
51170         // Swap the operands of a SUB, and we have the same pattern as above.
51171         // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
51172         //  0 - SETA  (SUB A, B) -->  0 - SETB  (SUB B, A) --> SUB + SBB
51173         SDValue NewSub = DAG.getNode(
51174             X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51175             EFLAGS.getOperand(1), EFLAGS.getOperand(0));
51176         SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
51177         return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
51178                            DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
51179                            NewEFLAGS);
51180       }
51181     }
51182   }
51183
51184   if (CC == X86::COND_B) {
51185     // X + SETB Z --> adc X, 0
51186     // X - SETB Z --> sbb X, 0
51187     return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
51188                        DAG.getVTList(VT, MVT::i32), X,
51189                        DAG.getConstant(0, DL, VT), EFLAGS);
51190   }
51191
51192   if (ZeroSecondOpOnly)
51193     return SDValue();
51194
51195   if (CC == X86::COND_A) {
51196     // Try to convert COND_A into COND_B in an attempt to facilitate
51197     // materializing "setb reg".
51198     //
51199     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
51200     // cannot take an immediate as its first operand.
51201     //
51202     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
51203         EFLAGS.getValueType().isInteger() &&
51204         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
51205       SDValue NewSub =
51206           DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51207                       EFLAGS.getOperand(1), EFLAGS.getOperand(0));
51208       SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
51209       return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
51210                          DAG.getVTList(VT, MVT::i32), X,
51211                          DAG.getConstant(0, DL, VT), NewEFLAGS);
51212     }
51213   }
51214
51215   if (CC == X86::COND_AE) {
51216     // X + SETAE --> sbb X, -1
51217     // X - SETAE --> adc X, -1
51218     return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
51219                        DAG.getVTList(VT, MVT::i32), X,
51220                        DAG.getConstant(-1, DL, VT), EFLAGS);
51221   }
51222
51223   if (CC == X86::COND_BE) {
51224     // X + SETBE --> sbb X, -1
51225     // X - SETBE --> adc X, -1
51226     // Try to convert COND_BE into COND_AE in an attempt to facilitate
51227     // materializing "setae reg".
51228     //
51229     // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
51230     // cannot take an immediate as its first operand.
51231     //
51232     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
51233         EFLAGS.getValueType().isInteger() &&
51234         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
51235       SDValue NewSub =
51236           DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
51237                       EFLAGS.getOperand(1), EFLAGS.getOperand(0));
51238       SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
51239       return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
51240                          DAG.getVTList(VT, MVT::i32), X,
51241                          DAG.getConstant(-1, DL, VT), NewEFLAGS);
51242     }
51243   }
51244
51245   if (CC != X86::COND_E && CC != X86::COND_NE)
51246     return SDValue();
51247
51248   if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
51249       !X86::isZeroNode(EFLAGS.getOperand(1)) ||
51250       !EFLAGS.getOperand(0).getValueType().isInteger())
51251     return SDValue();
51252
51253   SDValue Z = EFLAGS.getOperand(0);
51254   EVT ZVT = Z.getValueType();
51255
51256   // If X is -1 or 0, then we have an opportunity to avoid constants required in
51257   // the general case below.
51258   if (ConstantX) {
51259     // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
51260     // fake operands:
51261     //  0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
51262     // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
51263     if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
51264         (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
51265       SDValue Zero = DAG.getConstant(0, DL, ZVT);
51266       SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
51267       SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
51268       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
51269                          DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
51270                          SDValue(Neg.getNode(), 1));
51271     }
51272
51273     // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
51274     // with fake operands:
51275     //  0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
51276     // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
51277     if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
51278         (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
51279       SDValue One = DAG.getConstant(1, DL, ZVT);
51280       SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
51281       SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
51282       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
51283                          DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
51284                          Cmp1.getValue(1));
51285     }
51286   }
51287
51288   // (cmp Z, 1) sets the carry flag if Z is 0.
51289   SDValue One = DAG.getConstant(1, DL, ZVT);
51290   SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
51291   SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
51292
51293   // Add the flags type for ADC/SBB nodes.
51294   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
51295
51296   // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
51297   // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
51298   if (CC == X86::COND_NE)
51299     return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
51300                        DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
51301
51302   // X - (Z == 0) --> sub X, (zext(sete  Z, 0)) --> sbb X, 0, (cmp Z, 1)
51303   // X + (Z == 0) --> add X, (zext(sete  Z, 0)) --> adc X, 0, (cmp Z, 1)
51304   return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
51305                      DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
51306 }
51307
51308 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
51309 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
51310 /// with CMP+{ADC, SBB}.
51311 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
51312   bool IsSub = N->getOpcode() == ISD::SUB;
51313   SDValue X = N->getOperand(0);
51314   SDValue Y = N->getOperand(1);
51315   EVT VT = N->getValueType(0);
51316   SDLoc DL(N);
51317
51318   if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
51319     return ADCOrSBB;
51320
51321   // Commute and try again (negate the result for subtracts).
51322   if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
51323     if (IsSub)
51324       ADCOrSBB =
51325           DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
51326     return ADCOrSBB;
51327   }
51328
51329   return SDValue();
51330 }
51331
51332 static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
51333                                      SelectionDAG &DAG) {
51334   assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
51335          "Unexpected opcode");
51336
51337   // Delegate to combineAddOrSubToADCOrSBB if we have:
51338   //
51339   //   (xor/or (zero_extend (setcc)) imm)
51340   //
51341   // where imm is odd if and only if we have xor, in which case the XOR/OR are
51342   // equivalent to a SUB/ADD, respectively.
51343   if (N0.getOpcode() == ISD::ZERO_EXTEND &&
51344       N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
51345     if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
51346       bool IsSub = N->getOpcode() == ISD::XOR;
51347       bool N1COdd = N1C->getZExtValue() & 1;
51348       if (IsSub ? N1COdd : !N1COdd) {
51349         SDLoc DL(N);
51350         EVT VT = N->getValueType(0);
51351         if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
51352           return R;
51353       }
51354     }
51355   }
51356
51357   return SDValue();
51358 }
51359
51360 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
51361                          TargetLowering::DAGCombinerInfo &DCI,
51362                          const X86Subtarget &Subtarget) {
51363   SDValue N0 = N->getOperand(0);
51364   SDValue N1 = N->getOperand(1);
51365   EVT VT = N->getValueType(0);
51366   SDLoc dl(N);
51367   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51368
51369   // If this is SSE1 only convert to FOR to avoid scalarization.
51370   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51371     return DAG.getBitcast(MVT::v4i32,
51372                           DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
51373                                       DAG.getBitcast(MVT::v4f32, N0),
51374                                       DAG.getBitcast(MVT::v4f32, N1)));
51375   }
51376
51377   // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
51378   // TODO: Support multiple SrcOps.
51379   if (VT == MVT::i1) {
51380     SmallVector<SDValue, 2> SrcOps;
51381     SmallVector<APInt, 2> SrcPartials;
51382     if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
51383         SrcOps.size() == 1) {
51384       unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51385       EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51386       SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51387       if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51388         Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51389       if (Mask) {
51390         assert(SrcPartials[0].getBitWidth() == NumElts &&
51391                "Unexpected partial reduction mask");
51392         SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
51393         SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51394         Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51395         return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
51396       }
51397     }
51398   }
51399
51400   if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
51401     return R;
51402
51403   if (SDValue R = combineBitOpWithShift(N, DAG))
51404     return R;
51405
51406   if (SDValue R = combineBitOpWithPACK(N, DAG))
51407     return R;
51408
51409   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
51410     return FPLogic;
51411
51412   if (DCI.isBeforeLegalizeOps())
51413     return SDValue();
51414
51415   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51416     return R;
51417
51418   if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
51419     return R;
51420
51421   if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
51422     return R;
51423
51424   // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
51425   if ((VT == MVT::i32 || VT == MVT::i64) &&
51426       N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
51427       isNullConstant(N0.getOperand(0))) {
51428     SDValue Cond = N0.getOperand(1);
51429     if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
51430       Cond = Cond.getOperand(0);
51431
51432     if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
51433       if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
51434         uint64_t Val = CN->getZExtValue();
51435         if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
51436           X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
51437           CCode = X86::GetOppositeBranchCondition(CCode);
51438           SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
51439
51440           SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
51441           R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
51442           R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
51443           return R;
51444         }
51445       }
51446     }
51447   }
51448
51449   // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
51450   // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
51451   // iff the upper elements of the non-shifted arg are zero.
51452   // KUNPCK require 16+ bool vector elements.
51453   if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
51454     unsigned NumElts = VT.getVectorNumElements();
51455     unsigned HalfElts = NumElts / 2;
51456     APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
51457     if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
51458         N1.getConstantOperandAPInt(1) == HalfElts &&
51459         DAG.MaskedVectorIsZero(N0, UpperElts)) {
51460       return DAG.getNode(
51461           ISD::CONCAT_VECTORS, dl, VT,
51462           extractSubVector(N0, 0, DAG, dl, HalfElts),
51463           extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
51464     }
51465     if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
51466         N0.getConstantOperandAPInt(1) == HalfElts &&
51467         DAG.MaskedVectorIsZero(N1, UpperElts)) {
51468       return DAG.getNode(
51469           ISD::CONCAT_VECTORS, dl, VT,
51470           extractSubVector(N1, 0, DAG, dl, HalfElts),
51471           extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
51472     }
51473   }
51474
51475   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51476     // Attempt to recursively combine an OR of shuffles.
51477     SDValue Op(N, 0);
51478     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51479       return Res;
51480
51481     // If either operand is a constant mask, then only the elements that aren't
51482     // allones are actually demanded by the other operand.
51483     auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
51484       APInt UndefElts;
51485       SmallVector<APInt> EltBits;
51486       int NumElts = VT.getVectorNumElements();
51487       int EltSizeInBits = VT.getScalarSizeInBits();
51488       if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
51489         return false;
51490
51491       APInt DemandedElts = APInt::getZero(NumElts);
51492       for (int I = 0; I != NumElts; ++I)
51493         if (!EltBits[I].isAllOnes())
51494           DemandedElts.setBit(I);
51495
51496       return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
51497     };
51498     if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
51499       if (N->getOpcode() != ISD::DELETED_NODE)
51500         DCI.AddToWorklist(N);
51501       return SDValue(N, 0);
51502     }
51503   }
51504
51505   // We should fold "masked merge" patterns when `andn` is not available.
51506   if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
51507     if (SDValue R = foldMaskedMerge(N, DAG))
51508       return R;
51509
51510   if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
51511     return R;
51512
51513   return SDValue();
51514 }
51515
51516 /// Try to turn tests against the signbit in the form of:
51517 ///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
51518 /// into:
51519 ///   SETGT(X, -1)
51520 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
51521   // This is only worth doing if the output type is i8 or i1.
51522   EVT ResultType = N->getValueType(0);
51523   if (ResultType != MVT::i8 && ResultType != MVT::i1)
51524     return SDValue();
51525
51526   SDValue N0 = N->getOperand(0);
51527   SDValue N1 = N->getOperand(1);
51528
51529   // We should be performing an xor against a truncated shift.
51530   if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
51531     return SDValue();
51532
51533   // Make sure we are performing an xor against one.
51534   if (!isOneConstant(N1))
51535     return SDValue();
51536
51537   // SetCC on x86 zero extends so only act on this if it's a logical shift.
51538   SDValue Shift = N0.getOperand(0);
51539   if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
51540     return SDValue();
51541
51542   // Make sure we are truncating from one of i16, i32 or i64.
51543   EVT ShiftTy = Shift.getValueType();
51544   if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
51545     return SDValue();
51546
51547   // Make sure the shift amount extracts the sign bit.
51548   if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
51549       Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
51550     return SDValue();
51551
51552   // Create a greater-than comparison against -1.
51553   // N.B. Using SETGE against 0 works but we want a canonical looking
51554   // comparison, using SETGT matches up with what TranslateX86CC.
51555   SDLoc DL(N);
51556   SDValue ShiftOp = Shift.getOperand(0);
51557   EVT ShiftOpTy = ShiftOp.getValueType();
51558   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51559   EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
51560                                                *DAG.getContext(), ResultType);
51561   SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
51562                               DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
51563   if (SetCCResultType != ResultType)
51564     Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
51565   return Cond;
51566 }
51567
51568 /// Turn vector tests of the signbit in the form of:
51569 ///   xor (sra X, elt_size(X)-1), -1
51570 /// into:
51571 ///   pcmpgt X, -1
51572 ///
51573 /// This should be called before type legalization because the pattern may not
51574 /// persist after that.
51575 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
51576                                          const X86Subtarget &Subtarget) {
51577   EVT VT = N->getValueType(0);
51578   if (!VT.isSimple())
51579     return SDValue();
51580
51581   switch (VT.getSimpleVT().SimpleTy) {
51582   default: return SDValue();
51583   case MVT::v16i8:
51584   case MVT::v8i16:
51585   case MVT::v4i32:
51586   case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
51587   case MVT::v32i8:
51588   case MVT::v16i16:
51589   case MVT::v8i32:
51590   case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
51591   }
51592
51593   // There must be a shift right algebraic before the xor, and the xor must be a
51594   // 'not' operation.
51595   SDValue Shift = N->getOperand(0);
51596   SDValue Ones = N->getOperand(1);
51597   if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
51598       !ISD::isBuildVectorAllOnes(Ones.getNode()))
51599     return SDValue();
51600
51601   // The shift should be smearing the sign bit across each vector element.
51602   auto *ShiftAmt =
51603       isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
51604   if (!ShiftAmt ||
51605       ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
51606     return SDValue();
51607
51608   // Create a greater-than comparison against -1. We don't use the more obvious
51609   // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
51610   return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
51611 }
51612
51613 /// Detect patterns of truncation with unsigned saturation:
51614 ///
51615 /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
51616 ///   Return the source value x to be truncated or SDValue() if the pattern was
51617 ///   not matched.
51618 ///
51619 /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
51620 ///   where C1 >= 0 and C2 is unsigned max of destination type.
51621 ///
51622 ///    (truncate (smax (smin (x, C2), C1)) to dest_type)
51623 ///   where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
51624 ///
51625 ///   These two patterns are equivalent to:
51626 ///   (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
51627 ///   So return the smax(x, C1) value to be truncated or SDValue() if the
51628 ///   pattern was not matched.
51629 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
51630                                  const SDLoc &DL) {
51631   EVT InVT = In.getValueType();
51632
51633   // Saturation with truncation. We truncate from InVT to VT.
51634   assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
51635          "Unexpected types for truncate operation");
51636
51637   // Match min/max and return limit value as a parameter.
51638   auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
51639     if (V.getOpcode() == Opcode &&
51640         ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
51641       return V.getOperand(0);
51642     return SDValue();
51643   };
51644
51645   APInt C1, C2;
51646   if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
51647     // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
51648     // the element size of the destination type.
51649     if (C2.isMask(VT.getScalarSizeInBits()))
51650       return UMin;
51651
51652   if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
51653     if (MatchMinMax(SMin, ISD::SMAX, C1))
51654       if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
51655         return SMin;
51656
51657   if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
51658     if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
51659       if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
51660           C2.uge(C1)) {
51661         return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
51662       }
51663
51664   return SDValue();
51665 }
51666
51667 /// Detect patterns of truncation with signed saturation:
51668 /// (truncate (smin ((smax (x, signed_min_of_dest_type)),
51669 ///                  signed_max_of_dest_type)) to dest_type)
51670 /// or:
51671 /// (truncate (smax ((smin (x, signed_max_of_dest_type)),
51672 ///                  signed_min_of_dest_type)) to dest_type).
51673 /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
51674 /// Return the source value to be truncated or SDValue() if the pattern was not
51675 /// matched.
51676 static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
51677   unsigned NumDstBits = VT.getScalarSizeInBits();
51678   unsigned NumSrcBits = In.getScalarValueSizeInBits();
51679   assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
51680
51681   auto MatchMinMax = [](SDValue V, unsigned Opcode,
51682                         const APInt &Limit) -> SDValue {
51683     APInt C;
51684     if (V.getOpcode() == Opcode &&
51685         ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
51686       return V.getOperand(0);
51687     return SDValue();
51688   };
51689
51690   APInt SignedMax, SignedMin;
51691   if (MatchPackUS) {
51692     SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
51693     SignedMin = APInt(NumSrcBits, 0);
51694   } else {
51695     SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
51696     SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
51697   }
51698
51699   if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
51700     if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
51701       return SMax;
51702
51703   if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
51704     if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
51705       return SMin;
51706
51707   return SDValue();
51708 }
51709
51710 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
51711                                       SelectionDAG &DAG,
51712                                       const X86Subtarget &Subtarget) {
51713   if (!Subtarget.hasSSE2() || !VT.isVector())
51714     return SDValue();
51715
51716   EVT SVT = VT.getVectorElementType();
51717   EVT InVT = In.getValueType();
51718   EVT InSVT = InVT.getVectorElementType();
51719
51720   // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
51721   // split across two registers. We can use a packusdw+perm to clamp to 0-65535
51722   // and concatenate at the same time. Then we can use a final vpmovuswb to
51723   // clip to 0-255.
51724   if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
51725       InVT == MVT::v16i32 && VT == MVT::v16i8) {
51726     if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51727       // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
51728       SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
51729                                            DL, DAG, Subtarget);
51730       assert(Mid && "Failed to pack!");
51731       return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
51732     }
51733   }
51734
51735   // vXi32 truncate instructions are available with AVX512F.
51736   // vXi16 truncate instructions are only available with AVX512BW.
51737   // For 256-bit or smaller vectors, we require VLX.
51738   // FIXME: We could widen truncates to 512 to remove the VLX restriction.
51739   // If the result type is 256-bits or larger and we have disable 512-bit
51740   // registers, we should go ahead and use the pack instructions if possible.
51741   bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
51742                        (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
51743                       (InVT.getSizeInBits() > 128) &&
51744                       (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
51745                       !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
51746
51747   if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
51748       VT.getSizeInBits() >= 64 &&
51749       (SVT == MVT::i8 || SVT == MVT::i16) &&
51750       (InSVT == MVT::i16 || InSVT == MVT::i32)) {
51751     if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51752       // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
51753       // Only do this when the result is at least 64 bits or we'll leaving
51754       // dangling PACKSSDW nodes.
51755       if (SVT == MVT::i8 && InSVT == MVT::i32) {
51756         EVT MidVT = VT.changeVectorElementType(MVT::i16);
51757         SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
51758                                              DAG, Subtarget);
51759         assert(Mid && "Failed to pack!");
51760         SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
51761                                            Subtarget);
51762         assert(V && "Failed to pack!");
51763         return V;
51764       } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
51765         return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
51766                                       Subtarget);
51767     }
51768     if (SDValue SSatVal = detectSSatPattern(In, VT))
51769       return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
51770                                     Subtarget);
51771   }
51772
51773   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51774   if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
51775       Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
51776       (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
51777     unsigned TruncOpc = 0;
51778     SDValue SatVal;
51779     if (SDValue SSatVal = detectSSatPattern(In, VT)) {
51780       SatVal = SSatVal;
51781       TruncOpc = X86ISD::VTRUNCS;
51782     } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
51783       SatVal = USatVal;
51784       TruncOpc = X86ISD::VTRUNCUS;
51785     }
51786     if (SatVal) {
51787       unsigned ResElts = VT.getVectorNumElements();
51788       // If the input type is less than 512 bits and we don't have VLX, we need
51789       // to widen to 512 bits.
51790       if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
51791         unsigned NumConcats = 512 / InVT.getSizeInBits();
51792         ResElts *= NumConcats;
51793         SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
51794         ConcatOps[0] = SatVal;
51795         InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
51796                                 NumConcats * InVT.getVectorNumElements());
51797         SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
51798       }
51799       // Widen the result if its narrower than 128 bits.
51800       if (ResElts * SVT.getSizeInBits() < 128)
51801         ResElts = 128 / SVT.getSizeInBits();
51802       EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
51803       SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
51804       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51805                          DAG.getIntPtrConstant(0, DL));
51806     }
51807   }
51808
51809   return SDValue();
51810 }
51811
51812 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
51813 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient
51814 /// ISD::AVGCEILU (AVG) instruction.
51815 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
51816                                 const X86Subtarget &Subtarget,
51817                                 const SDLoc &DL) {
51818   if (!VT.isVector())
51819     return SDValue();
51820   EVT InVT = In.getValueType();
51821   unsigned NumElems = VT.getVectorNumElements();
51822
51823   EVT ScalarVT = VT.getVectorElementType();
51824   if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
51825     return SDValue();
51826
51827   // InScalarVT is the intermediate type in AVG pattern and it should be greater
51828   // than the original input type (i8/i16).
51829   EVT InScalarVT = InVT.getVectorElementType();
51830   if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
51831     return SDValue();
51832
51833   if (!Subtarget.hasSSE2())
51834     return SDValue();
51835
51836   // Detect the following pattern:
51837   //
51838   //   %1 = zext <N x i8> %a to <N x i32>
51839   //   %2 = zext <N x i8> %b to <N x i32>
51840   //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
51841   //   %4 = add nuw nsw <N x i32> %3, %2
51842   //   %5 = lshr <N x i32> %N, <i32 1 x N>
51843   //   %6 = trunc <N x i32> %5 to <N x i8>
51844   //
51845   // In AVX512, the last instruction can also be a trunc store.
51846   if (In.getOpcode() != ISD::SRL)
51847     return SDValue();
51848
51849   // A lambda checking the given SDValue is a constant vector and each element
51850   // is in the range [Min, Max].
51851   auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
51852     return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
51853       return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
51854     });
51855   };
51856
51857   auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
51858     unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
51859     return MaxActiveBits <= ScalarVT.getSizeInBits();
51860   };
51861
51862   // Check if each element of the vector is right-shifted by one.
51863   SDValue LHS = In.getOperand(0);
51864   SDValue RHS = In.getOperand(1);
51865   if (!IsConstVectorInRange(RHS, 1, 1))
51866     return SDValue();
51867   if (LHS.getOpcode() != ISD::ADD)
51868     return SDValue();
51869
51870   // Detect a pattern of a + b + 1 where the order doesn't matter.
51871   SDValue Operands[3];
51872   Operands[0] = LHS.getOperand(0);
51873   Operands[1] = LHS.getOperand(1);
51874
51875   auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
51876                        ArrayRef<SDValue> Ops) {
51877     return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
51878   };
51879
51880   auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
51881     for (SDValue &Op : Ops)
51882       if (Op.getValueType() != VT)
51883         Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
51884     // Pad to a power-of-2 vector, split+apply and extract the original vector.
51885     unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
51886     EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
51887     if (NumElemsPow2 != NumElems) {
51888       for (SDValue &Op : Ops) {
51889         SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
51890         for (unsigned i = 0; i != NumElems; ++i) {
51891           SDValue Idx = DAG.getIntPtrConstant(i, DL);
51892           EltsOfOp[i] =
51893               DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
51894         }
51895         Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
51896       }
51897     }
51898     SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
51899     if (NumElemsPow2 == NumElems)
51900       return Res;
51901     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51902                        DAG.getIntPtrConstant(0, DL));
51903   };
51904
51905   // Take care of the case when one of the operands is a constant vector whose
51906   // element is in the range [1, 256].
51907   if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
51908       IsZExtLike(Operands[0])) {
51909     // The pattern is detected. Subtract one from the constant vector, then
51910     // demote it and emit X86ISD::AVG instruction.
51911     SDValue VecOnes = DAG.getConstant(1, DL, InVT);
51912     Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
51913     return AVGSplitter({Operands[0], Operands[1]});
51914   }
51915
51916   // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
51917   // Match the or case only if its 'add-like' - can be replaced by an add.
51918   auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
51919     if (ISD::ADD == V.getOpcode()) {
51920       Op0 = V.getOperand(0);
51921       Op1 = V.getOperand(1);
51922       return true;
51923     }
51924     if (ISD::ZERO_EXTEND != V.getOpcode())
51925       return false;
51926     V = V.getOperand(0);
51927     if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
51928         !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
51929       return false;
51930     Op0 = V.getOperand(0);
51931     Op1 = V.getOperand(1);
51932     return true;
51933   };
51934
51935   SDValue Op0, Op1;
51936   if (FindAddLike(Operands[0], Op0, Op1))
51937     std::swap(Operands[0], Operands[1]);
51938   else if (!FindAddLike(Operands[1], Op0, Op1))
51939     return SDValue();
51940   Operands[2] = Op0;
51941   Operands[1] = Op1;
51942
51943   // Now we have three operands of two additions. Check that one of them is a
51944   // constant vector with ones, and the other two can be promoted from i8/i16.
51945   for (SDValue &Op : Operands) {
51946     if (!IsConstVectorInRange(Op, 1, 1))
51947       continue;
51948     std::swap(Op, Operands[2]);
51949
51950     // Check if Operands[0] and Operands[1] are results of type promotion.
51951     for (int j = 0; j < 2; ++j)
51952       if (Operands[j].getValueType() != VT)
51953         if (!IsZExtLike(Operands[j]))
51954           return SDValue();
51955
51956     // The pattern is detected, emit X86ISD::AVG instruction(s).
51957     return AVGSplitter({Operands[0], Operands[1]});
51958   }
51959
51960   return SDValue();
51961 }
51962
51963 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
51964                            TargetLowering::DAGCombinerInfo &DCI,
51965                            const X86Subtarget &Subtarget) {
51966   LoadSDNode *Ld = cast<LoadSDNode>(N);
51967   EVT RegVT = Ld->getValueType(0);
51968   EVT MemVT = Ld->getMemoryVT();
51969   SDLoc dl(Ld);
51970   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51971
51972   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
51973   // into two 16-byte operations. Also split non-temporal aligned loads on
51974   // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
51975   ISD::LoadExtType Ext = Ld->getExtensionType();
51976   unsigned Fast;
51977   if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
51978       Ext == ISD::NON_EXTLOAD &&
51979       ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
51980         Ld->getAlign() >= Align(16)) ||
51981        (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
51982                                *Ld->getMemOperand(), &Fast) &&
51983         !Fast))) {
51984     unsigned NumElems = RegVT.getVectorNumElements();
51985     if (NumElems < 2)
51986       return SDValue();
51987
51988     unsigned HalfOffset = 16;
51989     SDValue Ptr1 = Ld->getBasePtr();
51990     SDValue Ptr2 =
51991         DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
51992     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
51993                                   NumElems / 2);
51994     SDValue Load1 =
51995         DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
51996                     Ld->getOriginalAlign(),
51997                     Ld->getMemOperand()->getFlags());
51998     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
51999                                 Ld->getPointerInfo().getWithOffset(HalfOffset),
52000                                 Ld->getOriginalAlign(),
52001                                 Ld->getMemOperand()->getFlags());
52002     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
52003                              Load1.getValue(1), Load2.getValue(1));
52004
52005     SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
52006     return DCI.CombineTo(N, NewVec, TF, true);
52007   }
52008
52009   // Bool vector load - attempt to cast to an integer, as we have good
52010   // (vXiY *ext(vXi1 bitcast(iX))) handling.
52011   if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
52012       RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
52013     unsigned NumElts = RegVT.getVectorNumElements();
52014     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52015     if (TLI.isTypeLegal(IntVT)) {
52016       SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
52017                                     Ld->getPointerInfo(),
52018                                     Ld->getOriginalAlign(),
52019                                     Ld->getMemOperand()->getFlags());
52020       SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
52021       return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
52022     }
52023   }
52024
52025   // If we also broadcast this as a subvector to a wider type, then just extract
52026   // the lowest subvector.
52027   if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
52028       (RegVT.is128BitVector() || RegVT.is256BitVector())) {
52029     SDValue Ptr = Ld->getBasePtr();
52030     SDValue Chain = Ld->getChain();
52031     for (SDNode *User : Ptr->uses()) {
52032       if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
52033           cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
52034           cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
52035           cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
52036               MemVT.getSizeInBits() &&
52037           !User->hasAnyUseOfValue(1) &&
52038           User->getValueSizeInBits(0).getFixedValue() >
52039               RegVT.getFixedSizeInBits()) {
52040         SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
52041                                            RegVT.getSizeInBits());
52042         Extract = DAG.getBitcast(RegVT, Extract);
52043         return DCI.CombineTo(N, Extract, SDValue(User, 1));
52044       }
52045     }
52046   }
52047
52048   // Cast ptr32 and ptr64 pointers to the default address space before a load.
52049   unsigned AddrSpace = Ld->getAddressSpace();
52050   if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
52051       AddrSpace == X86AS::PTR32_UPTR) {
52052     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
52053     if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
52054       SDValue Cast =
52055           DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
52056       return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
52057                          Ld->getOriginalAlign(),
52058                          Ld->getMemOperand()->getFlags());
52059     }
52060   }
52061
52062   return SDValue();
52063 }
52064
52065 /// If V is a build vector of boolean constants and exactly one of those
52066 /// constants is true, return the operand index of that true element.
52067 /// Otherwise, return -1.
52068 static int getOneTrueElt(SDValue V) {
52069   // This needs to be a build vector of booleans.
52070   // TODO: Checking for the i1 type matches the IR definition for the mask,
52071   // but the mask check could be loosened to i8 or other types. That might
52072   // also require checking more than 'allOnesValue'; eg, the x86 HW
52073   // instructions only require that the MSB is set for each mask element.
52074   // The ISD::MSTORE comments/definition do not specify how the mask operand
52075   // is formatted.
52076   auto *BV = dyn_cast<BuildVectorSDNode>(V);
52077   if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
52078     return -1;
52079
52080   int TrueIndex = -1;
52081   unsigned NumElts = BV->getValueType(0).getVectorNumElements();
52082   for (unsigned i = 0; i < NumElts; ++i) {
52083     const SDValue &Op = BV->getOperand(i);
52084     if (Op.isUndef())
52085       continue;
52086     auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
52087     if (!ConstNode)
52088       return -1;
52089     if (ConstNode->getAPIntValue().countr_one() >= 1) {
52090       // If we already found a one, this is too many.
52091       if (TrueIndex >= 0)
52092         return -1;
52093       TrueIndex = i;
52094     }
52095   }
52096   return TrueIndex;
52097 }
52098
52099 /// Given a masked memory load/store operation, return true if it has one mask
52100 /// bit set. If it has one mask bit set, then also return the memory address of
52101 /// the scalar element to load/store, the vector index to insert/extract that
52102 /// scalar element, and the alignment for the scalar memory access.
52103 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
52104                                          SelectionDAG &DAG, SDValue &Addr,
52105                                          SDValue &Index, Align &Alignment,
52106                                          unsigned &Offset) {
52107   int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
52108   if (TrueMaskElt < 0)
52109     return false;
52110
52111   // Get the address of the one scalar element that is specified by the mask
52112   // using the appropriate offset from the base pointer.
52113   EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
52114   Offset = 0;
52115   Addr = MaskedOp->getBasePtr();
52116   if (TrueMaskElt != 0) {
52117     Offset = TrueMaskElt * EltVT.getStoreSize();
52118     Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
52119                                     SDLoc(MaskedOp));
52120   }
52121
52122   Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
52123   Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
52124                               EltVT.getStoreSize());
52125   return true;
52126 }
52127
52128 /// If exactly one element of the mask is set for a non-extending masked load,
52129 /// it is a scalar load and vector insert.
52130 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
52131 /// mask have already been optimized in IR, so we don't bother with those here.
52132 static SDValue
52133 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
52134                              TargetLowering::DAGCombinerInfo &DCI,
52135                              const X86Subtarget &Subtarget) {
52136   assert(ML->isUnindexed() && "Unexpected indexed masked load!");
52137   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
52138   // However, some target hooks may need to be added to know when the transform
52139   // is profitable. Endianness would also have to be considered.
52140
52141   SDValue Addr, VecIndex;
52142   Align Alignment;
52143   unsigned Offset;
52144   if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
52145     return SDValue();
52146
52147   // Load the one scalar element that is specified by the mask using the
52148   // appropriate offset from the base pointer.
52149   SDLoc DL(ML);
52150   EVT VT = ML->getValueType(0);
52151   EVT EltVT = VT.getVectorElementType();
52152
52153   EVT CastVT = VT;
52154   if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
52155     EltVT = MVT::f64;
52156     CastVT = VT.changeVectorElementType(EltVT);
52157   }
52158
52159   SDValue Load =
52160       DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
52161                   ML->getPointerInfo().getWithOffset(Offset),
52162                   Alignment, ML->getMemOperand()->getFlags());
52163
52164   SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
52165
52166   // Insert the loaded element into the appropriate place in the vector.
52167   SDValue Insert =
52168       DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
52169   Insert = DAG.getBitcast(VT, Insert);
52170   return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
52171 }
52172
52173 static SDValue
52174 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
52175                               TargetLowering::DAGCombinerInfo &DCI) {
52176   assert(ML->isUnindexed() && "Unexpected indexed masked load!");
52177   if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
52178     return SDValue();
52179
52180   SDLoc DL(ML);
52181   EVT VT = ML->getValueType(0);
52182
52183   // If we are loading the first and last elements of a vector, it is safe and
52184   // always faster to load the whole vector. Replace the masked load with a
52185   // vector load and select.
52186   unsigned NumElts = VT.getVectorNumElements();
52187   BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
52188   bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
52189   bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
52190   if (LoadFirstElt && LoadLastElt) {
52191     SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
52192                                 ML->getMemOperand());
52193     SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
52194                                   ML->getPassThru());
52195     return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
52196   }
52197
52198   // Convert a masked load with a constant mask into a masked load and a select.
52199   // This allows the select operation to use a faster kind of select instruction
52200   // (for example, vblendvps -> vblendps).
52201
52202   // Don't try this if the pass-through operand is already undefined. That would
52203   // cause an infinite loop because that's what we're about to create.
52204   if (ML->getPassThru().isUndef())
52205     return SDValue();
52206
52207   if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
52208     return SDValue();
52209
52210   // The new masked load has an undef pass-through operand. The select uses the
52211   // original pass-through operand.
52212   SDValue NewML = DAG.getMaskedLoad(
52213       VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
52214       DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
52215       ML->getAddressingMode(), ML->getExtensionType());
52216   SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
52217                                 ML->getPassThru());
52218
52219   return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
52220 }
52221
52222 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
52223                                  TargetLowering::DAGCombinerInfo &DCI,
52224                                  const X86Subtarget &Subtarget) {
52225   auto *Mld = cast<MaskedLoadSDNode>(N);
52226
52227   // TODO: Expanding load with constant mask may be optimized as well.
52228   if (Mld->isExpandingLoad())
52229     return SDValue();
52230
52231   if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
52232     if (SDValue ScalarLoad =
52233             reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
52234       return ScalarLoad;
52235
52236     // TODO: Do some AVX512 subsets benefit from this transform?
52237     if (!Subtarget.hasAVX512())
52238       if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
52239         return Blend;
52240   }
52241
52242   // If the mask value has been legalized to a non-boolean vector, try to
52243   // simplify ops leading up to it. We only demand the MSB of each lane.
52244   SDValue Mask = Mld->getMask();
52245   if (Mask.getScalarValueSizeInBits() != 1) {
52246     EVT VT = Mld->getValueType(0);
52247     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52248     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
52249     if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
52250       if (N->getOpcode() != ISD::DELETED_NODE)
52251         DCI.AddToWorklist(N);
52252       return SDValue(N, 0);
52253     }
52254     if (SDValue NewMask =
52255             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
52256       return DAG.getMaskedLoad(
52257           VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
52258           NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
52259           Mld->getAddressingMode(), Mld->getExtensionType());
52260   }
52261
52262   return SDValue();
52263 }
52264
52265 /// If exactly one element of the mask is set for a non-truncating masked store,
52266 /// it is a vector extract and scalar store.
52267 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
52268 /// mask have already been optimized in IR, so we don't bother with those here.
52269 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
52270                                               SelectionDAG &DAG,
52271                                               const X86Subtarget &Subtarget) {
52272   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
52273   // However, some target hooks may need to be added to know when the transform
52274   // is profitable. Endianness would also have to be considered.
52275
52276   SDValue Addr, VecIndex;
52277   Align Alignment;
52278   unsigned Offset;
52279   if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
52280     return SDValue();
52281
52282   // Extract the one scalar element that is actually being stored.
52283   SDLoc DL(MS);
52284   SDValue Value = MS->getValue();
52285   EVT VT = Value.getValueType();
52286   EVT EltVT = VT.getVectorElementType();
52287   if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
52288     EltVT = MVT::f64;
52289     EVT CastVT = VT.changeVectorElementType(EltVT);
52290     Value = DAG.getBitcast(CastVT, Value);
52291   }
52292   SDValue Extract =
52293       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
52294
52295   // Store that element at the appropriate offset from the base pointer.
52296   return DAG.getStore(MS->getChain(), DL, Extract, Addr,
52297                       MS->getPointerInfo().getWithOffset(Offset),
52298                       Alignment, MS->getMemOperand()->getFlags());
52299 }
52300
52301 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
52302                                   TargetLowering::DAGCombinerInfo &DCI,
52303                                   const X86Subtarget &Subtarget) {
52304   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
52305   if (Mst->isCompressingStore())
52306     return SDValue();
52307
52308   EVT VT = Mst->getValue().getValueType();
52309   SDLoc dl(Mst);
52310   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52311
52312   if (Mst->isTruncatingStore())
52313     return SDValue();
52314
52315   if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
52316     return ScalarStore;
52317
52318   // If the mask value has been legalized to a non-boolean vector, try to
52319   // simplify ops leading up to it. We only demand the MSB of each lane.
52320   SDValue Mask = Mst->getMask();
52321   if (Mask.getScalarValueSizeInBits() != 1) {
52322     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
52323     if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
52324       if (N->getOpcode() != ISD::DELETED_NODE)
52325         DCI.AddToWorklist(N);
52326       return SDValue(N, 0);
52327     }
52328     if (SDValue NewMask =
52329             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
52330       return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
52331                                 Mst->getBasePtr(), Mst->getOffset(), NewMask,
52332                                 Mst->getMemoryVT(), Mst->getMemOperand(),
52333                                 Mst->getAddressingMode());
52334   }
52335
52336   SDValue Value = Mst->getValue();
52337   if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
52338       TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
52339                             Mst->getMemoryVT())) {
52340     return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
52341                               Mst->getBasePtr(), Mst->getOffset(), Mask,
52342                               Mst->getMemoryVT(), Mst->getMemOperand(),
52343                               Mst->getAddressingMode(), true);
52344   }
52345
52346   return SDValue();
52347 }
52348
52349 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
52350                             TargetLowering::DAGCombinerInfo &DCI,
52351                             const X86Subtarget &Subtarget) {
52352   StoreSDNode *St = cast<StoreSDNode>(N);
52353   EVT StVT = St->getMemoryVT();
52354   SDLoc dl(St);
52355   SDValue StoredVal = St->getValue();
52356   EVT VT = StoredVal.getValueType();
52357   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52358
52359   // Convert a store of vXi1 into a store of iX and a bitcast.
52360   if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
52361       VT.getVectorElementType() == MVT::i1) {
52362
52363     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
52364     StoredVal = DAG.getBitcast(NewVT, StoredVal);
52365
52366     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52367                         St->getPointerInfo(), St->getOriginalAlign(),
52368                         St->getMemOperand()->getFlags());
52369   }
52370
52371   // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
52372   // This will avoid a copy to k-register.
52373   if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
52374       StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
52375       StoredVal.getOperand(0).getValueType() == MVT::i8) {
52376     SDValue Val = StoredVal.getOperand(0);
52377     // We must store zeros to the unused bits.
52378     Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
52379     return DAG.getStore(St->getChain(), dl, Val,
52380                         St->getBasePtr(), St->getPointerInfo(),
52381                         St->getOriginalAlign(),
52382                         St->getMemOperand()->getFlags());
52383   }
52384
52385   // Widen v2i1/v4i1 stores to v8i1.
52386   if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
52387       Subtarget.hasAVX512()) {
52388     unsigned NumConcats = 8 / VT.getVectorNumElements();
52389     // We must store zeros to the unused bits.
52390     SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
52391     Ops[0] = StoredVal;
52392     StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
52393     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52394                         St->getPointerInfo(), St->getOriginalAlign(),
52395                         St->getMemOperand()->getFlags());
52396   }
52397
52398   // Turn vXi1 stores of constants into a scalar store.
52399   if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
52400        VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
52401       ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
52402     // If its a v64i1 store without 64-bit support, we need two stores.
52403     if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
52404       SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
52405                                       StoredVal->ops().slice(0, 32));
52406       Lo = combinevXi1ConstantToInteger(Lo, DAG);
52407       SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
52408                                       StoredVal->ops().slice(32, 32));
52409       Hi = combinevXi1ConstantToInteger(Hi, DAG);
52410
52411       SDValue Ptr0 = St->getBasePtr();
52412       SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
52413
52414       SDValue Ch0 =
52415           DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
52416                        St->getOriginalAlign(),
52417                        St->getMemOperand()->getFlags());
52418       SDValue Ch1 =
52419           DAG.getStore(St->getChain(), dl, Hi, Ptr1,
52420                        St->getPointerInfo().getWithOffset(4),
52421                        St->getOriginalAlign(),
52422                        St->getMemOperand()->getFlags());
52423       return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
52424     }
52425
52426     StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
52427     return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
52428                         St->getPointerInfo(), St->getOriginalAlign(),
52429                         St->getMemOperand()->getFlags());
52430   }
52431
52432   // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
52433   // Sandy Bridge, perform two 16-byte stores.
52434   unsigned Fast;
52435   if (VT.is256BitVector() && StVT == VT &&
52436       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
52437                              *St->getMemOperand(), &Fast) &&
52438       !Fast) {
52439     unsigned NumElems = VT.getVectorNumElements();
52440     if (NumElems < 2)
52441       return SDValue();
52442
52443     return splitVectorStore(St, DAG);
52444   }
52445
52446   // Split under-aligned vector non-temporal stores.
52447   if (St->isNonTemporal() && StVT == VT &&
52448       St->getAlign().value() < VT.getStoreSize()) {
52449     // ZMM/YMM nt-stores - either it can be stored as a series of shorter
52450     // vectors or the legalizer can scalarize it to use MOVNTI.
52451     if (VT.is256BitVector() || VT.is512BitVector()) {
52452       unsigned NumElems = VT.getVectorNumElements();
52453       if (NumElems < 2)
52454         return SDValue();
52455       return splitVectorStore(St, DAG);
52456     }
52457
52458     // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
52459     // to use MOVNTI.
52460     if (VT.is128BitVector() && Subtarget.hasSSE2()) {
52461       MVT NTVT = Subtarget.hasSSE4A()
52462                      ? MVT::v2f64
52463                      : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
52464       return scalarizeVectorStore(St, NTVT, DAG);
52465     }
52466   }
52467
52468   // Try to optimize v16i16->v16i8 truncating stores when BWI is not
52469   // supported, but avx512f is by extending to v16i32 and truncating.
52470   if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
52471       St->getValue().getOpcode() == ISD::TRUNCATE &&
52472       St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
52473       TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
52474       St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
52475     SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
52476                               St->getValue().getOperand(0));
52477     return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
52478                              MVT::v16i8, St->getMemOperand());
52479   }
52480
52481   // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
52482   if (!St->isTruncatingStore() &&
52483       (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
52484        StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
52485       StoredVal.hasOneUse() &&
52486       TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
52487     bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
52488     return EmitTruncSStore(IsSigned, St->getChain(),
52489                            dl, StoredVal.getOperand(0), St->getBasePtr(),
52490                            VT, St->getMemOperand(), DAG);
52491   }
52492
52493   // Try to fold a extract_element(VTRUNC) pattern into a truncating store.
52494   if (!St->isTruncatingStore()) {
52495     auto IsExtractedElement = [](SDValue V) {
52496       if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
52497         V = V.getOperand(0);
52498       unsigned Opc = V.getOpcode();
52499       if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
52500           isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
52501           V.getOperand(0).hasOneUse())
52502         return V.getOperand(0);
52503       return SDValue();
52504     };
52505     if (SDValue Extract = IsExtractedElement(StoredVal)) {
52506       SDValue Trunc = peekThroughOneUseBitcasts(Extract);
52507       if (Trunc.getOpcode() == X86ISD::VTRUNC) {
52508         SDValue Src = Trunc.getOperand(0);
52509         MVT DstVT = Trunc.getSimpleValueType();
52510         MVT SrcVT = Src.getSimpleValueType();
52511         unsigned NumSrcElts = SrcVT.getVectorNumElements();
52512         unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
52513         MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
52514         if (NumTruncBits == VT.getSizeInBits() &&
52515             TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
52516           return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
52517                                    TruncVT, St->getMemOperand());
52518         }
52519       }
52520     }
52521   }
52522
52523   // Optimize trunc store (of multiple scalars) to shuffle and store.
52524   // First, pack all of the elements in one place. Next, store to memory
52525   // in fewer chunks.
52526   if (St->isTruncatingStore() && VT.isVector()) {
52527     // Check if we can detect an AVG pattern from the truncation. If yes,
52528     // replace the trunc store by a normal store with the result of X86ISD::AVG
52529     // instruction.
52530     if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
52531       if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
52532                                          Subtarget, dl))
52533         return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
52534                             St->getPointerInfo(), St->getOriginalAlign(),
52535                             St->getMemOperand()->getFlags());
52536
52537     if (TLI.isTruncStoreLegal(VT, StVT)) {
52538       if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
52539         return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
52540                                dl, Val, St->getBasePtr(),
52541                                St->getMemoryVT(), St->getMemOperand(), DAG);
52542       if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
52543                                           DAG, dl))
52544         return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
52545                                dl, Val, St->getBasePtr(),
52546                                St->getMemoryVT(), St->getMemOperand(), DAG);
52547     }
52548
52549     return SDValue();
52550   }
52551
52552   // Cast ptr32 and ptr64 pointers to the default address space before a store.
52553   unsigned AddrSpace = St->getAddressSpace();
52554   if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
52555       AddrSpace == X86AS::PTR32_UPTR) {
52556     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
52557     if (PtrVT != St->getBasePtr().getSimpleValueType()) {
52558       SDValue Cast =
52559           DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
52560       return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
52561                           St->getPointerInfo(), St->getOriginalAlign(),
52562                           St->getMemOperand()->getFlags(), St->getAAInfo());
52563     }
52564   }
52565
52566   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
52567   // the FP state in cases where an emms may be missing.
52568   // A preferable solution to the general problem is to figure out the right
52569   // places to insert EMMS.  This qualifies as a quick hack.
52570
52571   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
52572   if (VT.getSizeInBits() != 64)
52573     return SDValue();
52574
52575   const Function &F = DAG.getMachineFunction().getFunction();
52576   bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
52577   bool F64IsLegal =
52578       !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
52579   if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
52580       isa<LoadSDNode>(St->getValue()) &&
52581       cast<LoadSDNode>(St->getValue())->isSimple() &&
52582       St->getChain().hasOneUse() && St->isSimple()) {
52583     LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
52584
52585     if (!ISD::isNormalLoad(Ld))
52586       return SDValue();
52587
52588     // Avoid the transformation if there are multiple uses of the loaded value.
52589     if (!Ld->hasNUsesOfValue(1, 0))
52590       return SDValue();
52591
52592     SDLoc LdDL(Ld);
52593     SDLoc StDL(N);
52594     // Lower to a single movq load/store pair.
52595     SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
52596                                 Ld->getBasePtr(), Ld->getMemOperand());
52597
52598     // Make sure new load is placed in same chain order.
52599     DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
52600     return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
52601                         St->getMemOperand());
52602   }
52603
52604   // This is similar to the above case, but here we handle a scalar 64-bit
52605   // integer store that is extracted from a vector on a 32-bit target.
52606   // If we have SSE2, then we can treat it like a floating-point double
52607   // to get past legalization. The execution dependencies fixup pass will
52608   // choose the optimal machine instruction for the store if this really is
52609   // an integer or v2f32 rather than an f64.
52610   if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
52611       St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
52612     SDValue OldExtract = St->getOperand(1);
52613     SDValue ExtOp0 = OldExtract.getOperand(0);
52614     unsigned VecSize = ExtOp0.getValueSizeInBits();
52615     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
52616     SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
52617     SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
52618                                      BitCast, OldExtract.getOperand(1));
52619     return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
52620                         St->getPointerInfo(), St->getOriginalAlign(),
52621                         St->getMemOperand()->getFlags());
52622   }
52623
52624   return SDValue();
52625 }
52626
52627 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
52628                                      TargetLowering::DAGCombinerInfo &DCI,
52629                                      const X86Subtarget &Subtarget) {
52630   auto *St = cast<MemIntrinsicSDNode>(N);
52631
52632   SDValue StoredVal = N->getOperand(1);
52633   MVT VT = StoredVal.getSimpleValueType();
52634   EVT MemVT = St->getMemoryVT();
52635
52636   // Figure out which elements we demand.
52637   unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
52638   APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
52639
52640   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52641   if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
52642     if (N->getOpcode() != ISD::DELETED_NODE)
52643       DCI.AddToWorklist(N);
52644     return SDValue(N, 0);
52645   }
52646
52647   return SDValue();
52648 }
52649
52650 /// Return 'true' if this vector operation is "horizontal"
52651 /// and return the operands for the horizontal operation in LHS and RHS.  A
52652 /// horizontal operation performs the binary operation on successive elements
52653 /// of its first operand, then on successive elements of its second operand,
52654 /// returning the resulting values in a vector.  For example, if
52655 ///   A = < float a0, float a1, float a2, float a3 >
52656 /// and
52657 ///   B = < float b0, float b1, float b2, float b3 >
52658 /// then the result of doing a horizontal operation on A and B is
52659 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
52660 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
52661 /// A horizontal-op B, for some already available A and B, and if so then LHS is
52662 /// set to A, RHS to B, and the routine returns 'true'.
      ///
      /// \param HOpcode  The horizontal opcode being formed. It is only used to
      ///        look for existing users of the shuffle sources that already have
      ///        this opcode and result type; if both sources have one, the match
      ///        is accepted without consulting shouldUseHorizontalOp.
      /// \param LHS,RHS  In/out operands of the binop. They are only written on
      ///        a 'true' return, where they become the shuffle sources bitcast
      ///        to the operand type.
      /// \param IsCommutative  If set, a pair may also be matched with the odd
      ///        element on the LHS and the even element on the RHS.
      /// \param PostShuffleMask  Out. Shuffle to apply to the horizontal-op
      ///        result; left empty on success when no such shuffle is needed.
      ///        Its contents are only meaningful when 'true' is returned.
52663 static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
52664                               SelectionDAG &DAG, const X86Subtarget &Subtarget,
52665                               bool IsCommutative,
52666                               SmallVectorImpl<int> &PostShuffleMask) {
52667   // If either operand is undef, bail out. The binop should be simplified.
52668   if (LHS.isUndef() || RHS.isUndef())
52669     return false;
52670
52671   // Look for the following pattern:
52672   //   A = < float a0, float a1, float a2, float a3 >
52673   //   B = < float b0, float b1, float b2, float b3 >
52674   // and
52675   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
52676   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
52677   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
52678   // which is A horizontal-op B.
52679
52680   MVT VT = LHS.getSimpleValueType();
52681   assert((VT.is128BitVector() || VT.is256BitVector()) &&
52682          "Unsupported vector type for horizontal add/sub");
52683   unsigned NumElts = VT.getVectorNumElements();
52684
        // Try to view Op as a shuffle of up to two inputs (N0, N1) with a mask of
        // NumElts entries. If Op cannot be decoded that way, N0/N1/ShuffleMask are
        // left untouched, so an empty ShuffleMask tells the caller "not a shuffle".
52685   auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
52686                         SmallVectorImpl<int> &ShuffleMask) {
          // Look through an extract of the index-0 subvector of a 256-bit vector
          // and decode the shuffle on the wide source instead.
52687     bool UseSubVector = false;
52688     if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
52689         Op.getOperand(0).getValueType().is256BitVector() &&
52690         llvm::isNullConstant(Op.getOperand(1))) {
52691       Op = Op.getOperand(0);
52692       UseSubVector = true;
52693     }
52694     SmallVector<SDValue, 2> SrcOps;
52695     SmallVector<int, 16> SrcMask, ScaledMask;
52696     SDValue BC = peekThroughBitcasts(Op);
          // Only accept shuffles without zeroed elements whose inputs all have the
          // same bit width as the shuffle itself.
52697     if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
52698         !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
52699           return Op.getValueSizeInBits() == BC.getValueSizeInBits();
52700         })) {
52701       resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
            // Direct case: at most two inputs, and the decoded mask must be
            // expressible with NumElts entries.
52702       if (!UseSubVector && SrcOps.size() <= 2 &&
52703           scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
52704         N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
52705         N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
52706         ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
52707       }
            // Subvector case: a single wide input is split into two halves that
            // act as the two shuffle inputs; only the first NumElts entries of
            // the (2 * NumElts)-entry mask are kept.
52708       if (UseSubVector && SrcOps.size() == 1 &&
52709           scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
52710         std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
52711         ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
52712         ShuffleMask.assign(Mask.begin(), Mask.end());
52713       }
52714     }
52715   };
52716
52717   // View LHS in the form
52718   //   LHS = VECTOR_SHUFFLE A, B, LMask
52719   // If LHS is not a shuffle, then pretend it is the identity shuffle:
52720   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
52721   // NOTE: A default initialized SDValue represents an UNDEF of type VT.
52722   SDValue A, B;
52723   SmallVector<int, 16> LMask;
52724   GetShuffle(LHS, A, B, LMask);
52725
52726   // Likewise, view RHS in the form
52727   //   RHS = VECTOR_SHUFFLE C, D, RMask
52728   SDValue C, D;
52729   SmallVector<int, 16> RMask;
52730   GetShuffle(RHS, C, D, RMask);
52731
52732   // At least one of the operands should be a vector shuffle.
52733   unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
52734   if (NumShuffles == 0)
52735     return false;
52736
        // An operand that was not decoded as a shuffle is modelled as the identity
        // shuffle of itself.
52737   if (LMask.empty()) {
52738     A = LHS;
52739     for (unsigned i = 0; i != NumElts; ++i)
52740       LMask.push_back(i);
52741   }
52742
52743   if (RMask.empty()) {
52744     C = RHS;
52745     for (unsigned i = 0; i != NumElts; ++i)
52746       RMask.push_back(i);
52747   }
52748
52749   // If we have an unary mask, ensure the other op is set to null.
52750   if (isUndefOrInRange(LMask, 0, NumElts))
52751     B = SDValue();
52752   else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
52753     A = SDValue();
52754
52755   if (isUndefOrInRange(RMask, 0, NumElts))
52756     D = SDValue();
52757   else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
52758     C = SDValue();
52759
52760   // If A and B occur in reverse order in RHS, then canonicalize by commuting
52761   // RHS operands and shuffle mask.
52762   if (A != C) {
52763     std::swap(C, D);
52764     ShuffleVectorSDNode::commuteMask(RMask);
52765   }
52766   // Check that the shuffles are both shuffling the same vectors.
52767   if (!(A == C && B == D))
52768     return false;
52769
        // Start from an all-undef post-shuffle mask; entries are filled in below
        // for every element pair that is actually checked.
52770   PostShuffleMask.clear();
52771   PostShuffleMask.append(NumElts, SM_SentinelUndef);
52772
52773   // LHS and RHS are now:
52774   //   LHS = shuffle A, B, LMask
52775   //   RHS = shuffle A, B, RMask
52776   // Check that the masks correspond to performing a horizontal operation.
52777   // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
52778   // so we just repeat the inner loop if this is a 256-bit op.
52779   unsigned Num128BitChunks = VT.getSizeInBits() / 128;
52780   unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
52781   unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
52782   assert((NumEltsPer128BitChunk % 2 == 0) &&
52783          "Vector type should have an even number of elements in each lane");
52784   for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
52785     for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
52786       // Ignore undefined components.
52787       int LIdx = LMask[i + j], RIdx = RMask[i + j];
52788       if (LIdx < 0 || RIdx < 0 ||
52789           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
52790           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
52791         continue;
52792
52793       // Check that successive odd/even elements are being operated on. If not,
52794       // this is not a horizontal operation.
52795       if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
52796           !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
52797         return false;
52798
52799       // Compute the post-shuffle mask index based on where the element
52800       // is stored in the HOP result, and where it needs to be moved to.
            // Base is LIdx with its low bit cleared, i.e. the even index of the pair.
52801       int Base = LIdx & ~1u;
52802       int Index = ((Base % NumEltsPer128BitChunk) / 2) +
52803                   ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
52804
52805       // The  low half of the 128-bit result must choose from A.
52806       // The high half of the 128-bit result must choose from B,
52807       // unless B is undef. In that case, we are always choosing from A.
52808       if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
52809         Index += NumEltsPer64BitChunk;
52810       PostShuffleMask[i + j] = Index;
52811     }
52812   }
52813
52814   SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
52815   SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
52816
        // An in-order (or undef) post-shuffle mask means no shuffle is needed;
        // report that to the caller as an empty mask.
52817   bool IsIdentityPostShuffle =
52818       isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
52819   if (IsIdentityPostShuffle)
52820     PostShuffleMask.clear();
52821
52822   // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
52823   if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
52824       isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
52825     return false;
52826
52827   // If the source nodes are already used in HorizOps then always accept this.
52828   // Shuffle folding should merge these back together.
52829   bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
52830     return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52831   });
52832   bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
52833     return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52834   });
52835   bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
52836
52837   // Assume a SingleSource HOP if we only shuffle one input and don't need to
52838   // shuffle the result.
52839   if (!ForceHorizOp &&
52840       !shouldUseHorizontalOp(NewLHS == NewRHS &&
52841                                  (NumShuffles < 2 || !IsIdentityPostShuffle),
52842                              DAG, Subtarget))
52843     return false;
52844
        // Success: hand the shuffle sources back to the caller, cast to VT.
52845   LHS = DAG.getBitcast(VT, NewLHS);
52846   RHS = DAG.getBitcast(VT, NewRHS);
52847   return true;
52848 }
52849
52850 // Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
52851 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
52852                                          const X86Subtarget &Subtarget) {
52853   EVT VT = N->getValueType(0);
52854   unsigned Opcode = N->getOpcode();
52855   bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
52856   SmallVector<int, 8> PostShuffleMask;
52857
52858   switch (Opcode) {
52859   case ISD::FADD:
52860   case ISD::FSUB:
52861     if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
52862         (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
52863       SDValue LHS = N->getOperand(0);
52864       SDValue RHS = N->getOperand(1);
52865       auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
52866       if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52867                             PostShuffleMask)) {
52868         SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
52869         if (!PostShuffleMask.empty())
52870           HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52871                                             DAG.getUNDEF(VT), PostShuffleMask);
52872         return HorizBinOp;
52873       }
52874     }
52875     break;
52876   case ISD::ADD:
52877   case ISD::SUB:
52878     if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
52879                                  VT == MVT::v16i16 || VT == MVT::v8i32)) {
52880       SDValue LHS = N->getOperand(0);
52881       SDValue RHS = N->getOperand(1);
52882       auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
52883       if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52884                             PostShuffleMask)) {
52885         auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
52886                                         ArrayRef<SDValue> Ops) {
52887           return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
52888         };
52889         SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
52890                                               {LHS, RHS}, HOpBuilder);
52891         if (!PostShuffleMask.empty())
52892           HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52893                                             DAG.getUNDEF(VT), PostShuffleMask);
52894         return HorizBinOp;
52895       }
52896     }
52897     break;
52898   }
52899
52900   return SDValue();
52901 }
52902
52903 //  Try to combine the following nodes
52904 //  t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
52905 //    <i32 -2147483648[float -0.000000e+00]> 0
52906 //  t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
52907 //    <(load 4 from constant-pool)> t0, t29
52908 //  [t30: v16i32 = bitcast t27]
52909 //  t6: v16i32 = xor t7, t27[t30]
52910 //  t11: v16f32 = bitcast t6
52911 //  t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
52912 //  into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
52913 //  t22: v16f32 = bitcast t7
52914 //  t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
52915 //  t24: v32f16 = bitcast t23
52916 static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
52917                                   const X86Subtarget &Subtarget) {
52918   EVT VT = N->getValueType(0);
52919   SDValue LHS = N->getOperand(0);
52920   SDValue RHS = N->getOperand(1);
52921   int CombineOpcode =
52922       N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
52923   auto isConjugationConstant = [](const Constant *c) {
52924     if (const auto *CI = dyn_cast<ConstantInt>(c)) {
52925       APInt ConjugationInt32 = APInt(32, 0x80000000, true);
52926       APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
52927       switch (CI->getBitWidth()) {
52928       case 16:
52929         return false;
52930       case 32:
52931         return CI->getValue() == ConjugationInt32;
52932       case 64:
52933         return CI->getValue() == ConjugationInt64;
52934       default:
52935         llvm_unreachable("Unexpected bit width");
52936       }
52937     }
52938     if (const auto *CF = dyn_cast<ConstantFP>(c))
52939       return CF->isNegativeZeroValue();
52940     return false;
52941   };
52942   auto combineConjugation = [&](SDValue &r) {
52943     if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
52944       SDValue XOR = LHS.getOperand(0);
52945       if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
52946         SDValue XORRHS = XOR.getOperand(1);
52947         if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
52948           XORRHS = XORRHS.getOperand(0);
52949         if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
52950             XORRHS.getOperand(1).getNumOperands()) {
52951           ConstantPoolSDNode *CP =
52952               dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
52953           if (CP && isConjugationConstant(CP->getConstVal())) {
52954             SelectionDAG::FlagInserter FlagsInserter(DAG, N);
52955             SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
52956             SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
52957             r = DAG.getBitcast(VT, FCMulC);
52958             return true;
52959           }
52960         }
52961       }
52962     }
52963     return false;
52964   };
52965   SDValue Res;
52966   if (combineConjugation(Res))
52967     return Res;
52968   std::swap(LHS, RHS);
52969   if (combineConjugation(Res))
52970     return Res;
52971   return Res;
52972 }
52973
52974 //  Try to combine the following nodes:
52975 //  FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
52976 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
52977                                 const X86Subtarget &Subtarget) {
52978   auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
52979     return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
52980            Flags.hasAllowContract();
52981   };
52982
52983   auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
52984     return DAG.getTarget().Options.NoSignedZerosFPMath ||
52985            Flags.hasNoSignedZeros();
52986   };
52987   auto IsVectorAllNegativeZero = [](const SDNode *N) {
52988     if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
52989       return false;
52990     assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
52991            "Unexpected vector type!");
52992     if (ConstantPoolSDNode *CP =
52993             dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
52994       APInt AI = APInt(32, 0x80008000, true);
52995       if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
52996         return CI->getValue() == AI;
52997       if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
52998         return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
52999     }
53000     return false;
53001   };
53002
53003   if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
53004       !AllowContract(N->getFlags()))
53005     return SDValue();
53006
53007   EVT VT = N->getValueType(0);
53008   if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
53009     return SDValue();
53010
53011   SDValue LHS = N->getOperand(0);
53012   SDValue RHS = N->getOperand(1);
53013   bool IsConj;
53014   SDValue FAddOp1, MulOp0, MulOp1;
53015   auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
53016                        &IsVectorAllNegativeZero,
53017                        &HasNoSignedZero](SDValue N) -> bool {
53018     if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
53019       return false;
53020     SDValue Op0 = N.getOperand(0);
53021     unsigned Opcode = Op0.getOpcode();
53022     if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
53023       if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
53024         MulOp0 = Op0.getOperand(0);
53025         MulOp1 = Op0.getOperand(1);
53026         IsConj = Opcode == X86ISD::VFCMULC;
53027         return true;
53028       }
53029       if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
53030           ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
53031             HasNoSignedZero(Op0->getFlags())) ||
53032            IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
53033         MulOp0 = Op0.getOperand(0);
53034         MulOp1 = Op0.getOperand(1);
53035         IsConj = Opcode == X86ISD::VFCMADDC;
53036         return true;
53037       }
53038     }
53039     return false;
53040   };
53041
53042   if (GetCFmulFrom(LHS))
53043     FAddOp1 = RHS;
53044   else if (GetCFmulFrom(RHS))
53045     FAddOp1 = LHS;
53046   else
53047     return SDValue();
53048
53049   MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
53050   FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
53051   unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
53052   // FIXME: How do we handle when fast math flags of FADD are different from
53053   // CFMUL's?
53054   SDValue CFmul =
53055       DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
53056   return DAG.getBitcast(VT, CFmul);
53057 }
53058
53059 /// Do target-specific dag combines on floating-point adds/subs.
53060 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
53061                                const X86Subtarget &Subtarget) {
53062   if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
53063     return HOp;
53064
53065   if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
53066     return COp;
53067
53068   return SDValue();
53069 }
53070
53071 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
53072 /// the codegen.
53073 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
53074 /// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
53075 ///       anything that is guaranteed to be transformed by DAGCombiner.
53076 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
53077                                           const X86Subtarget &Subtarget,
53078                                           const SDLoc &DL) {
53079   assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
53080   SDValue Src = N->getOperand(0);
53081   unsigned SrcOpcode = Src.getOpcode();
53082   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53083
53084   EVT VT = N->getValueType(0);
53085   EVT SrcVT = Src.getValueType();
53086
53087   auto IsFreeTruncation = [VT](SDValue Op) {
53088     unsigned TruncSizeInBits = VT.getScalarSizeInBits();
53089
53090     // See if this has been extended from a smaller/equal size to
53091     // the truncation size, allowing a truncation to combine with the extend.
53092     unsigned Opcode = Op.getOpcode();
53093     if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
53094          Opcode == ISD::ZERO_EXTEND) &&
53095         Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
53096       return true;
53097
53098     // See if this is a single use constant which can be constant folded.
53099     // NOTE: We don't peek throught bitcasts here because there is currently
53100     // no support for constant folding truncate+bitcast+vector_of_constants. So
53101     // we'll just send up with a truncate on both operands which will
53102     // get turned back into (truncate (binop)) causing an infinite loop.
53103     return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
53104   };
53105
53106   auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
53107     SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
53108     SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
53109     return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
53110   };
53111
53112   // Don't combine if the operation has other uses.
53113   if (!Src.hasOneUse())
53114     return SDValue();
53115
53116   // Only support vector truncation for now.
53117   // TODO: i64 scalar math would benefit as well.
53118   if (!VT.isVector())
53119     return SDValue();
53120
53121   // In most cases its only worth pre-truncating if we're only facing the cost
53122   // of one truncation.
53123   // i.e. if one of the inputs will constant fold or the input is repeated.
53124   switch (SrcOpcode) {
53125   case ISD::MUL:
53126     // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
53127     // better to truncate if we have the chance.
53128     if (SrcVT.getScalarType() == MVT::i64 &&
53129         TLI.isOperationLegal(SrcOpcode, VT) &&
53130         !TLI.isOperationLegal(SrcOpcode, SrcVT))
53131       return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
53132     [[fallthrough]];
53133   case ISD::AND:
53134   case ISD::XOR:
53135   case ISD::OR:
53136   case ISD::ADD:
53137   case ISD::SUB: {
53138     SDValue Op0 = Src.getOperand(0);
53139     SDValue Op1 = Src.getOperand(1);
53140     if (TLI.isOperationLegal(SrcOpcode, VT) &&
53141         (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
53142       return TruncateArithmetic(Op0, Op1);
53143     break;
53144   }
53145   }
53146
53147   return SDValue();
53148 }
53149
53150 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
53151 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
53152 /// legalization the truncation will be translated into a BUILD_VECTOR with each
53153 /// element that is extracted from a vector and then truncated, and it is
53154 /// difficult to do this optimization based on them.
53155 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
53156                                        const X86Subtarget &Subtarget) {
53157   EVT OutVT = N->getValueType(0);
53158   if (!OutVT.isVector())
53159     return SDValue();
53160
53161   SDValue In = N->getOperand(0);
53162   if (!In.getValueType().isSimple())
53163     return SDValue();
53164
53165   EVT InVT = In.getValueType();
53166   unsigned NumElems = OutVT.getVectorNumElements();
53167
53168   // AVX512 provides fast truncate ops.
53169   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
53170     return SDValue();
53171
53172   EVT OutSVT = OutVT.getVectorElementType();
53173   EVT InSVT = InVT.getVectorElementType();
53174   if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
53175         (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
53176         NumElems >= 8))
53177     return SDValue();
53178
53179   // SSSE3's pshufb results in less instructions in the cases below.
53180   if (Subtarget.hasSSSE3() && NumElems == 8) {
53181     if (InSVT == MVT::i16)
53182       return SDValue();
53183     if (InSVT == MVT::i32 &&
53184         (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
53185       return SDValue();
53186   }
53187
53188   SDLoc DL(N);
53189   // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
53190   // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
53191   // truncate 2 x v4i32 to v8i16.
53192   if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
53193     return truncateVectorWithPACKUS(OutVT, In, DL, Subtarget, DAG);
53194   if (InSVT == MVT::i32)
53195     return truncateVectorWithPACKSS(OutVT, In, DL, Subtarget, DAG);
53196
53197   return SDValue();
53198 }
53199
53200 /// This function transforms vector truncation of 'extended sign-bits' or
53201 /// 'extended zero-bits' values.
53202 /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
53203 static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
53204                                                SelectionDAG &DAG,
53205                                                const X86Subtarget &Subtarget) {
53206   // Requires SSE2.
53207   if (!Subtarget.hasSSE2())
53208     return SDValue();
53209
53210   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
53211     return SDValue();
53212
53213   SDValue In = N->getOperand(0);
53214   if (!In.getValueType().isSimple())
53215     return SDValue();
53216
53217   MVT VT = N->getValueType(0).getSimpleVT();
53218   MVT SVT = VT.getScalarType();
53219
53220   MVT InVT = In.getValueType().getSimpleVT();
53221   MVT InSVT = InVT.getScalarType();
53222
53223   // Check we have a truncation suited for PACKSS/PACKUS.
53224   if (!isPowerOf2_32(VT.getVectorNumElements()))
53225     return SDValue();
53226   if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
53227     return SDValue();
53228   if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
53229     return SDValue();
53230
53231   // Truncation to sub-128bit vXi32 can be better handled with shuffles.
53232   if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
53233     return SDValue();
53234
53235   // AVX512 has fast truncate, but if the input is already going to be split,
53236   // there's no harm in trying pack.
53237   if (Subtarget.hasAVX512() &&
53238       !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
53239         InVT.is512BitVector())) {
53240     // PACK should still be worth it for 128-bit vectors if the sources were
53241     // originally concatenated from subvectors.
53242     if (VT.getSizeInBits() > 128 || !isFreeToSplitVector(In.getNode(), DAG))
53243       return SDValue();
53244   }
53245
53246   unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
53247   unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
53248
53249   // Use PACKUS if the input has zero-bits that extend all the way to the
53250   // packed/truncated value. e.g. masks, zext_in_reg, etc.
53251   KnownBits Known = DAG.computeKnownBits(In);
53252   unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
53253   if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
53254     return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
53255
53256   // Use PACKSS if the input has sign-bits that extend all the way to the
53257   // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
53258   unsigned NumSignBits = DAG.ComputeNumSignBits(In);
53259
53260   // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
53261   // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
53262   // on and combines/simplifications can't then use it.
53263   if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
53264     return SDValue();
53265
53266   unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
53267   if (NumSignBits > MinSignBits)
53268     return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
53269
53270   // If we have a srl that only generates signbits that we will discard in
53271   // the truncation then we can use PACKSS by converting the srl to a sra.
53272   // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
53273   if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
53274     if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
53275             In, APInt::getAllOnes(VT.getVectorNumElements()))) {
53276       if (*ShAmt == MinSignBits) {
53277         SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
53278         return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
53279                                       Subtarget);
53280       }
53281     }
53282
53283   return SDValue();
53284 }
53285
53286 // Try to form a MULHU or MULHS node by looking for
53287 // (trunc (srl (mul ext, ext), 16))
53288 // TODO: This is X86 specific because we want to be able to handle wide types
53289 // before type legalization. But we can only do it if the vector will be
53290 // legalized via widening/splitting. Type legalization can't handle promotion
53291 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
53292 // combiner.
53293 static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
53294                             SelectionDAG &DAG, const X86Subtarget &Subtarget) {
53295   // First instruction should be a right shift of a multiply.
53296   if (Src.getOpcode() != ISD::SRL ||
53297       Src.getOperand(0).getOpcode() != ISD::MUL)
53298     return SDValue();
53299
53300   if (!Subtarget.hasSSE2())
53301     return SDValue();
53302
53303   // Only handle vXi16 types that are at least 128-bits unless they will be
53304   // widened.
53305   if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
53306     return SDValue();
53307
53308   // Input type should be at least vXi32.
53309   EVT InVT = Src.getValueType();
53310   if (InVT.getVectorElementType().getSizeInBits() < 32)
53311     return SDValue();
53312
53313   // Need a shift by 16.
53314   APInt ShiftAmt;
53315   if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
53316       ShiftAmt != 16)
53317     return SDValue();
53318
53319   SDValue LHS = Src.getOperand(0).getOperand(0);
53320   SDValue RHS = Src.getOperand(0).getOperand(1);
53321
53322   // Count leading sign/zero bits on both inputs - if there are enough then
53323   // truncation back to vXi16 will be cheap - either as a pack/shuffle
53324   // sequence or using AVX512 truncations. If the inputs are sext/zext then the
53325   // truncations may actually be free by peeking through to the ext source.
53326   auto IsSext = [&DAG](SDValue V) {
53327     return DAG.ComputeMaxSignificantBits(V) <= 16;
53328   };
53329   auto IsZext = [&DAG](SDValue V) {
53330     return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
53331   };
53332
53333   bool IsSigned = IsSext(LHS) && IsSext(RHS);
53334   bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
53335   if (!IsSigned && !IsUnsigned)
53336     return SDValue();
53337
53338   // Check if both inputs are extensions, which will be removed by truncation.
53339   bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
53340                          LHS.getOpcode() == ISD::ZERO_EXTEND) &&
53341                         (RHS.getOpcode() == ISD::SIGN_EXTEND ||
53342                          RHS.getOpcode() == ISD::ZERO_EXTEND) &&
53343                         LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
53344                         RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
53345
53346   // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
53347   // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
53348   // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
53349   // will have to split anyway.
53350   unsigned InSizeInBits = InVT.getSizeInBits();
53351   if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
53352       !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
53353       (InSizeInBits % 16) == 0) {
53354     EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53355                                 InVT.getSizeInBits() / 16);
53356     SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
53357                               DAG.getBitcast(BCVT, RHS));
53358     return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
53359   }
53360
53361   // Truncate back to source type.
53362   LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
53363   RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
53364
53365   unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
53366   return DAG.getNode(Opc, DL, VT, LHS, RHS);
53367 }
53368
53369 // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
53370 // from one vector with signed bytes from another vector, adds together
53371 // adjacent pairs of 16-bit products, and saturates the result before
53372 // truncating to 16-bits.
53373 //
53374 // Which looks something like this:
53375 // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
53376 //                 (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
53377 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
53378                                const X86Subtarget &Subtarget,
53379                                const SDLoc &DL) {
53380   if (!VT.isVector() || !Subtarget.hasSSSE3())
53381     return SDValue();
53382
53383   unsigned NumElems = VT.getVectorNumElements();
53384   EVT ScalarVT = VT.getVectorElementType();
53385   if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
53386     return SDValue();
53387
53388   SDValue SSatVal = detectSSatPattern(In, VT);
53389   if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
53390     return SDValue();
53391
53392   // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
53393   // of multiplies from even/odd elements.
53394   SDValue N0 = SSatVal.getOperand(0);
53395   SDValue N1 = SSatVal.getOperand(1);
53396
53397   if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
53398     return SDValue();
53399
53400   SDValue N00 = N0.getOperand(0);
53401   SDValue N01 = N0.getOperand(1);
53402   SDValue N10 = N1.getOperand(0);
53403   SDValue N11 = N1.getOperand(1);
53404
53405   // TODO: Handle constant vectors and use knownbits/computenumsignbits?
53406   // Canonicalize zero_extend to LHS.
53407   if (N01.getOpcode() == ISD::ZERO_EXTEND)
53408     std::swap(N00, N01);
53409   if (N11.getOpcode() == ISD::ZERO_EXTEND)
53410     std::swap(N10, N11);
53411
53412   // Ensure we have a zero_extend and a sign_extend.
53413   if (N00.getOpcode() != ISD::ZERO_EXTEND ||
53414       N01.getOpcode() != ISD::SIGN_EXTEND ||
53415       N10.getOpcode() != ISD::ZERO_EXTEND ||
53416       N11.getOpcode() != ISD::SIGN_EXTEND)
53417     return SDValue();
53418
53419   // Peek through the extends.
53420   N00 = N00.getOperand(0);
53421   N01 = N01.getOperand(0);
53422   N10 = N10.getOperand(0);
53423   N11 = N11.getOperand(0);
53424
53425   // Ensure the extend is from vXi8.
53426   if (N00.getValueType().getVectorElementType() != MVT::i8 ||
53427       N01.getValueType().getVectorElementType() != MVT::i8 ||
53428       N10.getValueType().getVectorElementType() != MVT::i8 ||
53429       N11.getValueType().getVectorElementType() != MVT::i8)
53430     return SDValue();
53431
53432   // All inputs should be build_vectors.
53433   if (N00.getOpcode() != ISD::BUILD_VECTOR ||
53434       N01.getOpcode() != ISD::BUILD_VECTOR ||
53435       N10.getOpcode() != ISD::BUILD_VECTOR ||
53436       N11.getOpcode() != ISD::BUILD_VECTOR)
53437     return SDValue();
53438
53439   // N00/N10 are zero extended. N01/N11 are sign extended.
53440
53441   // For each element, we need to ensure we have an odd element from one vector
53442   // multiplied by the odd element of another vector and the even element from
53443   // one of the same vectors being multiplied by the even element from the
53444   // other vector. So we need to make sure for each element i, this operator
53445   // is being performed:
53446   //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
53447   SDValue ZExtIn, SExtIn;
53448   for (unsigned i = 0; i != NumElems; ++i) {
53449     SDValue N00Elt = N00.getOperand(i);
53450     SDValue N01Elt = N01.getOperand(i);
53451     SDValue N10Elt = N10.getOperand(i);
53452     SDValue N11Elt = N11.getOperand(i);
53453     // TODO: Be more tolerant to undefs.
53454     if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53455         N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53456         N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53457         N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
53458       return SDValue();
53459     auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
53460     auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
53461     auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
53462     auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
53463     if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
53464       return SDValue();
53465     unsigned IdxN00 = ConstN00Elt->getZExtValue();
53466     unsigned IdxN01 = ConstN01Elt->getZExtValue();
53467     unsigned IdxN10 = ConstN10Elt->getZExtValue();
53468     unsigned IdxN11 = ConstN11Elt->getZExtValue();
53469     // Add is commutative so indices can be reordered.
53470     if (IdxN00 > IdxN10) {
53471       std::swap(IdxN00, IdxN10);
53472       std::swap(IdxN01, IdxN11);
53473     }
53474     // N0 indices be the even element. N1 indices must be the next odd element.
53475     if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
53476         IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
53477       return SDValue();
53478     SDValue N00In = N00Elt.getOperand(0);
53479     SDValue N01In = N01Elt.getOperand(0);
53480     SDValue N10In = N10Elt.getOperand(0);
53481     SDValue N11In = N11Elt.getOperand(0);
53482     // First time we find an input capture it.
53483     if (!ZExtIn) {
53484       ZExtIn = N00In;
53485       SExtIn = N01In;
53486     }
53487     if (ZExtIn != N00In || SExtIn != N01In ||
53488         ZExtIn != N10In || SExtIn != N11In)
53489       return SDValue();
53490   }
53491
53492   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
53493                          ArrayRef<SDValue> Ops) {
53494     // Shrink by adding truncate nodes and let DAGCombine fold with the
53495     // sources.
53496     EVT InVT = Ops[0].getValueType();
53497     assert(InVT.getScalarType() == MVT::i8 &&
53498            "Unexpected scalar element type");
53499     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
53500     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53501                                  InVT.getVectorNumElements() / 2);
53502     return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
53503   };
53504   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
53505                           PMADDBuilder);
53506 }
53507
53508 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
53509                                const X86Subtarget &Subtarget) {
53510   EVT VT = N->getValueType(0);
53511   SDValue Src = N->getOperand(0);
53512   SDLoc DL(N);
53513
53514   // Attempt to pre-truncate inputs to arithmetic ops instead.
53515   if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
53516     return V;
53517
53518   // Try to detect AVG pattern first.
53519   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
53520     return Avg;
53521
53522   // Try to detect PMADD
53523   if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
53524     return PMAdd;
53525
53526   // Try to combine truncation with signed/unsigned saturation.
53527   if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
53528     return Val;
53529
53530   // Try to combine PMULHUW/PMULHW for vXi16.
53531   if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
53532     return V;
53533
53534   // The bitcast source is a direct mmx result.
53535   // Detect bitcasts between i32 to x86mmx
53536   if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
53537     SDValue BCSrc = Src.getOperand(0);
53538     if (BCSrc.getValueType() == MVT::x86mmx)
53539       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
53540   }
53541
53542   // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
53543   if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
53544     return V;
53545
53546   return combineVectorTruncation(N, DAG, Subtarget);
53547 }
53548
53549 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
53550                              TargetLowering::DAGCombinerInfo &DCI) {
53551   EVT VT = N->getValueType(0);
53552   SDValue In = N->getOperand(0);
53553   SDLoc DL(N);
53554
53555   if (SDValue SSatVal = detectSSatPattern(In, VT))
53556     return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
53557   if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
53558     return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
53559
53560   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53561   APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
53562   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53563     return SDValue(N, 0);
53564
53565   return SDValue();
53566 }
53567
53568 /// Returns the negated value if the node \p N flips sign of FP value.
53569 ///
53570 /// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
53571 /// or FSUB(0, x)
53572 /// AVX512F does not have FXOR, so FNEG is lowered as
53573 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
53574 /// In this case we go though all bitcasts.
53575 /// This also recognizes splat of a negated value and returns the splat of that
53576 /// value.
53577 static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
53578   if (N->getOpcode() == ISD::FNEG)
53579     return N->getOperand(0);
53580
53581   // Don't recurse exponentially.
53582   if (Depth > SelectionDAG::MaxRecursionDepth)
53583     return SDValue();
53584
53585   unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
53586
53587   SDValue Op = peekThroughBitcasts(SDValue(N, 0));
53588   EVT VT = Op->getValueType(0);
53589
53590   // Make sure the element size doesn't change.
53591   if (VT.getScalarSizeInBits() != ScalarSize)
53592     return SDValue();
53593
53594   unsigned Opc = Op.getOpcode();
53595   switch (Opc) {
53596   case ISD::VECTOR_SHUFFLE: {
53597     // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
53598     // of this is VECTOR_SHUFFLE(-VEC1, UNDEF).  The mask can be anything here.
53599     if (!Op.getOperand(1).isUndef())
53600       return SDValue();
53601     if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
53602       if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
53603         return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
53604                                     cast<ShuffleVectorSDNode>(Op)->getMask());
53605     break;
53606   }
53607   case ISD::INSERT_VECTOR_ELT: {
53608     // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
53609     // -V, INDEX).
53610     SDValue InsVector = Op.getOperand(0);
53611     SDValue InsVal = Op.getOperand(1);
53612     if (!InsVector.isUndef())
53613       return SDValue();
53614     if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
53615       if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
53616         return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
53617                            NegInsVal, Op.getOperand(2));
53618     break;
53619   }
53620   case ISD::FSUB:
53621   case ISD::XOR:
53622   case X86ISD::FXOR: {
53623     SDValue Op1 = Op.getOperand(1);
53624     SDValue Op0 = Op.getOperand(0);
53625
53626     // For XOR and FXOR, we want to check if constant
53627     // bits of Op1 are sign bit masks. For FSUB, we
53628     // have to check if constant bits of Op0 are sign
53629     // bit masks and hence we swap the operands.
53630     if (Opc == ISD::FSUB)
53631       std::swap(Op0, Op1);
53632
53633     APInt UndefElts;
53634     SmallVector<APInt, 16> EltBits;
53635     // Extract constant bits and see if they are all
53636     // sign bit masks. Ignore the undef elements.
53637     if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
53638                                       /* AllowWholeUndefs */ true,
53639                                       /* AllowPartialUndefs */ false)) {
53640       for (unsigned I = 0, E = EltBits.size(); I < E; I++)
53641         if (!UndefElts[I] && !EltBits[I].isSignMask())
53642           return SDValue();
53643
53644       // Only allow bitcast from correctly-sized constant.
53645       Op0 = peekThroughBitcasts(Op0);
53646       if (Op0.getScalarValueSizeInBits() == ScalarSize)
53647         return Op0;
53648     }
53649     break;
53650   } // case
53651   } // switch
53652
53653   return SDValue();
53654 }
53655
53656 static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
53657                                 bool NegRes) {
53658   if (NegMul) {
53659     switch (Opcode) {
53660     default: llvm_unreachable("Unexpected opcode");
53661     case ISD::FMA:              Opcode = X86ISD::FNMADD;        break;
53662     case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FNMADD; break;
53663     case X86ISD::FMADD_RND:     Opcode = X86ISD::FNMADD_RND;    break;
53664     case X86ISD::FMSUB:         Opcode = X86ISD::FNMSUB;        break;
53665     case X86ISD::STRICT_FMSUB:  Opcode = X86ISD::STRICT_FNMSUB; break;
53666     case X86ISD::FMSUB_RND:     Opcode = X86ISD::FNMSUB_RND;    break;
53667     case X86ISD::FNMADD:        Opcode = ISD::FMA;              break;
53668     case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA;       break;
53669     case X86ISD::FNMADD_RND:    Opcode = X86ISD::FMADD_RND;     break;
53670     case X86ISD::FNMSUB:        Opcode = X86ISD::FMSUB;         break;
53671     case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB;  break;
53672     case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FMSUB_RND;     break;
53673     }
53674   }
53675
53676   if (NegAcc) {
53677     switch (Opcode) {
53678     default: llvm_unreachable("Unexpected opcode");
53679     case ISD::FMA:              Opcode = X86ISD::FMSUB;         break;
53680     case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FMSUB;  break;
53681     case X86ISD::FMADD_RND:     Opcode = X86ISD::FMSUB_RND;     break;
53682     case X86ISD::FMSUB:         Opcode = ISD::FMA;              break;
53683     case X86ISD::STRICT_FMSUB:  Opcode = ISD::STRICT_FMA;       break;
53684     case X86ISD::FMSUB_RND:     Opcode = X86ISD::FMADD_RND;     break;
53685     case X86ISD::FNMADD:        Opcode = X86ISD::FNMSUB;        break;
53686     case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
53687     case X86ISD::FNMADD_RND:    Opcode = X86ISD::FNMSUB_RND;    break;
53688     case X86ISD::FNMSUB:        Opcode = X86ISD::FNMADD;        break;
53689     case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
53690     case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FNMADD_RND;    break;
53691     case X86ISD::FMADDSUB:      Opcode = X86ISD::FMSUBADD;      break;
53692     case X86ISD::FMADDSUB_RND:  Opcode = X86ISD::FMSUBADD_RND;  break;
53693     case X86ISD::FMSUBADD:      Opcode = X86ISD::FMADDSUB;      break;
53694     case X86ISD::FMSUBADD_RND:  Opcode = X86ISD::FMADDSUB_RND;  break;
53695     }
53696   }
53697
53698   if (NegRes) {
53699     switch (Opcode) {
53700     // For accuracy reason, we never combine fneg and fma under strict FP.
53701     default: llvm_unreachable("Unexpected opcode");
53702     case ISD::FMA:             Opcode = X86ISD::FNMSUB;       break;
53703     case X86ISD::FMADD_RND:    Opcode = X86ISD::FNMSUB_RND;   break;
53704     case X86ISD::FMSUB:        Opcode = X86ISD::FNMADD;       break;
53705     case X86ISD::FMSUB_RND:    Opcode = X86ISD::FNMADD_RND;   break;
53706     case X86ISD::FNMADD:       Opcode = X86ISD::FMSUB;        break;
53707     case X86ISD::FNMADD_RND:   Opcode = X86ISD::FMSUB_RND;    break;
53708     case X86ISD::FNMSUB:       Opcode = ISD::FMA;             break;
53709     case X86ISD::FNMSUB_RND:   Opcode = X86ISD::FMADD_RND;    break;
53710     }
53711   }
53712
53713   return Opcode;
53714 }
53715
53716 /// Do target-specific dag combines on floating point negations.
53717 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
53718                            TargetLowering::DAGCombinerInfo &DCI,
53719                            const X86Subtarget &Subtarget) {
53720   EVT OrigVT = N->getValueType(0);
53721   SDValue Arg = isFNEG(DAG, N);
53722   if (!Arg)
53723     return SDValue();
53724
53725   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53726   EVT VT = Arg.getValueType();
53727   EVT SVT = VT.getScalarType();
53728   SDLoc DL(N);
53729
53730   // Let legalize expand this if it isn't a legal type yet.
53731   if (!TLI.isTypeLegal(VT))
53732     return SDValue();
53733
53734   // If we're negating a FMUL node on a target with FMA, then we can avoid the
53735   // use of a constant by performing (-0 - A*B) instead.
53736   // FIXME: Check rounding control flags as well once it becomes available.
53737   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
53738       Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
53739     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
53740     SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
53741                                   Arg.getOperand(1), Zero);
53742     return DAG.getBitcast(OrigVT, NewNode);
53743   }
53744
53745   bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53746   bool LegalOperations = !DCI.isBeforeLegalizeOps();
53747   if (SDValue NegArg =
53748           TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
53749     return DAG.getBitcast(OrigVT, NegArg);
53750
53751   return SDValue();
53752 }
53753
53754 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
53755                                                 bool LegalOperations,
53756                                                 bool ForCodeSize,
53757                                                 NegatibleCost &Cost,
53758                                                 unsigned Depth) const {
53759   // fneg patterns are removable even if they have multiple uses.
53760   if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
53761     Cost = NegatibleCost::Cheaper;
53762     return DAG.getBitcast(Op.getValueType(), Arg);
53763   }
53764
53765   EVT VT = Op.getValueType();
53766   EVT SVT = VT.getScalarType();
53767   unsigned Opc = Op.getOpcode();
53768   SDNodeFlags Flags = Op.getNode()->getFlags();
53769   switch (Opc) {
53770   case ISD::FMA:
53771   case X86ISD::FMSUB:
53772   case X86ISD::FNMADD:
53773   case X86ISD::FNMSUB:
53774   case X86ISD::FMADD_RND:
53775   case X86ISD::FMSUB_RND:
53776   case X86ISD::FNMADD_RND:
53777   case X86ISD::FNMSUB_RND: {
53778     if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
53779         !(SVT == MVT::f32 || SVT == MVT::f64) ||
53780         !isOperationLegal(ISD::FMA, VT))
53781       break;
53782
53783     // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
53784     // if it may have signed zeros.
53785     if (!Flags.hasNoSignedZeros())
53786       break;
53787
53788     // This is always negatible for free but we might be able to remove some
53789     // extra operand negations as well.
53790     SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
53791     for (int i = 0; i != 3; ++i)
53792       NewOps[i] = getCheaperNegatedExpression(
53793           Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
53794
53795     bool NegA = !!NewOps[0];
53796     bool NegB = !!NewOps[1];
53797     bool NegC = !!NewOps[2];
53798     unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
53799
53800     Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
53801                                   : NegatibleCost::Neutral;
53802
53803     // Fill in the non-negated ops with the original values.
53804     for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
53805       if (!NewOps[i])
53806         NewOps[i] = Op.getOperand(i);
53807     return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
53808   }
53809   case X86ISD::FRCP:
53810     if (SDValue NegOp0 =
53811             getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
53812                                  ForCodeSize, Cost, Depth + 1))
53813       return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
53814     break;
53815   }
53816
53817   return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
53818                                               ForCodeSize, Cost, Depth);
53819 }
53820
53821 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
53822                                  const X86Subtarget &Subtarget) {
53823   MVT VT = N->getSimpleValueType(0);
53824   // If we have integer vector types available, use the integer opcodes.
53825   if (!VT.isVector() || !Subtarget.hasSSE2())
53826     return SDValue();
53827
53828   SDLoc dl(N);
53829
53830   unsigned IntBits = VT.getScalarSizeInBits();
53831   MVT IntSVT = MVT::getIntegerVT(IntBits);
53832   MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
53833
53834   SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
53835   SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
53836   unsigned IntOpcode;
53837   switch (N->getOpcode()) {
53838   default: llvm_unreachable("Unexpected FP logic op");
53839   case X86ISD::FOR:   IntOpcode = ISD::OR; break;
53840   case X86ISD::FXOR:  IntOpcode = ISD::XOR; break;
53841   case X86ISD::FAND:  IntOpcode = ISD::AND; break;
53842   case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
53843   }
53844   SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
53845   return DAG.getBitcast(VT, IntOp);
53846 }
53847
53848
53849 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
53850 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
53851   if (N->getOpcode() != ISD::XOR)
53852     return SDValue();
53853
53854   SDValue LHS = N->getOperand(0);
53855   if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
53856     return SDValue();
53857
53858   X86::CondCode NewCC = X86::GetOppositeBranchCondition(
53859       X86::CondCode(LHS->getConstantOperandVal(0)));
53860   SDLoc DL(N);
53861   return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
53862 }
53863
53864 static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
53865                                  const X86Subtarget &Subtarget) {
53866   assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
53867          "Invalid opcode for combing with CTLZ");
53868   if (Subtarget.hasFastLZCNT())
53869     return SDValue();
53870
53871   EVT VT = N->getValueType(0);
53872   if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
53873       (VT != MVT::i64 || !Subtarget.is64Bit()))
53874     return SDValue();
53875
53876   SDValue N0 = N->getOperand(0);
53877   SDValue N1 = N->getOperand(1);
53878
53879   if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
53880       N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
53881     return SDValue();
53882
53883   SDValue OpCTLZ;
53884   SDValue OpSizeTM1;
53885
53886   if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
53887     OpCTLZ = N1;
53888     OpSizeTM1 = N0;
53889   } else if (N->getOpcode() == ISD::SUB) {
53890     return SDValue();
53891   } else {
53892     OpCTLZ = N0;
53893     OpSizeTM1 = N1;
53894   }
53895
53896   if (!OpCTLZ.hasOneUse())
53897     return SDValue();
53898   auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
53899   if (!C)
53900     return SDValue();
53901
53902   if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
53903     return SDValue();
53904   SDLoc DL(N);
53905   EVT OpVT = VT;
53906   SDValue Op = OpCTLZ.getOperand(0);
53907   if (VT == MVT::i8) {
53908     // Zero extend to i32 since there is not an i8 bsr.
53909     OpVT = MVT::i32;
53910     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
53911   }
53912
53913   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
53914   Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
53915   if (VT == MVT::i8)
53916     Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
53917
53918   return Op;
53919 }
53920
53921 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
53922                           TargetLowering::DAGCombinerInfo &DCI,
53923                           const X86Subtarget &Subtarget) {
53924   SDValue N0 = N->getOperand(0);
53925   SDValue N1 = N->getOperand(1);
53926   EVT VT = N->getValueType(0);
53927
53928   // If this is SSE1 only convert to FXOR to avoid scalarization.
53929   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
53930     return DAG.getBitcast(MVT::v4i32,
53931                           DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
53932                                       DAG.getBitcast(MVT::v4f32, N0),
53933                                       DAG.getBitcast(MVT::v4f32, N1)));
53934   }
53935
53936   if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
53937     return Cmp;
53938
53939   if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
53940     return R;
53941
53942   if (SDValue R = combineBitOpWithShift(N, DAG))
53943     return R;
53944
53945   if (SDValue R = combineBitOpWithPACK(N, DAG))
53946     return R;
53947
53948   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
53949     return FPLogic;
53950
53951   if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
53952     return R;
53953
53954   if (DCI.isBeforeLegalizeOps())
53955     return SDValue();
53956
53957   if (SDValue SetCC = foldXor1SetCC(N, DAG))
53958     return SetCC;
53959
53960   if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
53961     return R;
53962
53963   if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
53964     return RV;
53965
53966   // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
53967   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53968   if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
53969       N0.getOperand(0).getValueType().isVector() &&
53970       N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
53971       TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
53972     return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
53973                                          N0.getOperand(0).getValueType()));
53974   }
53975
53976   // Handle AVX512 mask widening.
53977   // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
53978   if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
53979       VT.getVectorElementType() == MVT::i1 &&
53980       N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
53981       TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
53982     return DAG.getNode(
53983         ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
53984         DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
53985         N0.getOperand(2));
53986   }
53987
53988   // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
53989   // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
53990   // TODO: Under what circumstances could this be performed in DAGCombine?
53991   if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
53992       N0.getOperand(0).getOpcode() == N->getOpcode()) {
53993     SDValue TruncExtSrc = N0.getOperand(0);
53994     auto *N1C = dyn_cast<ConstantSDNode>(N1);
53995     auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
53996     if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
53997       SDLoc DL(N);
53998       SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
53999       SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
54000       return DAG.getNode(ISD::XOR, DL, VT, LHS,
54001                          DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
54002     }
54003   }
54004
54005   if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
54006     return R;
54007
54008   return combineFneg(N, DAG, DCI, Subtarget);
54009 }
54010
54011 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
54012                             TargetLowering::DAGCombinerInfo &DCI,
54013                             const X86Subtarget &Subtarget) {
54014   EVT VT = N->getValueType(0);
54015   unsigned NumBits = VT.getSizeInBits();
54016
54017   // TODO - Constant Folding.
54018
54019   // Simplify the inputs.
54020   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54021   APInt DemandedMask(APInt::getAllOnes(NumBits));
54022   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54023     return SDValue(N, 0);
54024
54025   return SDValue();
54026 }
54027
54028 static bool isNullFPScalarOrVectorConst(SDValue V) {
54029   return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
54030 }
54031
54032 /// If a value is a scalar FP zero or a vector FP zero (potentially including
54033 /// undefined elements), return a zero constant that may be used to fold away
54034 /// that value. In the case of a vector, the returned constant will not contain
54035 /// undefined elements even if the input parameter does. This makes it suitable
54036 /// to be used as a replacement operand with operations (eg, bitwise-and) where
54037 /// an undef should not propagate.
54038 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
54039                                         const X86Subtarget &Subtarget) {
54040   if (!isNullFPScalarOrVectorConst(V))
54041     return SDValue();
54042
54043   if (V.getValueType().isVector())
54044     return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
54045
54046   return V;
54047 }
54048
54049 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
54050                                       const X86Subtarget &Subtarget) {
54051   SDValue N0 = N->getOperand(0);
54052   SDValue N1 = N->getOperand(1);
54053   EVT VT = N->getValueType(0);
54054   SDLoc DL(N);
54055
54056   // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
54057   if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
54058         (VT == MVT::f64 && Subtarget.hasSSE2()) ||
54059         (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
54060     return SDValue();
54061
54062   auto isAllOnesConstantFP = [](SDValue V) {
54063     if (V.getSimpleValueType().isVector())
54064       return ISD::isBuildVectorAllOnes(V.getNode());
54065     auto *C = dyn_cast<ConstantFPSDNode>(V);
54066     return C && C->getConstantFPValue()->isAllOnesValue();
54067   };
54068
54069   // fand (fxor X, -1), Y --> fandn X, Y
54070   if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
54071     return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
54072
54073   // fand X, (fxor Y, -1) --> fandn Y, X
54074   if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
54075     return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
54076
54077   return SDValue();
54078 }
54079
54080 /// Do target-specific dag combines on X86ISD::FAND nodes.
54081 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
54082                            const X86Subtarget &Subtarget) {
54083   // FAND(0.0, x) -> 0.0
54084   if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
54085     return V;
54086
54087   // FAND(x, 0.0) -> 0.0
54088   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
54089     return V;
54090
54091   if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
54092     return V;
54093
54094   return lowerX86FPLogicOp(N, DAG, Subtarget);
54095 }
54096
54097 /// Do target-specific dag combines on X86ISD::FANDN nodes.
54098 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
54099                             const X86Subtarget &Subtarget) {
54100   // FANDN(0.0, x) -> x
54101   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
54102     return N->getOperand(1);
54103
54104   // FANDN(x, 0.0) -> 0.0
54105   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
54106     return V;
54107
54108   return lowerX86FPLogicOp(N, DAG, Subtarget);
54109 }
54110
54111 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
54112 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
54113                           TargetLowering::DAGCombinerInfo &DCI,
54114                           const X86Subtarget &Subtarget) {
54115   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
54116
54117   // F[X]OR(0.0, x) -> x
54118   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
54119     return N->getOperand(1);
54120
54121   // F[X]OR(x, 0.0) -> x
54122   if (isNullFPScalarOrVectorConst(N->getOperand(1)))
54123     return N->getOperand(0);
54124
54125   if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
54126     return NewVal;
54127
54128   return lowerX86FPLogicOp(N, DAG, Subtarget);
54129 }
54130
54131 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
54132 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
54133   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
54134
54135   // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
54136   if (!DAG.getTarget().Options.NoNaNsFPMath ||
54137       !DAG.getTarget().Options.NoSignedZerosFPMath)
54138     return SDValue();
54139
54140   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
54141   // into FMINC and FMAXC, which are Commutative operations.
54142   unsigned NewOp = 0;
54143   switch (N->getOpcode()) {
54144     default: llvm_unreachable("unknown opcode");
54145     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
54146     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
54147   }
54148
54149   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
54150                      N->getOperand(0), N->getOperand(1));
54151 }
54152
54153 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
54154                                      const X86Subtarget &Subtarget) {
54155   EVT VT = N->getValueType(0);
54156   if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget))
54157     return SDValue();
54158
54159   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54160
54161   if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
54162         (Subtarget.hasSSE2() && VT == MVT::f64) ||
54163         (Subtarget.hasFP16() && VT == MVT::f16) ||
54164         (VT.isVector() && TLI.isTypeLegal(VT))))
54165     return SDValue();
54166
54167   SDValue Op0 = N->getOperand(0);
54168   SDValue Op1 = N->getOperand(1);
54169   SDLoc DL(N);
54170   auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
54171
54172   // If we don't have to respect NaN inputs, this is a direct translation to x86
54173   // min/max instructions.
54174   if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
54175     return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
54176
54177   // If one of the operands is known non-NaN use the native min/max instructions
54178   // with the non-NaN input as second operand.
54179   if (DAG.isKnownNeverNaN(Op1))
54180     return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
54181   if (DAG.isKnownNeverNaN(Op0))
54182     return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
54183
54184   // If we have to respect NaN inputs, this takes at least 3 instructions.
54185   // Favor a library call when operating on a scalar and minimizing code size.
54186   if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
54187     return SDValue();
54188
54189   EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
54190                                          VT);
54191
54192   // There are 4 possibilities involving NaN inputs, and these are the required
54193   // outputs:
54194   //                   Op1
54195   //               Num     NaN
54196   //            ----------------
54197   //       Num  |  Max  |  Op0 |
54198   // Op0        ----------------
54199   //       NaN  |  Op1  |  NaN |
54200   //            ----------------
54201   //
54202   // The SSE FP max/min instructions were not designed for this case, but rather
54203   // to implement:
54204   //   Min = Op1 < Op0 ? Op1 : Op0
54205   //   Max = Op1 > Op0 ? Op1 : Op0
54206   //
54207   // So they always return Op0 if either input is a NaN. However, we can still
54208   // use those instructions for fmaxnum by selecting away a NaN input.
54209
54210   // If either operand is NaN, the 2nd source operand (Op0) is passed through.
54211   SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
54212   SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
54213
54214   // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
54215   // are NaN, the NaN value of Op1 is the result.
54216   return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
54217 }
54218
54219 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
54220                                    TargetLowering::DAGCombinerInfo &DCI) {
54221   EVT VT = N->getValueType(0);
54222   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54223
54224   APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
54225   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
54226     return SDValue(N, 0);
54227
54228   // Convert a full vector load into vzload when not all bits are needed.
54229   SDValue In = N->getOperand(0);
54230   MVT InVT = In.getSimpleValueType();
54231   if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
54232       ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
54233     assert(InVT.is128BitVector() && "Expected 128-bit input vector");
54234     LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
54235     unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
54236     MVT MemVT = MVT::getIntegerVT(NumBits);
54237     MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
54238     if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
54239       SDLoc dl(N);
54240       SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
54241                                     DAG.getBitcast(InVT, VZLoad));
54242       DCI.CombineTo(N, Convert);
54243       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
54244       DCI.recursivelyDeleteUnusedNodes(LN);
54245       return SDValue(N, 0);
54246     }
54247   }
54248
54249   return SDValue();
54250 }
54251
54252 static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
54253                                      TargetLowering::DAGCombinerInfo &DCI) {
54254   bool IsStrict = N->isTargetStrictFPOpcode();
54255   EVT VT = N->getValueType(0);
54256
54257   // Convert a full vector load into vzload when not all bits are needed.
54258   SDValue In = N->getOperand(IsStrict ? 1 : 0);
54259   MVT InVT = In.getSimpleValueType();
54260   if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
54261       ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
54262     assert(InVT.is128BitVector() && "Expected 128-bit input vector");
54263     LoadSDNode *LN = cast<LoadSDNode>(In);
54264     unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
54265     MVT MemVT = MVT::getFloatingPointVT(NumBits);
54266     MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
54267     if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
54268       SDLoc dl(N);
54269       if (IsStrict) {
54270         SDValue Convert =
54271             DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
54272                         {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
54273         DCI.CombineTo(N, Convert, Convert.getValue(1));
54274       } else {
54275         SDValue Convert =
54276             DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
54277         DCI.CombineTo(N, Convert);
54278       }
54279       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
54280       DCI.recursivelyDeleteUnusedNodes(LN);
54281       return SDValue(N, 0);
54282     }
54283   }
54284
54285   return SDValue();
54286 }
54287
54288 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
54289 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
54290                             TargetLowering::DAGCombinerInfo &DCI,
54291                             const X86Subtarget &Subtarget) {
54292   SDValue N0 = N->getOperand(0);
54293   SDValue N1 = N->getOperand(1);
54294   MVT VT = N->getSimpleValueType(0);
54295   int NumElts = VT.getVectorNumElements();
54296   unsigned EltSizeInBits = VT.getScalarSizeInBits();
54297
54298   // ANDNP(undef, x) -> 0
54299   // ANDNP(x, undef) -> 0
54300   if (N0.isUndef() || N1.isUndef())
54301     return DAG.getConstant(0, SDLoc(N), VT);
54302
54303   // ANDNP(0, x) -> x
54304   if (ISD::isBuildVectorAllZeros(N0.getNode()))
54305     return N1;
54306
54307   // ANDNP(x, 0) -> 0
54308   if (ISD::isBuildVectorAllZeros(N1.getNode()))
54309     return DAG.getConstant(0, SDLoc(N), VT);
54310
54311   // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
54312   if (ISD::isBuildVectorAllOnes(N1.getNode()))
54313     return DAG.getNOT(SDLoc(N), N0, VT);
54314
54315   // Turn ANDNP back to AND if input is inverted.
54316   if (SDValue Not = IsNOT(N0, DAG))
54317     return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
54318
54319   // Constant Folding
54320   APInt Undefs0, Undefs1;
54321   SmallVector<APInt> EltBits0, EltBits1;
54322   if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0)) {
54323     SDLoc DL(N);
54324     if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
54325       SmallVector<APInt> ResultBits;
54326       for (int I = 0; I != NumElts; ++I)
54327         ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
54328       return getConstVector(ResultBits, VT, DAG, DL);
54329     }
54330
54331     // Constant fold NOT(N0) to allow us to use AND.
54332     // Ensure this is only performed if we can confirm that the bitcasted source
54333     // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
54334     if (N0->hasOneUse()) {
54335       SDValue BC0 = peekThroughOneUseBitcasts(N0);
54336       if (BC0.getOpcode() != ISD::BITCAST) {
54337         for (APInt &Elt : EltBits0)
54338           Elt = ~Elt;
54339         SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
54340         return DAG.getNode(ISD::AND, DL, VT, Not, N1);
54341       }
54342     }
54343   }
54344
54345   // Attempt to recursively combine a bitmask ANDNP with shuffles.
54346   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
54347     SDValue Op(N, 0);
54348     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
54349       return Res;
54350
54351     // If either operand is a constant mask, then only the elements that aren't
54352     // zero are actually demanded by the other operand.
54353     auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
54354       APInt UndefElts;
54355       SmallVector<APInt> EltBits;
54356       APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
54357       APInt DemandedElts = APInt::getAllOnes(NumElts);
54358       if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
54359                                         EltBits)) {
54360         DemandedBits.clearAllBits();
54361         DemandedElts.clearAllBits();
54362         for (int I = 0; I != NumElts; ++I) {
54363           if (UndefElts[I]) {
54364             // We can't assume an undef src element gives an undef dst - the
54365             // other src might be zero.
54366             DemandedBits.setAllBits();
54367             DemandedElts.setBit(I);
54368           } else if ((Invert && !EltBits[I].isAllOnes()) ||
54369                      (!Invert && !EltBits[I].isZero())) {
54370             DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
54371             DemandedElts.setBit(I);
54372           }
54373         }
54374       }
54375       return std::make_pair(DemandedBits, DemandedElts);
54376     };
54377     APInt Bits0, Elts0;
54378     APInt Bits1, Elts1;
54379     std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
54380     std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
54381
54382     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54383     if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
54384         TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
54385         TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
54386         TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
54387       if (N->getOpcode() != ISD::DELETED_NODE)
54388         DCI.AddToWorklist(N);
54389       return SDValue(N, 0);
54390     }
54391   }
54392
54393   return SDValue();
54394 }
54395
54396 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
54397                          TargetLowering::DAGCombinerInfo &DCI) {
54398   SDValue N1 = N->getOperand(1);
54399
54400   // BT ignores high bits in the bit index operand.
54401   unsigned BitWidth = N1.getValueSizeInBits();
54402   APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
54403   if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
54404     if (N->getOpcode() != ISD::DELETED_NODE)
54405       DCI.AddToWorklist(N);
54406     return SDValue(N, 0);
54407   }
54408
54409   return SDValue();
54410 }
54411
54412 static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
54413                                TargetLowering::DAGCombinerInfo &DCI) {
54414   bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
54415   SDValue Src = N->getOperand(IsStrict ? 1 : 0);
54416
54417   if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
54418     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54419     APInt DemandedElts = APInt::getLowBitsSet(8, 4);
54420     if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
54421       if (N->getOpcode() != ISD::DELETED_NODE)
54422         DCI.AddToWorklist(N);
54423       return SDValue(N, 0);
54424     }
54425
54426     // Convert a full vector load into vzload when not all bits are needed.
54427     if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
54428       LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
54429       if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
54430         SDLoc dl(N);
54431         if (IsStrict) {
54432           SDValue Convert = DAG.getNode(
54433               N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
54434               {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
54435           DCI.CombineTo(N, Convert, Convert.getValue(1));
54436         } else {
54437           SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
54438                                         DAG.getBitcast(MVT::v8i16, VZLoad));
54439           DCI.CombineTo(N, Convert);
54440         }
54441
54442         DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
54443         DCI.recursivelyDeleteUnusedNodes(LN);
54444         return SDValue(N, 0);
54445       }
54446     }
54447   }
54448
54449   return SDValue();
54450 }
54451
54452 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
54453 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
54454   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54455
54456   EVT DstVT = N->getValueType(0);
54457
54458   SDValue N0 = N->getOperand(0);
54459   SDValue N1 = N->getOperand(1);
54460   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54461
54462   if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
54463     return SDValue();
54464
54465   // Look through single use any_extends / truncs.
54466   SDValue IntermediateBitwidthOp;
54467   if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
54468       N0.hasOneUse()) {
54469     IntermediateBitwidthOp = N0;
54470     N0 = N0.getOperand(0);
54471   }
54472
54473   // See if we have a single use cmov.
54474   if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
54475     return SDValue();
54476
54477   SDValue CMovOp0 = N0.getOperand(0);
54478   SDValue CMovOp1 = N0.getOperand(1);
54479
54480   // Make sure both operands are constants.
54481   if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
54482       !isa<ConstantSDNode>(CMovOp1.getNode()))
54483     return SDValue();
54484
54485   SDLoc DL(N);
54486
54487   // If we looked through an any_extend/trunc above, add one to the constants.
54488   if (IntermediateBitwidthOp) {
54489     unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
54490     CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
54491     CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
54492   }
54493
54494   CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
54495   CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
54496
54497   EVT CMovVT = DstVT;
54498   // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
54499   if (DstVT == MVT::i16) {
54500     CMovVT = MVT::i32;
54501     CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
54502     CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
54503   }
54504
54505   SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
54506                              N0.getOperand(2), N0.getOperand(3));
54507
54508   if (CMovVT != DstVT)
54509     CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
54510
54511   return CMov;
54512 }
54513
54514 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
54515                                       const X86Subtarget &Subtarget) {
54516   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
54517
54518   if (SDValue V = combineSextInRegCmov(N, DAG))
54519     return V;
54520
54521   EVT VT = N->getValueType(0);
54522   SDValue N0 = N->getOperand(0);
54523   SDValue N1 = N->getOperand(1);
54524   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
54525   SDLoc dl(N);
54526
54527   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
54528   // both SSE and AVX2 since there is no sign-extended shift right
54529   // operation on a vector with 64-bit elements.
54530   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
54531   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
54532   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
54533                            N0.getOpcode() == ISD::SIGN_EXTEND)) {
54534     SDValue N00 = N0.getOperand(0);
54535
54536     // EXTLOAD has a better solution on AVX2,
54537     // it may be replaced with X86ISD::VSEXT node.
54538     if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
54539       if (!ISD::isNormalLoad(N00.getNode()))
54540         return SDValue();
54541
54542     // Attempt to promote any comparison mask ops before moving the
54543     // SIGN_EXTEND_INREG in the way.
54544     if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
54545       return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
54546
54547     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
54548       SDValue Tmp =
54549           DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
54550       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
54551     }
54552   }
54553   return SDValue();
54554 }
54555
54556 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
54557 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
54558 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
54559 /// opportunities to combine math ops, use an LEA, or use a complex addressing
54560 /// mode. This can eliminate extend, add, and shift instructions.
54561 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
54562                                    const X86Subtarget &Subtarget) {
54563   if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
54564       Ext->getOpcode() != ISD::ZERO_EXTEND)
54565     return SDValue();
54566
54567   // TODO: This should be valid for other integer types.
54568   EVT VT = Ext->getValueType(0);
54569   if (VT != MVT::i64)
54570     return SDValue();
54571
54572   SDValue Add = Ext->getOperand(0);
54573   if (Add.getOpcode() != ISD::ADD)
54574     return SDValue();
54575
54576   bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
54577   bool NSW = Add->getFlags().hasNoSignedWrap();
54578   bool NUW = Add->getFlags().hasNoUnsignedWrap();
54579
54580   // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
54581   // into the 'zext'
54582   if ((Sext && !NSW) || (!Sext && !NUW))
54583     return SDValue();
54584
54585   // Having a constant operand to the 'add' ensures that we are not increasing
54586   // the instruction count because the constant is extended for free below.
54587   // A constant operand can also become the displacement field of an LEA.
54588   auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
54589   if (!AddOp1)
54590     return SDValue();
54591
54592   // Don't make the 'add' bigger if there's no hope of combining it with some
54593   // other 'add' or 'shl' instruction.
54594   // TODO: It may be profitable to generate simpler LEA instructions in place
54595   // of single 'add' instructions, but the cost model for selecting an LEA
54596   // currently has a high threshold.
54597   bool HasLEAPotential = false;
54598   for (auto *User : Ext->uses()) {
54599     if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
54600       HasLEAPotential = true;
54601       break;
54602     }
54603   }
54604   if (!HasLEAPotential)
54605     return SDValue();
54606
54607   // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
54608   int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
54609   SDValue AddOp0 = Add.getOperand(0);
54610   SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
54611   SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
54612
54613   // The wider add is guaranteed to not wrap because both operands are
54614   // sign-extended.
54615   SDNodeFlags Flags;
54616   Flags.setNoSignedWrap(NSW);
54617   Flags.setNoUnsignedWrap(NUW);
54618   return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
54619 }
54620
54621 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
54622 // operands and the result of CMOV is not used anywhere else - promote CMOV
54623 // itself instead of promoting its result. This could be beneficial, because:
54624 //     1) X86TargetLowering::EmitLoweredSelect later can do merging of two
54625 //        (or more) pseudo-CMOVs only when they go one-after-another and
54626 //        getting rid of result extension code after CMOV will help that.
54627 //     2) Promotion of constant CMOV arguments is free, hence the
54628 //        {ANY,SIGN,ZERO}_EXTEND will just be deleted.
54629 //     3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
54630 //        promotion is also good in terms of code-size.
54631 //        (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
54632 //         promotion).
54633 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
54634   SDValue CMovN = Extend->getOperand(0);
54635   if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
54636     return SDValue();
54637
54638   EVT TargetVT = Extend->getValueType(0);
54639   unsigned ExtendOpcode = Extend->getOpcode();
54640   SDLoc DL(Extend);
54641
54642   EVT VT = CMovN.getValueType();
54643   SDValue CMovOp0 = CMovN.getOperand(0);
54644   SDValue CMovOp1 = CMovN.getOperand(1);
54645
54646   if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
54647       !isa<ConstantSDNode>(CMovOp1.getNode()))
54648     return SDValue();
54649
54650   // Only extend to i32 or i64.
54651   if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
54652     return SDValue();
54653
54654   // Only extend from i16 unless its a sign_extend from i32. Zext/aext from i32
54655   // are free.
54656   if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
54657     return SDValue();
54658
54659   // If this a zero extend to i64, we should only extend to i32 and use a free
54660   // zero extend to finish.
54661   EVT ExtendVT = TargetVT;
54662   if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
54663     ExtendVT = MVT::i32;
54664
54665   CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
54666   CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
54667
54668   SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
54669                             CMovN.getOperand(2), CMovN.getOperand(3));
54670
54671   // Finish extending if needed.
54672   if (ExtendVT != TargetVT)
54673     Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
54674
54675   return Res;
54676 }
54677
54678 // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
54679 // result type.
54680 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
54681                                const X86Subtarget &Subtarget) {
54682   SDValue N0 = N->getOperand(0);
54683   EVT VT = N->getValueType(0);
54684   SDLoc dl(N);
54685
54686   // Only do this combine with AVX512 for vector extends.
54687   if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
54688     return SDValue();
54689
54690   // Only combine legal element types.
54691   EVT SVT = VT.getVectorElementType();
54692   if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
54693       SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
54694     return SDValue();
54695
54696   // We don't have CMPP Instruction for vxf16
54697   if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
54698     return SDValue();
54699   // We can only do this if the vector size in 256 bits or less.
54700   unsigned Size = VT.getSizeInBits();
54701   if (Size > 256 && Subtarget.useAVX512Regs())
54702     return SDValue();
54703
54704   // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
54705   // that's the only integer compares with we have.
54706   ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
54707   if (ISD::isUnsignedIntSetCC(CC))
54708     return SDValue();
54709
54710   // Only do this combine if the extension will be fully consumed by the setcc.
54711   EVT N00VT = N0.getOperand(0).getValueType();
54712   EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
54713   if (Size != MatchingVecType.getSizeInBits())
54714     return SDValue();
54715
54716   SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
54717
54718   if (N->getOpcode() == ISD::ZERO_EXTEND)
54719     Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
54720
54721   return Res;
54722 }
54723
54724 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
54725                            TargetLowering::DAGCombinerInfo &DCI,
54726                            const X86Subtarget &Subtarget) {
54727   SDValue N0 = N->getOperand(0);
54728   EVT VT = N->getValueType(0);
54729   SDLoc DL(N);
54730
54731   // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54732   if (!DCI.isBeforeLegalizeOps() &&
54733       N0.getOpcode() == X86ISD::SETCC_CARRY) {
54734     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
54735                                  N0->getOperand(1));
54736     bool ReplaceOtherUses = !N0.hasOneUse();
54737     DCI.CombineTo(N, Setcc);
54738     // Replace other uses with a truncate of the widened setcc_carry.
54739     if (ReplaceOtherUses) {
54740       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54741                                   N0.getValueType(), Setcc);
54742       DCI.CombineTo(N0.getNode(), Trunc);
54743     }
54744
54745     return SDValue(N, 0);
54746   }
54747
54748   if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54749     return NewCMov;
54750
54751   if (!DCI.isBeforeLegalizeOps())
54752     return SDValue();
54753
54754   if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54755     return V;
54756
54757   if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
54758                                                  DAG, DCI, Subtarget))
54759     return V;
54760
54761   if (VT.isVector()) {
54762     if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54763       return R;
54764
54765     if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
54766       return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
54767   }
54768
54769   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54770     return NewAdd;
54771
54772   return SDValue();
54773 }
54774
54775 // Inverting a constant vector is profitable if it can be eliminated and the
54776 // inverted vector is already present in DAG. Otherwise, it will be loaded
54777 // anyway.
54778 //
54779 // We determine which of the values can be completely eliminated and invert it.
54780 // If both are eliminable, select a vector with the first negative element.
54781 static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
54782   assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
54783          "ConstantFP build vector expected");
54784   // Check if we can eliminate V. We assume if a value is only used in FMAs, we
54785   // can eliminate it. Since this function is invoked for each FMA with this
54786   // vector.
54787   auto IsNotFMA = [](SDNode *Use) {
54788     return Use->getOpcode() != ISD::FMA && Use->getOpcode() != ISD::STRICT_FMA;
54789   };
54790   if (llvm::any_of(V->uses(), IsNotFMA))
54791     return SDValue();
54792
54793   SmallVector<SDValue, 8> Ops;
54794   EVT VT = V.getValueType();
54795   EVT EltVT = VT.getVectorElementType();
54796   for (auto Op : V->op_values()) {
54797     if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
54798       Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
54799     } else {
54800       assert(Op.isUndef());
54801       Ops.push_back(DAG.getUNDEF(EltVT));
54802     }
54803   }
54804
54805   SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
54806   if (!NV)
54807     return SDValue();
54808
54809   // If an inverted version cannot be eliminated, choose it instead of the
54810   // original version.
54811   if (llvm::any_of(NV->uses(), IsNotFMA))
54812     return SDValue(NV, 0);
54813
54814   // If the inverted version also can be eliminated, we have to consistently
54815   // prefer one of the values. We prefer a constant with a negative value on
54816   // the first place.
54817   // N.B. We need to skip undefs that may precede a value.
54818   for (auto op : V->op_values()) {
54819     if (auto *Cst = dyn_cast<ConstantFPSDNode>(op)) {
54820       if (Cst->isNegative())
54821         return SDValue();
54822       break;
54823     }
54824   }
54825   return SDValue(NV, 0);
54826 }
54827
54828 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
54829                           TargetLowering::DAGCombinerInfo &DCI,
54830                           const X86Subtarget &Subtarget) {
54831   SDLoc dl(N);
54832   EVT VT = N->getValueType(0);
54833   bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
54834
54835   // Let legalize expand this if it isn't a legal type yet.
54836   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54837   if (!TLI.isTypeLegal(VT))
54838     return SDValue();
54839
54840   SDValue A = N->getOperand(IsStrict ? 1 : 0);
54841   SDValue B = N->getOperand(IsStrict ? 2 : 1);
54842   SDValue C = N->getOperand(IsStrict ? 3 : 2);
54843
54844   // If the operation allows fast-math and the target does not support FMA,
54845   // split this into mul+add to avoid libcall(s).
54846   SDNodeFlags Flags = N->getFlags();
54847   if (!IsStrict && Flags.hasAllowReassociation() &&
54848       TLI.isOperationExpand(ISD::FMA, VT)) {
54849     SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
54850     return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
54851   }
54852
54853   EVT ScalarVT = VT.getScalarType();
54854   if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
54855        !Subtarget.hasAnyFMA()) &&
54856       !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
54857     return SDValue();
54858
54859   auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
54860     bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54861     bool LegalOperations = !DCI.isBeforeLegalizeOps();
54862     if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
54863                                                        CodeSize)) {
54864       V = NegV;
54865       return true;
54866     }
54867     // Look through extract_vector_elts. If it comes from an FNEG, create a
54868     // new extract from the FNEG input.
54869     if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
54870         isNullConstant(V.getOperand(1))) {
54871       SDValue Vec = V.getOperand(0);
54872       if (SDValue NegV = TLI.getCheaperNegatedExpression(
54873               Vec, DAG, LegalOperations, CodeSize)) {
54874         V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
54875                         NegV, V.getOperand(1));
54876         return true;
54877       }
54878     }
54879     // Lookup if there is an inverted version of constant vector V in DAG.
54880     if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
54881       if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
54882         V = NegV;
54883         return true;
54884       }
54885     }
54886     return false;
54887   };
54888
54889   // Do not convert the passthru input of scalar intrinsics.
54890   // FIXME: We could allow negations of the lower element only.
54891   bool NegA = invertIfNegative(A);
54892   bool NegB = invertIfNegative(B);
54893   bool NegC = invertIfNegative(C);
54894
54895   if (!NegA && !NegB && !NegC)
54896     return SDValue();
54897
54898   unsigned NewOpcode =
54899       negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
54900
54901   // Propagate fast-math-flags to new FMA node.
54902   SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
54903   if (IsStrict) {
54904     assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
54905     return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
54906                        {N->getOperand(0), A, B, C});
54907   } else {
54908     if (N->getNumOperands() == 4)
54909       return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
54910     return DAG.getNode(NewOpcode, dl, VT, A, B, C);
54911   }
54912 }
54913
54914 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
54915 // Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
54916 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
54917                                TargetLowering::DAGCombinerInfo &DCI) {
54918   SDLoc dl(N);
54919   EVT VT = N->getValueType(0);
54920   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54921   bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54922   bool LegalOperations = !DCI.isBeforeLegalizeOps();
54923
54924   SDValue N2 = N->getOperand(2);
54925
54926   SDValue NegN2 =
54927       TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
54928   if (!NegN2)
54929     return SDValue();
54930   unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
54931
54932   if (N->getNumOperands() == 4)
54933     return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54934                        NegN2, N->getOperand(3));
54935   return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54936                      NegN2);
54937 }
54938
54939 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
54940                            TargetLowering::DAGCombinerInfo &DCI,
54941                            const X86Subtarget &Subtarget) {
54942   SDLoc dl(N);
54943   SDValue N0 = N->getOperand(0);
54944   EVT VT = N->getValueType(0);
54945
54946   // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54947   // FIXME: Is this needed? We don't seem to have any tests for it.
54948   if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
54949       N0.getOpcode() == X86ISD::SETCC_CARRY) {
54950     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
54951                                  N0->getOperand(1));
54952     bool ReplaceOtherUses = !N0.hasOneUse();
54953     DCI.CombineTo(N, Setcc);
54954     // Replace other uses with a truncate of the widened setcc_carry.
54955     if (ReplaceOtherUses) {
54956       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54957                                   N0.getValueType(), Setcc);
54958       DCI.CombineTo(N0.getNode(), Trunc);
54959     }
54960
54961     return SDValue(N, 0);
54962   }
54963
54964   if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54965     return NewCMov;
54966
54967   if (DCI.isBeforeLegalizeOps())
54968     if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54969       return V;
54970
54971   if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
54972                                                  DAG, DCI, Subtarget))
54973     return V;
54974
54975   if (VT.isVector())
54976     if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54977       return R;
54978
54979   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54980     return NewAdd;
54981
54982   if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
54983     return R;
54984
54985   // TODO: Combine with any target/faux shuffle.
54986   if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
54987       VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
54988     SDValue N00 = N0.getOperand(0);
54989     SDValue N01 = N0.getOperand(1);
54990     unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
54991     APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
54992     if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
54993         (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
54994       return concatSubVectors(N00, N01, DAG, dl);
54995     }
54996   }
54997
54998   return SDValue();
54999 }
55000
55001 /// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
55002 /// pre-promote its result type since vXi1 vectors don't get promoted
55003 /// during type legalization.
55004 static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
55005                                         SDValue RHS, ISD::CondCode CC,
55006                                         const SDLoc &DL, SelectionDAG &DAG,
55007                                         const X86Subtarget &Subtarget) {
55008   if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
55009       VT.getVectorElementType() == MVT::i1 &&
55010       (OpVT.getVectorElementType() == MVT::i8 ||
55011        OpVT.getVectorElementType() == MVT::i16)) {
55012     SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
55013     return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
55014   }
55015   return SDValue();
55016 }
55017
55018 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
55019                             TargetLowering::DAGCombinerInfo &DCI,
55020                             const X86Subtarget &Subtarget) {
55021   const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
55022   const SDValue LHS = N->getOperand(0);
55023   const SDValue RHS = N->getOperand(1);
55024   EVT VT = N->getValueType(0);
55025   EVT OpVT = LHS.getValueType();
55026   SDLoc DL(N);
55027
55028   if (CC == ISD::SETNE || CC == ISD::SETEQ) {
55029     if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
55030                                                     Subtarget))
55031       return V;
55032
55033     if (VT == MVT::i1) {
55034       X86::CondCode X86CC;
55035       if (SDValue V =
55036               MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
55037         return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
55038     }
55039
55040     if (OpVT.isScalarInteger()) {
55041       // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
55042       // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
55043       auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
55044         if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
55045           if (N0.getOperand(0) == N1)
55046             return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
55047                                N0.getOperand(1));
55048           if (N0.getOperand(1) == N1)
55049             return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
55050                                N0.getOperand(0));
55051         }
55052         return SDValue();
55053       };
55054       if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
55055         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
55056       if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
55057         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
55058
55059       // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
55060       // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
55061       auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
55062         if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
55063           if (N0.getOperand(0) == N1)
55064             return DAG.getNode(ISD::AND, DL, OpVT, N1,
55065                                DAG.getNOT(DL, N0.getOperand(1), OpVT));
55066           if (N0.getOperand(1) == N1)
55067             return DAG.getNode(ISD::AND, DL, OpVT, N1,
55068                                DAG.getNOT(DL, N0.getOperand(0), OpVT));
55069         }
55070         return SDValue();
55071       };
55072       if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
55073         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
55074       if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
55075         return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
55076
55077       // cmpeq(trunc(x),C) --> cmpeq(x,C)
55078       // cmpne(trunc(x),C) --> cmpne(x,C)
55079       // iff x upper bits are zero.
55080       if (LHS.getOpcode() == ISD::TRUNCATE &&
55081           LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
55082           isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
55083         EVT SrcVT = LHS.getOperand(0).getValueType();
55084         APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
55085                                                 OpVT.getScalarSizeInBits());
55086         const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55087         auto *C = cast<ConstantSDNode>(RHS);
55088         if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
55089             TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
55090           return DAG.getSetCC(DL, VT, LHS.getOperand(0),
55091                               DAG.getConstant(C->getAPIntValue().zextOrTrunc(
55092                                                   SrcVT.getScalarSizeInBits()),
55093                                               DL, SrcVT),
55094                               CC);
55095       }
55096
55097       // With C as a power of 2 and C != 0 and C != INT_MIN:
55098       //    icmp eq Abs(X) C ->
55099       //        (icmp eq A, C) | (icmp eq A, -C)
55100       //    icmp ne Abs(X) C ->
55101       //        (icmp ne A, C) & (icmp ne A, -C)
55102       // Both of these patterns can be better optimized in
55103       // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
55104       // integers which is checked above.
55105       if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
55106         if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
55107           const APInt &CInt = C->getAPIntValue();
55108           // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
55109           if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
55110             SDValue BaseOp = LHS.getOperand(0);
55111             SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
55112             SDValue SETCC1 = DAG.getSetCC(
55113                 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
55114             return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
55115                                SETCC0, SETCC1);
55116           }
55117         }
55118       }
55119     }
55120   }
55121
55122   if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
55123       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
55124     // Using temporaries to avoid messing up operand ordering for later
55125     // transformations if this doesn't work.
55126     SDValue Op0 = LHS;
55127     SDValue Op1 = RHS;
55128     ISD::CondCode TmpCC = CC;
55129     // Put build_vector on the right.
55130     if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
55131       std::swap(Op0, Op1);
55132       TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
55133     }
55134
55135     bool IsSEXT0 =
55136         (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
55137         (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
55138     bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
55139
55140     if (IsSEXT0 && IsVZero1) {
55141       assert(VT == Op0.getOperand(0).getValueType() &&
55142              "Unexpected operand type");
55143       if (TmpCC == ISD::SETGT)
55144         return DAG.getConstant(0, DL, VT);
55145       if (TmpCC == ISD::SETLE)
55146         return DAG.getConstant(1, DL, VT);
55147       if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
55148         return DAG.getNOT(DL, Op0.getOperand(0), VT);
55149
55150       assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
55151              "Unexpected condition code!");
55152       return Op0.getOperand(0);
55153     }
55154   }
55155
55156   // Try and make unsigned vector comparison signed. On pre AVX512 targets there
55157   // only are unsigned comparisons (`PCMPGT`) and on AVX512 its often better to
55158   // use `PCMPGT` if the result is mean to stay in a vector (and if its going to
55159   // a mask, there are signed AVX512 comparisons).
55160   if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
55161     bool CanMakeSigned = false;
55162     if (ISD::isUnsignedIntSetCC(CC)) {
55163       KnownBits CmpKnown =
55164           DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
55165       // If we know LHS/RHS share the same sign bit at each element we can
55166       // make this signed.
55167       // NOTE: `computeKnownBits` on a vector type aggregates common bits
55168       // across all lanes. So a pattern where the sign varies from lane to
55169       // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
55170       // missed. We could get around this by demanding each lane
55171       // independently, but this isn't the most important optimization and
55172       // that may eat into compile time.
55173       CanMakeSigned =
55174           CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
55175     }
55176     if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
55177       SDValue LHSOut = LHS;
55178       SDValue RHSOut = RHS;
55179       ISD::CondCode NewCC = CC;
55180       switch (CC) {
55181       case ISD::SETGE:
55182       case ISD::SETUGE:
55183         if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
55184                                                   /*NSW*/ true))
55185           LHSOut = NewLHS;
55186         else if (SDValue NewRHS = incDecVectorConstant(
55187                      RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
55188           RHSOut = NewRHS;
55189         else
55190           break;
55191
55192         [[fallthrough]];
55193       case ISD::SETUGT:
55194         NewCC = ISD::SETGT;
55195         break;
55196
55197       case ISD::SETLE:
55198       case ISD::SETULE:
55199         if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
55200                                                   /*NSW*/ true))
55201           LHSOut = NewLHS;
55202         else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
55203                                                        /*NSW*/ true))
55204           RHSOut = NewRHS;
55205         else
55206           break;
55207
55208         [[fallthrough]];
55209       case ISD::SETULT:
55210         // Will be swapped to SETGT in LowerVSETCC*.
55211         NewCC = ISD::SETLT;
55212         break;
55213       default:
55214         break;
55215       }
55216       if (NewCC != CC) {
55217         if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
55218                                                  NewCC, DL, DAG, Subtarget))
55219           return R;
55220         return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
55221       }
55222     }
55223   }
55224
55225   if (SDValue R =
55226           truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
55227     return R;
55228
55229   // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
55230   // to avoid scalarization via legalization because v4i32 is not a legal type.
55231   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
55232       LHS.getValueType() == MVT::v4f32)
55233     return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
55234
55235   // X pred 0.0 --> X pred -X
55236   // If the negation of X already exists, use it in the comparison. This removes
55237   // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
55238   // instructions in patterns with a 'select' node.
55239   if (isNullFPScalarOrVectorConst(RHS)) {
55240     SDVTList FNegVT = DAG.getVTList(OpVT);
55241     if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
55242       return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
55243   }
55244
55245   return SDValue();
55246 }
55247
55248 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
55249                              TargetLowering::DAGCombinerInfo &DCI,
55250                              const X86Subtarget &Subtarget) {
55251   SDValue Src = N->getOperand(0);
55252   MVT SrcVT = Src.getSimpleValueType();
55253   MVT VT = N->getSimpleValueType(0);
55254   unsigned NumBits = VT.getScalarSizeInBits();
55255   unsigned NumElts = SrcVT.getVectorNumElements();
55256   unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
55257   assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
55258
55259   // Perform constant folding.
55260   APInt UndefElts;
55261   SmallVector<APInt, 32> EltBits;
55262   if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) {
55263     APInt Imm(32, 0);
55264     for (unsigned Idx = 0; Idx != NumElts; ++Idx)
55265       if (!UndefElts[Idx] && EltBits[Idx].isNegative())
55266         Imm.setBit(Idx);
55267
55268     return DAG.getConstant(Imm, SDLoc(N), VT);
55269   }
55270
55271   // Look through int->fp bitcasts that don't change the element width.
55272   unsigned EltWidth = SrcVT.getScalarSizeInBits();
55273   if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
55274       Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
55275     return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
55276
55277   // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
55278   // with scalar comparisons.
55279   if (SDValue NotSrc = IsNOT(Src, DAG)) {
55280     SDLoc DL(N);
55281     APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
55282     NotSrc = DAG.getBitcast(SrcVT, NotSrc);
55283     return DAG.getNode(ISD::XOR, DL, VT,
55284                        DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
55285                        DAG.getConstant(NotMask, DL, VT));
55286   }
55287
55288   // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
55289   // results with scalar comparisons.
55290   if (Src.getOpcode() == X86ISD::PCMPGT &&
55291       ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
55292     SDLoc DL(N);
55293     APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
55294     return DAG.getNode(ISD::XOR, DL, VT,
55295                        DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
55296                        DAG.getConstant(NotMask, DL, VT));
55297   }
55298
55299   // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
55300   // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
55301   // iff pow2splat(c1).
55302   // Use KnownBits to determine if only a single bit is non-zero
55303   // in each element (pow2 or zero), and shift that bit to the msb.
55304   if (Src.getOpcode() == X86ISD::PCMPEQ) {
55305     KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
55306     KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
55307     unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
55308     if (KnownLHS.countMaxPopulation() == 1 &&
55309         (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
55310                                ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
55311       SDLoc DL(N);
55312       MVT ShiftVT = SrcVT;
55313       SDValue ShiftLHS = Src.getOperand(0);
55314       SDValue ShiftRHS = Src.getOperand(1);
55315       if (ShiftVT.getScalarType() == MVT::i8) {
55316         // vXi8 shifts - we only care about the signbit so can use PSLLW.
55317         ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
55318         ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
55319         ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
55320       }
55321       ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
55322                                             ShiftLHS, ShiftAmt, DAG);
55323       ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
55324                                             ShiftRHS, ShiftAmt, DAG);
55325       ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
55326       ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
55327       SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
55328       return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
55329     }
55330   }
55331
55332   // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
55333   if (N->isOnlyUserOf(Src.getNode())) {
55334     SDValue SrcBC = peekThroughOneUseBitcasts(Src);
55335     if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
55336       APInt UndefElts;
55337       SmallVector<APInt, 32> EltBits;
55338       if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
55339                                         UndefElts, EltBits)) {
55340         APInt Mask = APInt::getZero(NumBits);
55341         for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
55342           if (!UndefElts[Idx] && EltBits[Idx].isNegative())
55343             Mask.setBit(Idx);
55344         }
55345         SDLoc DL(N);
55346         SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
55347         SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
55348         return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
55349                            DAG.getConstant(Mask, DL, VT));
55350       }
55351     }
55352   }
55353
55354   // Simplify the inputs.
55355   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55356   APInt DemandedMask(APInt::getAllOnes(NumBits));
55357   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55358     return SDValue(N, 0);
55359
55360   return SDValue();
55361 }
55362
55363 static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
55364                             TargetLowering::DAGCombinerInfo &DCI,
55365                             const X86Subtarget &Subtarget) {
55366   MVT VT = N->getSimpleValueType(0);
55367   unsigned NumBits = VT.getScalarSizeInBits();
55368
55369   // Simplify the inputs.
55370   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55371   APInt DemandedMask(APInt::getAllOnes(NumBits));
55372   if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55373     return SDValue(N, 0);
55374
55375   return SDValue();
55376 }
55377
55378 static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
55379                                        TargetLowering::DAGCombinerInfo &DCI,
55380                                        const X86Subtarget &Subtarget) {
55381   auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
55382   SDValue BasePtr = MemOp->getBasePtr();
55383   SDValue Index = MemOp->getIndex();
55384   SDValue Scale = MemOp->getScale();
55385   SDValue Mask = MemOp->getMask();
55386
55387   // Attempt to fold an index scale into the scale value directly.
55388   // For smaller indices, implicit sext is performed BEFORE scale, preventing
55389   // this fold under most circumstances.
55390   // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
55391   if ((Index.getOpcode() == X86ISD::VSHLI ||
55392        (Index.getOpcode() == ISD::ADD &&
55393         Index.getOperand(0) == Index.getOperand(1))) &&
55394       isa<ConstantSDNode>(Scale) &&
55395       BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
55396     unsigned ShiftAmt =
55397         Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
55398     uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
55399     uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
55400     if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
55401       SDValue NewIndex = Index.getOperand(0);
55402       SDValue NewScale =
55403           DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
55404       if (N->getOpcode() == X86ISD::MGATHER)
55405         return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
55406                                  MemOp->getOperand(1), Mask,
55407                                  MemOp->getBasePtr(), NewIndex, NewScale,
55408                                  MemOp->getChain(), Subtarget);
55409       if (N->getOpcode() == X86ISD::MSCATTER)
55410         return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
55411                               MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
55412                               NewIndex, NewScale, MemOp->getChain(), Subtarget);
55413     }
55414   }
55415
55416   // With vector masks we only demand the upper bit of the mask.
55417   if (Mask.getScalarValueSizeInBits() != 1) {
55418     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55419     APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
55420     if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
55421       if (N->getOpcode() != ISD::DELETED_NODE)
55422         DCI.AddToWorklist(N);
55423       return SDValue(N, 0);
55424     }
55425   }
55426
55427   return SDValue();
55428 }
55429
55430 static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
55431                                     SDValue Index, SDValue Base, SDValue Scale,
55432                                     SelectionDAG &DAG) {
55433   SDLoc DL(GorS);
55434
55435   if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
55436     SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
55437                       Gather->getMask(), Base, Index, Scale } ;
55438     return DAG.getMaskedGather(Gather->getVTList(),
55439                                Gather->getMemoryVT(), DL, Ops,
55440                                Gather->getMemOperand(),
55441                                Gather->getIndexType(),
55442                                Gather->getExtensionType());
55443   }
55444   auto *Scatter = cast<MaskedScatterSDNode>(GorS);
55445   SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
55446                     Scatter->getMask(), Base, Index, Scale };
55447   return DAG.getMaskedScatter(Scatter->getVTList(),
55448                               Scatter->getMemoryVT(), DL,
55449                               Ops, Scatter->getMemOperand(),
55450                               Scatter->getIndexType(),
55451                               Scatter->isTruncatingStore());
55452 }
55453
55454 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
55455                                     TargetLowering::DAGCombinerInfo &DCI) {
55456   SDLoc DL(N);
55457   auto *GorS = cast<MaskedGatherScatterSDNode>(N);
55458   SDValue Index = GorS->getIndex();
55459   SDValue Base = GorS->getBasePtr();
55460   SDValue Scale = GorS->getScale();
55461
55462   if (DCI.isBeforeLegalize()) {
55463     unsigned IndexWidth = Index.getScalarValueSizeInBits();
55464
55465     // Shrink constant indices if they are larger than 32-bits.
55466     // Only do this before legalize types since v2i64 could become v2i32.
55467     // FIXME: We could check that the type is legal if we're after legalize
55468     // types, but then we would need to construct test cases where that happens.
55469     // FIXME: We could support more than just constant vectors, but we need to
55470     // careful with costing. A truncate that can be optimized out would be fine.
55471     // Otherwise we might only want to create a truncate if it avoids a split.
55472     if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
55473       if (BV->isConstant() && IndexWidth > 32 &&
55474           DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
55475         EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
55476         Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
55477         return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55478       }
55479     }
55480
55481     // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
55482     // there are sufficient sign bits. Only do this before legalize types to
55483     // avoid creating illegal types in truncate.
55484     if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
55485          Index.getOpcode() == ISD::ZERO_EXTEND) &&
55486         IndexWidth > 32 &&
55487         Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
55488         DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
55489       EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
55490       Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
55491       return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55492     }
55493   }
55494
55495   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55496   EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
55497   // Try to move splat constant adders from the index operand to the base
55498   // pointer operand. Taking care to multiply by the scale. We can only do
55499   // this when index element type is the same as the pointer type.
55500   // Otherwise we need to be sure the math doesn't wrap before the scale.
55501   if (Index.getOpcode() == ISD::ADD &&
55502       Index.getValueType().getVectorElementType() == PtrVT &&
55503       isa<ConstantSDNode>(Scale)) {
55504     uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
55505     if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
55506       BitVector UndefElts;
55507       if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
55508         // FIXME: Allow non-constant?
55509         if (UndefElts.none()) {
55510           // Apply the scale.
55511           APInt Adder = C->getAPIntValue() * ScaleAmt;
55512           // Add it to the existing base.
55513           Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
55514                              DAG.getConstant(Adder, DL, PtrVT));
55515           Index = Index.getOperand(0);
55516           return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55517         }
55518       }
55519
55520       // It's also possible base is just a constant. In that case, just
55521       // replace it with 0 and move the displacement into the index.
55522       if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
55523           isOneConstant(Scale)) {
55524         SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
55525         // Combine the constant build_vector and the constant base.
55526         Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
55527                             Index.getOperand(1), Splat);
55528         // Add to the LHS of the original Index add.
55529         Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
55530                             Index.getOperand(0), Splat);
55531         Base = DAG.getConstant(0, DL, Base.getValueType());
55532         return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55533       }
55534     }
55535   }
55536
55537   if (DCI.isBeforeLegalizeOps()) {
55538     unsigned IndexWidth = Index.getScalarValueSizeInBits();
55539
55540     // Make sure the index is either i32 or i64
55541     if (IndexWidth != 32 && IndexWidth != 64) {
55542       MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
55543       EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
55544       Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
55545       return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
55546     }
55547   }
55548
55549   // With vector masks we only demand the upper bit of the mask.
55550   SDValue Mask = GorS->getMask();
55551   if (Mask.getScalarValueSizeInBits() != 1) {
55552     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55553     APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
55554     if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
55555       if (N->getOpcode() != ISD::DELETED_NODE)
55556         DCI.AddToWorklist(N);
55557       return SDValue(N, 0);
55558     }
55559   }
55560
55561   return SDValue();
55562 }
55563
55564 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
55565 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
55566                                const X86Subtarget &Subtarget) {
55567   SDLoc DL(N);
55568   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
55569   SDValue EFLAGS = N->getOperand(1);
55570
55571   // Try to simplify the EFLAGS and condition code operands.
55572   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
55573     return getSETCC(CC, Flags, DL, DAG);
55574
55575   return SDValue();
55576 }
55577
55578 /// Optimize branch condition evaluation.
55579 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
55580                              const X86Subtarget &Subtarget) {
55581   SDLoc DL(N);
55582   SDValue EFLAGS = N->getOperand(3);
55583   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
55584
55585   // Try to simplify the EFLAGS and condition code operands.
55586   // Make sure to not keep references to operands, as combineSetCCEFLAGS can
55587   // RAUW them under us.
55588   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
55589     SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
55590     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
55591                        N->getOperand(1), Cond, Flags);
55592   }
55593
55594   return SDValue();
55595 }
55596
55597 // TODO: Could we move this to DAGCombine?
55598 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
55599                                                   SelectionDAG &DAG) {
55600   // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
55601   // to optimize away operation when it's from a constant.
55602   //
55603   // The general transformation is:
55604   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
55605   //       AND(VECTOR_CMP(x,y), constant2)
55606   //    constant2 = UNARYOP(constant)
55607
55608   // Early exit if this isn't a vector operation, the operand of the
55609   // unary operation isn't a bitwise AND, or if the sizes of the operations
55610   // aren't the same.
55611   EVT VT = N->getValueType(0);
55612   bool IsStrict = N->isStrictFPOpcode();
55613   unsigned NumEltBits = VT.getScalarSizeInBits();
55614   SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55615   if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
55616       DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
55617       VT.getSizeInBits() != Op0.getValueSizeInBits())
55618     return SDValue();
55619
55620   // Now check that the other operand of the AND is a constant. We could
55621   // make the transformation for non-constant splats as well, but it's unclear
55622   // that would be a benefit as it would not eliminate any operations, just
55623   // perform one more step in scalar code before moving to the vector unit.
55624   if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
55625     // Bail out if the vector isn't a constant.
55626     if (!BV->isConstant())
55627       return SDValue();
55628
55629     // Everything checks out. Build up the new and improved node.
55630     SDLoc DL(N);
55631     EVT IntVT = BV->getValueType(0);
55632     // Create a new constant of the appropriate type for the transformed
55633     // DAG.
55634     SDValue SourceConst;
55635     if (IsStrict)
55636       SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
55637                                 {N->getOperand(0), SDValue(BV, 0)});
55638     else
55639       SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
55640     // The AND node needs bitcasts to/from an integer vector type around it.
55641     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
55642     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
55643                                  MaskConst);
55644     SDValue Res = DAG.getBitcast(VT, NewAnd);
55645     if (IsStrict)
55646       return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
55647     return Res;
55648   }
55649
55650   return SDValue();
55651 }
55652
55653 /// If we are converting a value to floating-point, try to replace scalar
55654 /// truncate of an extracted vector element with a bitcast. This tries to keep
55655 /// the sequence on XMM registers rather than moving between vector and GPRs.
55656 static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
55657   // TODO: This is currently only used by combineSIntToFP, but it is generalized
55658   //       to allow being called by any similar cast opcode.
55659   // TODO: Consider merging this into lowering: vectorizeExtractedCast().
55660   SDValue Trunc = N->getOperand(0);
55661   if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
55662     return SDValue();
55663
55664   SDValue ExtElt = Trunc.getOperand(0);
55665   if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55666       !isNullConstant(ExtElt.getOperand(1)))
55667     return SDValue();
55668
55669   EVT TruncVT = Trunc.getValueType();
55670   EVT SrcVT = ExtElt.getValueType();
55671   unsigned DestWidth = TruncVT.getSizeInBits();
55672   unsigned SrcWidth = SrcVT.getSizeInBits();
55673   if (SrcWidth % DestWidth != 0)
55674     return SDValue();
55675
55676   // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
55677   EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
55678   unsigned VecWidth = SrcVecVT.getSizeInBits();
55679   unsigned NumElts = VecWidth / DestWidth;
55680   EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
55681   SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
55682   SDLoc DL(N);
55683   SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
55684                                   BitcastVec, ExtElt.getOperand(1));
55685   return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
55686 }
55687
55688 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
55689                                const X86Subtarget &Subtarget) {
55690   bool IsStrict = N->isStrictFPOpcode();
55691   SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55692   EVT VT = N->getValueType(0);
55693   EVT InVT = Op0.getValueType();
55694
55695   // UINT_TO_FP(vXi1~15)  -> UINT_TO_FP(ZEXT(vXi1~15  to vXi16))
55696   // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
55697   // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
55698   if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55699     unsigned ScalarSize = InVT.getScalarSizeInBits();
55700     if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55701       return SDValue();
55702     SDLoc dl(N);
55703     EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55704                                  ScalarSize < 16   ? MVT::i16
55705                                  : ScalarSize < 32 ? MVT::i32
55706                                                    : MVT::i64,
55707                                  InVT.getVectorNumElements());
55708     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55709     if (IsStrict)
55710       return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
55711                          {N->getOperand(0), P});
55712     return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
55713   }
55714
55715   // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
55716   // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
55717   // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
55718   if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55719       VT.getScalarType() != MVT::f16) {
55720     SDLoc dl(N);
55721     EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55722     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55723
55724     // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
55725     if (IsStrict)
55726       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55727                          {N->getOperand(0), P});
55728     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55729   }
55730
55731   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
55732   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
55733   // the optimization here.
55734   if (DAG.SignBitIsZero(Op0)) {
55735     if (IsStrict)
55736       return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
55737                          {N->getOperand(0), Op0});
55738     return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
55739   }
55740
55741   return SDValue();
55742 }
55743
55744 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
55745                                TargetLowering::DAGCombinerInfo &DCI,
55746                                const X86Subtarget &Subtarget) {
55747   // First try to optimize away the conversion entirely when it's
55748   // conditionally from a constant. Vectors only.
55749   bool IsStrict = N->isStrictFPOpcode();
55750   if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
55751     return Res;
55752
55753   // Now move on to more general possibilities.
55754   SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55755   EVT VT = N->getValueType(0);
55756   EVT InVT = Op0.getValueType();
55757
55758   // SINT_TO_FP(vXi1~15)  -> SINT_TO_FP(SEXT(vXi1~15  to vXi16))
55759   // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
55760   // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
55761   if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55762     unsigned ScalarSize = InVT.getScalarSizeInBits();
55763     if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55764       return SDValue();
55765     SDLoc dl(N);
55766     EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55767                                  ScalarSize < 16   ? MVT::i16
55768                                  : ScalarSize < 32 ? MVT::i32
55769                                                    : MVT::i64,
55770                                  InVT.getVectorNumElements());
55771     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55772     if (IsStrict)
55773       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55774                          {N->getOperand(0), P});
55775     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55776   }
55777
55778   // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
55779   // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
55780   // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
55781   if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55782       VT.getScalarType() != MVT::f16) {
55783     SDLoc dl(N);
55784     EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55785     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55786     if (IsStrict)
55787       return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55788                          {N->getOperand(0), P});
55789     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55790   }
55791
55792   // Without AVX512DQ we only support i64 to float scalar conversion. For both
55793   // vectors and scalars, see if we know that the upper bits are all the sign
55794   // bit, in which case we can truncate the input to i32 and convert from that.
55795   if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
55796     unsigned BitWidth = InVT.getScalarSizeInBits();
55797     unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
55798     if (NumSignBits >= (BitWidth - 31)) {
55799       EVT TruncVT = MVT::i32;
55800       if (InVT.isVector())
55801         TruncVT = InVT.changeVectorElementType(TruncVT);
55802       SDLoc dl(N);
55803       if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
55804         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
55805         if (IsStrict)
55806           return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55807                              {N->getOperand(0), Trunc});
55808         return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
55809       }
55810       // If we're after legalize and the type is v2i32 we need to shuffle and
55811       // use CVTSI2P.
55812       assert(InVT == MVT::v2i64 && "Unexpected VT!");
55813       SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
55814       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
55815                                           { 0, 2, -1, -1 });
55816       if (IsStrict)
55817         return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
55818                            {N->getOperand(0), Shuf});
55819       return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
55820     }
55821   }
55822
55823   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
55824   // a 32-bit target where SSE doesn't support i64->FP operations.
55825   if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
55826       Op0.getOpcode() == ISD::LOAD) {
55827     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
55828
55829     // This transformation is not supported if the result type is f16 or f128.
55830     if (VT == MVT::f16 || VT == MVT::f128)
55831       return SDValue();
55832
55833     // If we have AVX512DQ we can use packed conversion instructions unless
55834     // the VT is f80.
55835     if (Subtarget.hasDQI() && VT != MVT::f80)
55836       return SDValue();
55837
55838     if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
55839         Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
55840       std::pair<SDValue, SDValue> Tmp =
55841           Subtarget.getTargetLowering()->BuildFILD(
55842               VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
55843               Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
55844       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
55845       return Tmp.first;
55846     }
55847   }
55848
55849   if (IsStrict)
55850     return SDValue();
55851
55852   if (SDValue V = combineToFPTruncExtElt(N, DAG))
55853     return V;
55854
55855   return SDValue();
55856 }
55857
55858 static bool needCarryOrOverflowFlag(SDValue Flags) {
55859   assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55860
55861   for (const SDNode *User : Flags->uses()) {
55862     X86::CondCode CC;
55863     switch (User->getOpcode()) {
55864     default:
55865       // Be conservative.
55866       return true;
55867     case X86ISD::SETCC:
55868     case X86ISD::SETCC_CARRY:
55869       CC = (X86::CondCode)User->getConstantOperandVal(0);
55870       break;
55871     case X86ISD::BRCOND:
55872     case X86ISD::CMOV:
55873       CC = (X86::CondCode)User->getConstantOperandVal(2);
55874       break;
55875     }
55876
55877     switch (CC) {
55878     default: break;
55879     case X86::COND_A: case X86::COND_AE:
55880     case X86::COND_B: case X86::COND_BE:
55881     case X86::COND_O: case X86::COND_NO:
55882     case X86::COND_G: case X86::COND_GE:
55883     case X86::COND_L: case X86::COND_LE:
55884       return true;
55885     }
55886   }
55887
55888   return false;
55889 }
55890
55891 static bool onlyZeroFlagUsed(SDValue Flags) {
55892   assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55893
55894   for (const SDNode *User : Flags->uses()) {
55895     unsigned CCOpNo;
55896     switch (User->getOpcode()) {
55897     default:
55898       // Be conservative.
55899       return false;
55900     case X86ISD::SETCC:
55901     case X86ISD::SETCC_CARRY:
55902       CCOpNo = 0;
55903       break;
55904     case X86ISD::BRCOND:
55905     case X86ISD::CMOV:
55906       CCOpNo = 2;
55907       break;
55908     }
55909
55910     X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
55911     if (CC != X86::COND_E && CC != X86::COND_NE)
55912       return false;
55913   }
55914
55915   return true;
55916 }
55917
55918 static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
55919   // Only handle test patterns.
55920   if (!isNullConstant(N->getOperand(1)))
55921     return SDValue();
55922
55923   // If we have a CMP of a truncated binop, see if we can make a smaller binop
55924   // and use its flags directly.
55925   // TODO: Maybe we should try promoting compares that only use the zero flag
55926   // first if we can prove the upper bits with computeKnownBits?
55927   SDLoc dl(N);
55928   SDValue Op = N->getOperand(0);
55929   EVT VT = Op.getValueType();
55930
55931   // If we have a constant logical shift that's only used in a comparison
55932   // against zero turn it into an equivalent AND. This allows turning it into
55933   // a TEST instruction later.
55934   if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
55935       Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
55936       onlyZeroFlagUsed(SDValue(N, 0))) {
55937     unsigned BitWidth = VT.getSizeInBits();
55938     const APInt &ShAmt = Op.getConstantOperandAPInt(1);
55939     if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
55940       unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
55941       APInt Mask = Op.getOpcode() == ISD::SRL
55942                        ? APInt::getHighBitsSet(BitWidth, MaskBits)
55943                        : APInt::getLowBitsSet(BitWidth, MaskBits);
55944       if (Mask.isSignedIntN(32)) {
55945         Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
55946                          DAG.getConstant(Mask, dl, VT));
55947         return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55948                            DAG.getConstant(0, dl, VT));
55949       }
55950     }
55951   }
55952
55953   // Peek through any zero-extend if we're only testing for a zero result.
55954   if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
55955     SDValue Src = Op.getOperand(0);
55956     EVT SrcVT = Src.getValueType();
55957     if (SrcVT.getScalarSizeInBits() >= 8 &&
55958         DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
55959       return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
55960                          DAG.getConstant(0, dl, SrcVT));
55961   }
55962
55963   // Look for a truncate.
55964   if (Op.getOpcode() != ISD::TRUNCATE)
55965     return SDValue();
55966
55967   SDValue Trunc = Op;
55968   Op = Op.getOperand(0);
55969
55970   // See if we can compare with zero against the truncation source,
55971   // which should help using the Z flag from many ops. Only do this for
55972   // i32 truncated op to prevent partial-reg compares of promoted ops.
55973   EVT OpVT = Op.getValueType();
55974   APInt UpperBits =
55975       APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
55976   if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
55977       onlyZeroFlagUsed(SDValue(N, 0))) {
55978     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55979                        DAG.getConstant(0, dl, OpVT));
55980   }
55981
55982   // After this the truncate and arithmetic op must have a single use.
55983   if (!Trunc.hasOneUse() || !Op.hasOneUse())
55984       return SDValue();
55985
55986   unsigned NewOpc;
55987   switch (Op.getOpcode()) {
55988   default: return SDValue();
55989   case ISD::AND:
55990     // Skip and with constant. We have special handling for and with immediate
55991     // during isel to generate test instructions.
55992     if (isa<ConstantSDNode>(Op.getOperand(1)))
55993       return SDValue();
55994     NewOpc = X86ISD::AND;
55995     break;
55996   case ISD::OR:  NewOpc = X86ISD::OR;  break;
55997   case ISD::XOR: NewOpc = X86ISD::XOR; break;
55998   case ISD::ADD:
55999     // If the carry or overflow flag is used, we can't truncate.
56000     if (needCarryOrOverflowFlag(SDValue(N, 0)))
56001       return SDValue();
56002     NewOpc = X86ISD::ADD;
56003     break;
56004   case ISD::SUB:
56005     // If the carry or overflow flag is used, we can't truncate.
56006     if (needCarryOrOverflowFlag(SDValue(N, 0)))
56007       return SDValue();
56008     NewOpc = X86ISD::SUB;
56009     break;
56010   }
56011
56012   // We found an op we can narrow. Truncate its inputs.
56013   SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
56014   SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
56015
56016   // Use a X86 specific opcode to avoid DAG combine messing with it.
56017   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
56018   Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
56019
56020   // For AND, keep a CMP so that we can match the test pattern.
56021   if (NewOpc == X86ISD::AND)
56022     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
56023                        DAG.getConstant(0, dl, VT));
56024
56025   // Return the flags.
56026   return Op.getValue(1);
56027 }
56028
56029 static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
56030                                 TargetLowering::DAGCombinerInfo &DCI) {
56031   assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
56032          "Expected X86ISD::ADD or X86ISD::SUB");
56033
56034   SDLoc DL(N);
56035   SDValue LHS = N->getOperand(0);
56036   SDValue RHS = N->getOperand(1);
56037   MVT VT = LHS.getSimpleValueType();
56038   bool IsSub = X86ISD::SUB == N->getOpcode();
56039   unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
56040
56041   // If we don't use the flag result, simplify back to a generic ADD/SUB.
56042   if (!N->hasAnyUseOfValue(1)) {
56043     SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
56044     return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
56045   }
56046
56047   // Fold any similar generic ADD/SUB opcodes to reuse this node.
56048   auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
56049     SDValue Ops[] = {N0, N1};
56050     SDVTList VTs = DAG.getVTList(N->getValueType(0));
56051     if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
56052       SDValue Op(N, 0);
56053       if (Negate)
56054         Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
56055       DCI.CombineTo(GenericAddSub, Op);
56056     }
56057   };
56058   MatchGeneric(LHS, RHS, false);
56059   MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
56060
56061   // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
56062   // EFLAGS result doesn't change.
56063   return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
56064                                    /*ZeroSecondOpOnly*/ true);
56065 }
56066
56067 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
56068   SDValue LHS = N->getOperand(0);
56069   SDValue RHS = N->getOperand(1);
56070   SDValue BorrowIn = N->getOperand(2);
56071
56072   if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
56073     MVT VT = N->getSimpleValueType(0);
56074     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
56075     return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
56076   }
56077
56078   // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
56079   // iff the flag result is dead.
56080   if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
56081       !N->hasAnyUseOfValue(1))
56082     return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
56083                        LHS.getOperand(1), BorrowIn);
56084
56085   return SDValue();
56086 }
56087
56088 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
56089 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
56090                           TargetLowering::DAGCombinerInfo &DCI) {
56091   SDValue LHS = N->getOperand(0);
56092   SDValue RHS = N->getOperand(1);
56093   SDValue CarryIn = N->getOperand(2);
56094   auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
56095   auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
56096
56097   // Canonicalize constant to RHS.
56098   if (LHSC && !RHSC)
56099     return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
56100                        CarryIn);
56101
56102   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
56103   // the result is either zero or one (depending on the input carry bit).
56104   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
56105   if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
56106       // We don't have a good way to replace an EFLAGS use, so only do this when
56107       // dead right now.
56108       SDValue(N, 1).use_empty()) {
56109     SDLoc DL(N);
56110     EVT VT = N->getValueType(0);
56111     SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
56112     SDValue Res1 = DAG.getNode(
56113         ISD::AND, DL, VT,
56114         DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
56115                     DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
56116         DAG.getConstant(1, DL, VT));
56117     return DCI.CombineTo(N, Res1, CarryOut);
56118   }
56119
56120   // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
56121   // iff the flag result is dead.
56122   // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
56123   if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
56124     SDLoc DL(N);
56125     APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
56126     return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
56127                        DAG.getConstant(0, DL, LHS.getValueType()),
56128                        DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
56129   }
56130
56131   if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
56132     MVT VT = N->getSimpleValueType(0);
56133     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
56134     return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
56135   }
56136
56137   // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
56138   // iff the flag result is dead.
56139   if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
56140       !N->hasAnyUseOfValue(1))
56141     return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
56142                        LHS.getOperand(1), CarryIn);
56143
56144   return SDValue();
56145 }
56146
56147 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
56148                             const SDLoc &DL, EVT VT,
56149                             const X86Subtarget &Subtarget) {
56150   // Example of pattern we try to detect:
56151   // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
56152   //(add (build_vector (extract_elt t, 0),
56153   //                   (extract_elt t, 2),
56154   //                   (extract_elt t, 4),
56155   //                   (extract_elt t, 6)),
56156   //     (build_vector (extract_elt t, 1),
56157   //                   (extract_elt t, 3),
56158   //                   (extract_elt t, 5),
56159   //                   (extract_elt t, 7)))
56160
56161   if (!Subtarget.hasSSE2())
56162     return SDValue();
56163
56164   if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
56165       Op1.getOpcode() != ISD::BUILD_VECTOR)
56166     return SDValue();
56167
56168   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
56169       VT.getVectorNumElements() < 4 ||
56170       !isPowerOf2_32(VT.getVectorNumElements()))
56171     return SDValue();
56172
56173   // Check if one of Op0,Op1 is of the form:
56174   // (build_vector (extract_elt Mul, 0),
56175   //               (extract_elt Mul, 2),
56176   //               (extract_elt Mul, 4),
56177   //                   ...
56178   // the other is of the form:
56179   // (build_vector (extract_elt Mul, 1),
56180   //               (extract_elt Mul, 3),
56181   //               (extract_elt Mul, 5),
56182   //                   ...
56183   // and identify Mul.
56184   SDValue Mul;
56185   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
56186     SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
56187             Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
56188     // TODO: Be more tolerant to undefs.
56189     if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
56190         Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
56191         Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
56192         Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
56193       return SDValue();
56194     auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
56195     auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
56196     auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
56197     auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
56198     if (!Const0L || !Const1L || !Const0H || !Const1H)
56199       return SDValue();
56200     unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
56201              Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
56202     // Commutativity of mul allows factors of a product to reorder.
56203     if (Idx0L > Idx1L)
56204       std::swap(Idx0L, Idx1L);
56205     if (Idx0H > Idx1H)
56206       std::swap(Idx0H, Idx1H);
56207     // Commutativity of add allows pairs of factors to reorder.
56208     if (Idx0L > Idx0H) {
56209       std::swap(Idx0L, Idx0H);
56210       std::swap(Idx1L, Idx1H);
56211     }
56212     if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
56213         Idx1H != 2 * i + 3)
56214       return SDValue();
56215     if (!Mul) {
56216       // First time an extract_elt's source vector is visited. Must be a MUL
56217       // with 2X number of vector elements than the BUILD_VECTOR.
56218       // Both extracts must be from same MUL.
56219       Mul = Op0L->getOperand(0);
56220       if (Mul->getOpcode() != ISD::MUL ||
56221           Mul.getValueType().getVectorNumElements() != 2 * e)
56222         return SDValue();
56223     }
56224     // Check that the extract is from the same MUL previously seen.
56225     if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
56226         Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
56227       return SDValue();
56228   }
56229
56230   // Check if the Mul source can be safely shrunk.
56231   ShrinkMode Mode;
56232   if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
56233       Mode == ShrinkMode::MULU16)
56234     return SDValue();
56235
56236   EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
56237                                  VT.getVectorNumElements() * 2);
56238   SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
56239   SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
56240
56241   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
56242                          ArrayRef<SDValue> Ops) {
56243     EVT InVT = Ops[0].getValueType();
56244     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
56245     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
56246                                  InVT.getVectorNumElements() / 2);
56247     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
56248   };
56249   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
56250 }
56251
56252 // Attempt to turn this pattern into PMADDWD.
56253 // (add (mul (sext (build_vector)), (sext (build_vector))),
56254 //      (mul (sext (build_vector)), (sext (build_vector)))
56255 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
56256                               const SDLoc &DL, EVT VT,
56257                               const X86Subtarget &Subtarget) {
56258   if (!Subtarget.hasSSE2())
56259     return SDValue();
56260
56261   if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
56262     return SDValue();
56263
56264   if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
56265       VT.getVectorNumElements() < 4 ||
56266       !isPowerOf2_32(VT.getVectorNumElements()))
56267     return SDValue();
56268
56269   SDValue N00 = N0.getOperand(0);
56270   SDValue N01 = N0.getOperand(1);
56271   SDValue N10 = N1.getOperand(0);
56272   SDValue N11 = N1.getOperand(1);
56273
56274   // All inputs need to be sign extends.
56275   // TODO: Support ZERO_EXTEND from known positive?
56276   if (N00.getOpcode() != ISD::SIGN_EXTEND ||
56277       N01.getOpcode() != ISD::SIGN_EXTEND ||
56278       N10.getOpcode() != ISD::SIGN_EXTEND ||
56279       N11.getOpcode() != ISD::SIGN_EXTEND)
56280     return SDValue();
56281
56282   // Peek through the extends.
56283   N00 = N00.getOperand(0);
56284   N01 = N01.getOperand(0);
56285   N10 = N10.getOperand(0);
56286   N11 = N11.getOperand(0);
56287
56288   // Must be extending from vXi16.
56289   EVT InVT = N00.getValueType();
56290   if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
56291       N10.getValueType() != InVT || N11.getValueType() != InVT)
56292     return SDValue();
56293
56294   // All inputs should be build_vectors.
56295   if (N00.getOpcode() != ISD::BUILD_VECTOR ||
56296       N01.getOpcode() != ISD::BUILD_VECTOR ||
56297       N10.getOpcode() != ISD::BUILD_VECTOR ||
56298       N11.getOpcode() != ISD::BUILD_VECTOR)
56299     return SDValue();
56300
56301   // For each element, we need to ensure we have an odd element from one vector
56302   // multiplied by the odd element of another vector and the even element from
56303   // one of the same vectors being multiplied by the even element from the
56304   // other vector. So we need to make sure for each element i, this operator
56305   // is being performed:
56306   //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
56307   SDValue In0, In1;
56308   for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
56309     SDValue N00Elt = N00.getOperand(i);
56310     SDValue N01Elt = N01.getOperand(i);
56311     SDValue N10Elt = N10.getOperand(i);
56312     SDValue N11Elt = N11.getOperand(i);
56313     // TODO: Be more tolerant to undefs.
56314     if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
56315         N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
56316         N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
56317         N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
56318       return SDValue();
56319     auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
56320     auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
56321     auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
56322     auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
56323     if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
56324       return SDValue();
56325     unsigned IdxN00 = ConstN00Elt->getZExtValue();
56326     unsigned IdxN01 = ConstN01Elt->getZExtValue();
56327     unsigned IdxN10 = ConstN10Elt->getZExtValue();
56328     unsigned IdxN11 = ConstN11Elt->getZExtValue();
56329     // Add is commutative so indices can be reordered.
56330     if (IdxN00 > IdxN10) {
56331       std::swap(IdxN00, IdxN10);
56332       std::swap(IdxN01, IdxN11);
56333     }
56334     // N0 indices be the even element. N1 indices must be the next odd element.
56335     if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
56336         IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
56337       return SDValue();
56338     SDValue N00In = N00Elt.getOperand(0);
56339     SDValue N01In = N01Elt.getOperand(0);
56340     SDValue N10In = N10Elt.getOperand(0);
56341     SDValue N11In = N11Elt.getOperand(0);
56342
56343     // First time we find an input capture it.
56344     if (!In0) {
56345       In0 = N00In;
56346       In1 = N01In;
56347
56348       // The input vectors must be at least as wide as the output.
56349       // If they are larger than the output, we extract subvector below.
56350       if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
56351           In1.getValueSizeInBits() < VT.getSizeInBits())
56352         return SDValue();
56353     }
56354     // Mul is commutative so the input vectors can be in any order.
56355     // Canonicalize to make the compares easier.
56356     if (In0 != N00In)
56357       std::swap(N00In, N01In);
56358     if (In0 != N10In)
56359       std::swap(N10In, N11In);
56360     if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
56361       return SDValue();
56362   }
56363
56364   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
56365                          ArrayRef<SDValue> Ops) {
56366     EVT OpVT = Ops[0].getValueType();
56367     assert(OpVT.getScalarType() == MVT::i16 &&
56368            "Unexpected scalar element type");
56369     assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
56370     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
56371                                  OpVT.getVectorNumElements() / 2);
56372     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
56373   };
56374
56375   // If the output is narrower than an input, extract the low part of the input
56376   // vector.
56377   EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
56378                                VT.getVectorNumElements() * 2);
56379   if (OutVT16.bitsLT(In0.getValueType())) {
56380     In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
56381                       DAG.getIntPtrConstant(0, DL));
56382   }
56383   if (OutVT16.bitsLT(In1.getValueType())) {
56384     In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
56385                       DAG.getIntPtrConstant(0, DL));
56386   }
56387   return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
56388                           PMADDBuilder);
56389 }
56390
56391 // ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
56392 // If upper element in each pair of both VPMADDWD are zero then we can merge
56393 // the operand elements and use the implicit add of VPMADDWD.
56394 // TODO: Add support for VPMADDUBSW (which isn't commutable).
56395 static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
56396                                    const SDLoc &DL, EVT VT) {
56397   if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
56398     return SDValue();
56399
56400   // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
56401   if (VT.getSizeInBits() > 128)
56402     return SDValue();
56403
56404   unsigned NumElts = VT.getVectorNumElements();
56405   MVT OpVT = N0.getOperand(0).getSimpleValueType();
56406   APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
56407   APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
56408
56409   bool Op0HiZero =
56410       DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
56411       DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
56412   bool Op1HiZero =
56413       DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
56414       DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
56415
56416   // TODO: Check for zero lower elements once we have actual codegen that
56417   // creates them.
56418   if (!Op0HiZero || !Op1HiZero)
56419     return SDValue();
56420
56421   // Create a shuffle mask packing the lower elements from each VPMADDWD.
56422   SmallVector<int> Mask;
56423   for (int i = 0; i != (int)NumElts; ++i) {
56424     Mask.push_back(2 * i);
56425     Mask.push_back(2 * (i + NumElts));
56426   }
56427
56428   SDValue LHS =
56429       DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
56430   SDValue RHS =
56431       DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
56432   return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
56433 }
56434
56435 /// CMOV of constants requires materializing constant operands in registers.
56436 /// Try to fold those constants into an 'add' instruction to reduce instruction
56437 /// count. We do this with CMOV rather the generic 'select' because there are
56438 /// earlier folds that may be used to turn select-of-constants into logic hacks.
56439 static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
56440                                        const X86Subtarget &Subtarget) {
56441   // If an operand is zero, add-of-0 gets simplified away, so that's clearly
56442   // better because we eliminate 1-2 instructions. This transform is still
56443   // an improvement without zero operands because we trade 2 move constants and
56444   // 1 add for 2 adds (LEA) as long as the constants can be represented as
56445   // immediate asm operands (fit in 32-bits).
56446   auto isSuitableCmov = [](SDValue V) {
56447     if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
56448       return false;
56449     if (!isa<ConstantSDNode>(V.getOperand(0)) ||
56450         !isa<ConstantSDNode>(V.getOperand(1)))
56451       return false;
56452     return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
56453            (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
56454             V.getConstantOperandAPInt(1).isSignedIntN(32));
56455   };
56456
56457   // Match an appropriate CMOV as the first operand of the add.
56458   SDValue Cmov = N->getOperand(0);
56459   SDValue OtherOp = N->getOperand(1);
56460   if (!isSuitableCmov(Cmov))
56461     std::swap(Cmov, OtherOp);
56462   if (!isSuitableCmov(Cmov))
56463     return SDValue();
56464
56465   // Don't remove a load folding opportunity for the add. That would neutralize
56466   // any improvements from removing constant materializations.
56467   if (X86::mayFoldLoad(OtherOp, Subtarget))
56468     return SDValue();
56469
56470   EVT VT = N->getValueType(0);
56471   SDLoc DL(N);
56472   SDValue FalseOp = Cmov.getOperand(0);
56473   SDValue TrueOp = Cmov.getOperand(1);
56474
56475   // We will push the add through the select, but we can potentially do better
56476   // if we know there is another add in the sequence and this is pointer math.
56477   // In that case, we can absorb an add into the trailing memory op and avoid
56478   // a 3-operand LEA which is likely slower than a 2-operand LEA.
56479   // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
56480   if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
56481       !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
56482       all_of(N->uses(), [&](SDNode *Use) {
56483         auto *MemNode = dyn_cast<MemSDNode>(Use);
56484         return MemNode && MemNode->getBasePtr().getNode() == N;
56485       })) {
56486     // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
56487     // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
56488     //       it is possible that choosing op1 might be better.
56489     SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
56490     FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
56491     TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
56492     Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
56493                        Cmov.getOperand(2), Cmov.getOperand(3));
56494     return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
56495   }
56496
56497   // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
56498   FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
56499   TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
56500   return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
56501                      Cmov.getOperand(3));
56502 }
56503
56504 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
56505                           TargetLowering::DAGCombinerInfo &DCI,
56506                           const X86Subtarget &Subtarget) {
56507   EVT VT = N->getValueType(0);
56508   SDValue Op0 = N->getOperand(0);
56509   SDValue Op1 = N->getOperand(1);
56510   SDLoc DL(N);
56511
56512   if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
56513     return Select;
56514
56515   if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
56516     return MAdd;
56517   if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
56518     return MAdd;
56519   if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
56520     return MAdd;
56521
56522   // Try to synthesize horizontal adds from adds of shuffles.
56523   if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
56524     return V;
56525
56526   // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
56527   // (sub Y, (sext (vXi1 X))).
56528   // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
56529   // generic DAG combine without a legal type check, but adding this there
56530   // caused regressions.
56531   if (VT.isVector()) {
56532     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56533     if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
56534         Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
56535         TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
56536       SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
56537       return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
56538     }
56539
56540     if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
56541         Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
56542         TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
56543       SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
56544       return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
56545     }
56546   }
56547
56548   // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
56549   if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
56550       X86::isZeroNode(Op0.getOperand(1))) {
56551     assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
56552     return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
56553                        Op0.getOperand(0), Op0.getOperand(2));
56554   }
56555
56556   return combineAddOrSubToADCOrSBB(N, DAG);
56557 }
56558
56559 // Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
56560 // condition comes from the subtract node that produced -X. This matches the
56561 // cmov expansion for absolute value. By swapping the operands we convert abs
56562 // to nabs.
56563 static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
56564   SDValue N0 = N->getOperand(0);
56565   SDValue N1 = N->getOperand(1);
56566
56567   if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
56568     return SDValue();
56569
56570   X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
56571   if (CC != X86::COND_S && CC != X86::COND_NS)
56572     return SDValue();
56573
56574   // Condition should come from a negate operation.
56575   SDValue Cond = N1.getOperand(3);
56576   if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
56577     return SDValue();
56578   assert(Cond.getResNo() == 1 && "Unexpected result number");
56579
56580   // Get the X and -X from the negate.
56581   SDValue NegX = Cond.getValue(0);
56582   SDValue X = Cond.getOperand(1);
56583
56584   SDValue FalseOp = N1.getOperand(0);
56585   SDValue TrueOp = N1.getOperand(1);
56586
56587   // Cmov operands should be X and NegX. Order doesn't matter.
56588   if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
56589     return SDValue();
56590
56591   // Build a new CMOV with the operands swapped.
56592   SDLoc DL(N);
56593   MVT VT = N->getSimpleValueType(0);
56594   SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
56595                              N1.getOperand(2), Cond);
56596   // Convert sub to add.
56597   return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
56598 }
56599
56600 static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
56601   SDValue Op0 = N->getOperand(0);
56602   SDValue Op1 = N->getOperand(1);
56603
56604   // (sub C (zero_extend (setcc)))
56605   // =>
56606   // (add (zero_extend (setcc inverted) C-1))   if C is a nonzero immediate
56607   // Don't disturb (sub 0 setcc), which is easily done with neg.
56608   EVT VT = N->getValueType(0);
56609   auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
56610   if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
56611       !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
56612       Op1.getOperand(0).hasOneUse()) {
56613     SDValue SetCC = Op1.getOperand(0);
56614     X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
56615     X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
56616     uint64_t NewImm = Op0C->getZExtValue() - 1;
56617     SDLoc DL(Op1);
56618     SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
56619     NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
56620     return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
56621                        DAG.getConstant(NewImm, DL, VT));
56622   }
56623
56624   return SDValue();
56625 }
56626
56627 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
56628                           TargetLowering::DAGCombinerInfo &DCI,
56629                           const X86Subtarget &Subtarget) {
56630   SDValue Op0 = N->getOperand(0);
56631   SDValue Op1 = N->getOperand(1);
56632
56633   // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
56634   auto IsNonOpaqueConstant = [&](SDValue Op) {
56635     if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
56636       if (auto *Cst = dyn_cast<ConstantSDNode>(C))
56637         return !Cst->isOpaque();
56638       return true;
56639     }
56640     return false;
56641   };
56642
56643   // X86 can't encode an immediate LHS of a sub. See if we can push the
56644   // negation into a preceding instruction. If the RHS of the sub is a XOR with
56645   // one use and a constant, invert the immediate, saving one register.
56646   // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
56647   if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
56648       IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
56649     SDLoc DL(N);
56650     EVT VT = Op0.getValueType();
56651     SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
56652                                  DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
56653     SDValue NewAdd =
56654         DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
56655     return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
56656   }
56657
56658   if (SDValue V = combineSubABS(N, DAG))
56659     return V;
56660
56661   // Try to synthesize horizontal subs from subs of shuffles.
56662   if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
56663     return V;
56664
56665   // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
56666   if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
56667       X86::isZeroNode(Op1.getOperand(1))) {
56668     assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
56669     return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
56670                        Op1.getOperand(0), Op1.getOperand(2));
56671   }
56672
56673   // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
56674   // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
56675   if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
56676       !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
56677     assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
56678     SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
56679                               Op1.getOperand(1), Op1.getOperand(2));
56680     return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
56681                        Op1.getOperand(0));
56682   }
56683
56684   if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
56685     return V;
56686
56687   if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
56688     return V;
56689
56690   return combineSubSetcc(N, DAG);
56691 }
56692
56693 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
56694                                     const X86Subtarget &Subtarget) {
56695   MVT VT = N->getSimpleValueType(0);
56696   SDLoc DL(N);
56697
56698   if (N->getOperand(0) == N->getOperand(1)) {
56699     if (N->getOpcode() == X86ISD::PCMPEQ)
56700       return DAG.getConstant(-1, DL, VT);
56701     if (N->getOpcode() == X86ISD::PCMPGT)
56702       return DAG.getConstant(0, DL, VT);
56703   }
56704
56705   return SDValue();
56706 }
56707
56708 /// Helper that combines an array of subvector ops as if they were the operands
56709 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
56710 /// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
      ///
      /// The folds are tried in this order: all-undef / all-zero inputs, repeated
      /// (splat) subvectors turned into broadcasts, a pair of upper-half extracts
      /// turned into VPERM2X128, ops that all share one opcode hoisted above the
      /// concatenation, consecutive subvector loads merged into one load, and
      /// finally target constants merged into one constant vector.
      ///
      /// \param DL        Location used for the nodes created here.
      /// \param VT        Result type of the concatenation.
      /// \param Ops       The subvectors; Ops[I] is treated as the I'th piece.
      /// \param DAG       DAG in which replacement nodes are created.
      /// \param DCI       Not referenced anywhere in this function's body.
      /// \param Subtarget Feature queries that gate the individual folds.
      /// \returns The replacement value, or an empty SDValue if no fold applied.
      ///
      /// Besides creating nodes, the broadcast-load fold below also replaces all
      /// other uses of Ops[0] with a subvector extracted from the new wide load.
56711 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
56712                                       ArrayRef<SDValue> Ops, SelectionDAG &DAG,
56713                                       TargetLowering::DAGCombinerInfo &DCI,
56714                                       const X86Subtarget &Subtarget) {
56715   assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
56716   unsigned EltSizeInBits = VT.getScalarSizeInBits();
56717
        // Concatenating only undefs is undef.
56718   if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
56719     return DAG.getUNDEF(VT);
56720
        // Concatenating only all-zero build vectors is a zero vector.
56721   if (llvm::all_of(Ops, [](SDValue Op) {
56722         return ISD::isBuildVectorAllZeros(Op.getNode());
56723       }))
56724     return getZeroVector(VT, Subtarget, DAG, DL);
56725
        // IsSplat is true when every entry of Ops is the same SDValue.
56726   SDValue Op0 = Ops[0];
56727   bool IsSplat = llvm::all_equal(Ops);
56728
56729   // Repeated subvectors.
56730   if (IsSplat &&
56731       (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
56732     // If this broadcast is inserted into both halves, use a larger broadcast.
56733     if (Op0.getOpcode() == X86ISD::VBROADCAST)
56734       return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
56735
56736     // If this simple subvector or scalar/subvector broadcast_load is inserted
56737     // into both halves, use a larger broadcast_load. Update other uses to use
56738     // an extracted subvector.
56739     if (ISD::isNormalLoad(Op0.getNode()) ||
56740         Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56741         Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
56742       auto *Mem = cast<MemSDNode>(Op0);
          // A scalar broadcast load stays a scalar broadcast; a plain load or a
          // subvector broadcast load becomes a subvector broadcast load.
56743       unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
56744                          ? X86ISD::VBROADCAST_LOAD
56745                          : X86ISD::SUBV_BROADCAST_LOAD;
56746       if (SDValue BcastLd =
56747               getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
56748         SDValue BcastSrc =
56749             extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
56750         DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
56751         return BcastLd;
56752       }
56753     }
56754
56755     // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
56756     if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
56757         (Subtarget.hasAVX2() ||
56758          X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
56759                                               VT.getScalarType(), Subtarget)))
56760       return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
56761                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
56762                                      Op0.getOperand(0),
56763                                      DAG.getIntPtrConstant(0, DL)));
56764
56765     // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
56766     if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
56767         (Subtarget.hasAVX2() ||
56768          (EltSizeInBits >= 32 &&
56769           X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
56770         Op0.getOperand(0).getValueType() == VT.getScalarType())
56771       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
56772
56773     // concat_vectors(extract_subvector(broadcast(x)),
56774     //                extract_subvector(broadcast(x))) -> broadcast(x)
56775     if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56776         Op0.getOperand(0).getValueType() == VT) {
56777       if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
56778           Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
56779         return Op0.getOperand(0);
56780     }
56781   }
56782
56783   // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
56784   // Only concat of subvector high halves which vperm2x128 is best at.
56785   // TODO: This should go in combineX86ShufflesRecursively eventually.
56786   if (VT.is256BitVector() && Ops.size() == 2) {
56787     SDValue Src0 = peekThroughBitcasts(Ops[0]);
56788     SDValue Src1 = peekThroughBitcasts(Ops[1]);
56789     if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56790         Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
56791       EVT SrcVT0 = Src0.getOperand(0).getValueType();
56792       EVT SrcVT1 = Src1.getOperand(0).getValueType();
56793       unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
56794       unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
          // Both extracts must start at the midpoint of a 256-bit source, i.e.
          // take its upper half.
56795       if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
56796           Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
56797           Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
56798         return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
56799                            DAG.getBitcast(VT, Src0.getOperand(0)),
56800                            DAG.getBitcast(VT, Src1.getOperand(0)),
56801                            DAG.getTargetConstant(0x31, DL, MVT::i8));
56802       }
56803     }
56804   }
56805
56806   // Repeated opcode.
56807   // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
56808   // but it currently struggles with different vector widths.
56809   if (llvm::all_of(Ops, [Op0](SDValue Op) {
56810         return Op.getOpcode() == Op0.getOpcode();
56811       })) {
          // Build a CONCAT_VECTORS of operand I taken from each entry of SubOps.
          // The result type is the lambda parameter VT, which shadows the outer
          // VT so callers can request a different (e.g. source or mask) type.
56812     auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
56813       SmallVector<SDValue> Subs;
56814       for (SDValue SubOp : SubOps)
56815         Subs.push_back(SubOp.getOperand(I));
56816       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
56817     };
          // True when operand Op of every SubOps[I] is an EXTRACT_SUBVECTOR of a
          // VT-typed vector taken at element offset I * NumSubElts, i.e. at the
          // offset matching its position in the concatenation. The extracts are
          // not required to share the same source vector.
56818     auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
56819       for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
56820         SDValue Sub = SubOps[I].getOperand(Op);
56821         unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
56822         if (Sub.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
56823             Sub.getOperand(0).getValueType() != VT ||
56824             Sub.getConstantOperandAPInt(1) != (I * NumSubElts))
56825           return false;
56826       }
56827       return true;
56828     };
56829
56830     unsigned NumOps = Ops.size();
56831     switch (Op0.getOpcode()) {
56832     case X86ISD::VBROADCAST: {
          // Broadcasts of different 128-bit sources: concatenate the sources and
          // splat element 0 within each 128-bit lane instead.
56833       if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
56834             return Op.getOperand(0).getValueType().is128BitVector();
56835           })) {
56836         if (VT == MVT::v4f64 || VT == MVT::v4i64)
56837           return DAG.getNode(X86ISD::UNPCKL, DL, VT,
56838                              ConcatSubOperand(VT, Ops, 0),
56839                              ConcatSubOperand(VT, Ops, 0));
56840         // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
56841         if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
56842           return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
56843                                               : X86ISD::PSHUFD,
56844                              DL, VT, ConcatSubOperand(VT, Ops, 0),
56845                              getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
56846       }
56847       break;
56848     }
56849     case X86ISD::MOVDDUP:
56850     case X86ISD::MOVSHDUP:
56851     case X86ISD::MOVSLDUP: {
56852       if (!IsSplat)
56853         return DAG.getNode(Op0.getOpcode(), DL, VT,
56854                            ConcatSubOperand(VT, Ops, 0));
56855       break;
56856     }
56857     case X86ISD::SHUFP: {
56858       // Add SHUFPD support if/when necessary.
          // Requires every op to carry the same immediate (operand 2).
56859       if (!IsSplat && VT.getScalarType() == MVT::f32 &&
56860           llvm::all_of(Ops, [Op0](SDValue Op) {
56861             return Op.getOperand(2) == Op0.getOperand(2);
56862           })) {
56863         return DAG.getNode(Op0.getOpcode(), DL, VT,
56864                            ConcatSubOperand(VT, Ops, 0),
56865                            ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56866       }
56867       break;
56868     }
56869     case X86ISD::PSHUFHW:
56870     case X86ISD::PSHUFLW:
56871     case X86ISD::PSHUFD:
56872       if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
56873           Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
56874         return DAG.getNode(Op0.getOpcode(), DL, VT,
56875                            ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56876       }
56877       [[fallthrough]];
56878     case X86ISD::VPERMILPI:
          // With 32-bit elements and matching immediates, perform the permute as
          // a VPERMILPI on the f32 version of the type and bitcast back.
56879       if (!IsSplat && VT.getScalarSizeInBits() == 32 &&
56880           (VT.is256BitVector() ||
56881            (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56882           all_of(Ops, [&Op0](SDValue Op) {
56883             return Op0.getOperand(1) == Op.getOperand(1);
56884           })) {
56885         MVT FloatVT = VT.changeVectorElementType(MVT::f32);
56886         SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
56887         Res =
56888             DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
56889         return DAG.getBitcast(VT, Res);
56890       }
56891       if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
            // Merge the two immediates: the low two bits of Ops[0] go to bits
            // [1:0], the low two bits of Ops[1] go to bits [3:2].
56892         uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
56893         uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
56894         uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
56895         return DAG.getNode(Op0.getOpcode(), DL, VT,
56896                            ConcatSubOperand(VT, Ops, 0),
56897                            DAG.getTargetConstant(Idx, DL, MVT::i8));
56898       }
56899       break;
56900     case X86ISD::PSHUFB:
56901       if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56902                        (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
56903         return DAG.getNode(Op0.getOpcode(), DL, VT,
56904                            ConcatSubOperand(VT, Ops, 0),
56905                            ConcatSubOperand(VT, Ops, 1));
56906       }
56907       break;
56908     case X86ISD::VPERMV:
56909       if (!IsSplat && NumOps == 2 &&
56910           (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
56911         MVT OpVT = Op0.getSimpleValueType();
56912         int NumSrcElts = OpVT.getVectorNumElements();
56913         SmallVector<int, 64> ConcatMask;
            // Decode each sub-mask and rebase its non-negative indices by
            // i * NumSrcElts; negative entries are appended unchanged. If any
            // decode fails the loop exits early, ConcatMask stays short and the
            // size check below rejects the fold.
56914         for (unsigned i = 0; i != NumOps; ++i) {
56915           SmallVector<int, 64> SubMask;
56916           SmallVector<SDValue, 2> SubOps;
56917           if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56918                                     SubMask))
56919             break;
56920           for (int M : SubMask) {
56921             if (0 <= M)
56922               M += i * NumSrcElts;
56923             ConcatMask.push_back(M);
56924           }
56925         }
56926         if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56927           SDValue Src = concatSubVectors(Ops[0].getOperand(1),
56928                                          Ops[1].getOperand(1), DAG, DL);
56929           MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56930           MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56931           SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56932           return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
56933         }
56934       }
56935       break;
56936     case X86ISD::VPERMV3:
56937       if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
56938         MVT OpVT = Op0.getSimpleValueType();
56939         int NumSrcElts = OpVT.getVectorNumElements();
56940         SmallVector<int, 64> ConcatMask;
            // Same scheme as VPERMV above, except that indices >= NumSrcElts get
            // an additional NumSrcElts added before the per-op rebase.
            // NOTE(review): such indices presumably select from the second
            // source, which doubles in width after concatenation - confirm
            // against getTargetShuffleMask.
56941         for (unsigned i = 0; i != NumOps; ++i) {
56942           SmallVector<int, 64> SubMask;
56943           SmallVector<SDValue, 2> SubOps;
56944           if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56945                                     SubMask))
56946             break;
56947           for (int M : SubMask) {
56948             if (0 <= M) {
56949               M += M < NumSrcElts ? 0 : NumSrcElts;
56950               M += i * NumSrcElts;
56951             }
56952             ConcatMask.push_back(M);
56953           }
56954         }
56955         if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56956           SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
56957                                           Ops[1].getOperand(0), DAG, DL);
56958           SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
56959                                           Ops[1].getOperand(2), DAG, DL);
56960           MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56961           MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56962           SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56963           return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
56964         }
56965       }
56966       break;
56967     case ISD::TRUNCATE:
          // Two truncates from the same 256-bit source type: concatenate the
          // sources into a vector with twice as many elements and truncate once.
          // Only done when 512-bit registers are usable and preferred.
56968       if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
56969         EVT SrcVT = Ops[0].getOperand(0).getValueType();
56970         if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
56971             SrcVT == Ops[1].getOperand(0).getValueType() &&
56972             Subtarget.useAVX512Regs() &&
56973             Subtarget.getPreferVectorWidth() >= 512 &&
56974             (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
56975           EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
56976           return DAG.getNode(ISD::TRUNCATE, DL, VT,
56977                              ConcatSubOperand(NewSrcVT, Ops, 0));
56978         }
56979       }
56980       break;
56981     case X86ISD::VSHLI:
56982     case X86ISD::VSRLI:
56983       // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
56984       // TODO: Move this to LowerShiftByScalarImmediate?
56985       if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
56986           llvm::all_of(Ops, [](SDValue Op) {
56987             return Op.getConstantOperandAPInt(1) == 32;
56988           })) {
56989         SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
56990         SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
            // Zero is the second shuffle input, so mask index 8 selects a zero
            // element; the other indices pick 32-bit elements of Res.
56991         if (Op0.getOpcode() == X86ISD::VSHLI) {
56992           Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56993                                      {8, 0, 8, 2, 8, 4, 8, 6});
56994         } else {
56995           Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56996                                      {1, 8, 3, 8, 5, 8, 7, 8});
56997         }
56998         return DAG.getBitcast(VT, Res);
56999       }
57000       [[fallthrough]];
57001     case X86ISD::VSRAI:
57002     case X86ISD::VSHL:
57003     case X86ISD::VSRL:
57004     case X86ISD::VSRA:
          // Shifts with an identical shift-amount operand: shift the
          // concatenated sources once.
57005       if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
57006            (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
57007             (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
57008           llvm::all_of(Ops, [Op0](SDValue Op) {
57009             return Op0.getOperand(1) == Op.getOperand(1);
57010           })) {
57011         return DAG.getNode(Op0.getOpcode(), DL, VT,
57012                            ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
57013       }
57014       break;
57015     case X86ISD::VPERMI:
57016     case X86ISD::VROTLI:
57017     case X86ISD::VROTRI:
57018       if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
57019           llvm::all_of(Ops, [Op0](SDValue Op) {
57020             return Op0.getOperand(1) == Op.getOperand(1);
57021           })) {
57022         return DAG.getNode(Op0.getOpcode(), DL, VT,
57023                            ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
57024       }
57025       break;
57026     case ISD::AND:
57027     case ISD::OR:
57028     case ISD::XOR:
57029     case X86ISD::ANDNP:
57030       if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57031                        (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
            // The concatenated operand type is derived from the operand type of
            // Op0: same scalar type, NumOps times as many elements.
57032         MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
57033         SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
57034                                  NumOps * SrcVT.getVectorNumElements());
57035         return DAG.getNode(Op0.getOpcode(), DL, VT,
57036                            ConcatSubOperand(SrcVT, Ops, 0),
57037                            ConcatSubOperand(SrcVT, Ops, 1));
57038       }
57039       break;
57040     case X86ISD::GF2P8AFFINEQB:
57041       if (!IsSplat &&
57042           (VT.is256BitVector() ||
57043            (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
57044           llvm::all_of(Ops, [Op0](SDValue Op) {
57045             return Op0.getOperand(2) == Op.getOperand(2);
57046           })) {
57047         return DAG.getNode(Op0.getOpcode(), DL, VT,
57048                            ConcatSubOperand(VT, Ops, 0),
57049                            ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
57050       }
57051       break;
57052     case ISD::ADD:
57053     case ISD::SUB:
57054     case ISD::MUL:
57055       if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57056                        (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
57057                         (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
57058         MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
57059         SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
57060                                  NumOps * SrcVT.getVectorNumElements());
57061         return DAG.getNode(Op0.getOpcode(), DL, VT,
57062                            ConcatSubOperand(SrcVT, Ops, 0),
57063                            ConcatSubOperand(SrcVT, Ops, 1));
57064       }
57065       break;
57066     // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
57067     // their latencies are short, we don't replace them here.
57068     case ISD::FDIV:
57069       if (!IsSplat && (VT.is256BitVector() ||
57070                        (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
57071         MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
57072         SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
57073                                  NumOps * SrcVT.getVectorNumElements());
57074         return DAG.getNode(Op0.getOpcode(), DL, VT,
57075                            ConcatSubOperand(SrcVT, Ops, 0),
57076                            ConcatSubOperand(SrcVT, Ops, 1));
57077       }
57078       break;
57079     case X86ISD::HADD:
57080     case X86ISD::HSUB:
57081     case X86ISD::FHADD:
57082     case X86ISD::FHSUB:
57083     case X86ISD::PACKSS:
57084     case X86ISD::PACKUS:
57085       if (!IsSplat && VT.is256BitVector() &&
57086           (VT.isFloatingPoint() || Subtarget.hasInt256())) {
57087         MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
57088         SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
57089                                  NumOps * SrcVT.getVectorNumElements());
57090         return DAG.getNode(Op0.getOpcode(), DL, VT,
57091                            ConcatSubOperand(SrcVT, Ops, 0),
57092                            ConcatSubOperand(SrcVT, Ops, 1));
57093       }
57094       break;
57095     case X86ISD::PALIGNR:
57096       if (!IsSplat &&
57097           ((VT.is256BitVector() && Subtarget.hasInt256()) ||
57098            (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
57099           llvm::all_of(Ops, [Op0](SDValue Op) {
57100             return Op0.getOperand(2) == Op.getOperand(2);
57101           })) {
57102         return DAG.getNode(Op0.getOpcode(), DL, VT,
57103                            ConcatSubOperand(VT, Ops, 0),
57104                            ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
57105       }
57106       break;
57107     case ISD::VSELECT:
          // With AVX512, selects whose condition is a vXi1 vector can be merged
          // when the widened i1 condition type is legal.
57108       if (!IsSplat && Subtarget.hasAVX512() &&
57109           (VT.is256BitVector() ||
57110            (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
57111           (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
57112         EVT SelVT = Ops[0].getOperand(0).getValueType();
57113         if (SelVT.getVectorElementType() == MVT::i1) {
57114           SelVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
57115                                    Ops.size() * SelVT.getVectorNumElements());
57116           if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
57117             return DAG.getNode(Op0.getOpcode(), DL, VT,
57118                                ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
57119                                ConcatSubOperand(VT, Ops, 1),
57120                                ConcatSubOperand(VT, Ops, 2));
57121         }
57122       }
57123       [[fallthrough]];
57124     case X86ISD::BLENDV:
          // Otherwise only merge 256-bit selects/blends when both value operands
          // pass IsConcatFree and the doubled condition type is legal.
57125       if (!IsSplat && VT.is256BitVector() && Ops.size() == 2 &&
57126           (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
57127           IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
57128         EVT SelVT = Ops[0].getOperand(0).getValueType();
57129         SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext());
57130         if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
57131           return DAG.getNode(Op0.getOpcode(), DL, VT,
57132                              ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
57133                              ConcatSubOperand(VT, Ops, 1),
57134                              ConcatSubOperand(VT, Ops, 2));
57135       }
57136       break;
57137     }
57138   }
57139
57140   // Fold subvector loads into one.
57141   // If needed, look through bitcasts to get to the load.
57142   if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
57143     unsigned Fast;
57144     const X86TargetLowering *TLI = Subtarget.getTargetLowering();
          // Only merge when a VT-sized access through the first load's memory
          // operand is both allowed and reported as fast.
57145     if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
57146                                 *FirstLd->getMemOperand(), &Fast) &&
57147         Fast) {
57148       if (SDValue Ld =
57149               EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
57150         return Ld;
57151     }
57152   }
57153
57154   // Attempt to fold target constant loads.
57155   if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
57156     SmallVector<APInt> EltBits;
57157     APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
          // Append each op's element bits and place its undef mask at bit offset
          // I * width. A failed decode exits the loop early, leaving EltBits
          // short so the element-count check below rejects the fold.
57158     for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
57159       APInt OpUndefElts;
57160       SmallVector<APInt> OpEltBits;
57161       if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
57162                                         OpEltBits, true, false))
57163           break;
57164       EltBits.append(OpEltBits);
57165       UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
57166     }
57167     if (EltBits.size() == VT.getVectorNumElements())
57168       return getConstVector(EltBits, UndefElts, VT, DAG, DL);
57169   }
57170
57171   return SDValue();
57172 }
57173
57174 static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
57175                                      TargetLowering::DAGCombinerInfo &DCI,
57176                                      const X86Subtarget &Subtarget) {
57177   EVT VT = N->getValueType(0);
57178   EVT SrcVT = N->getOperand(0).getValueType();
57179   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57180   SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
57181
57182   if (VT.getVectorElementType() == MVT::i1) {
57183     // Attempt to constant fold.
57184     unsigned SubSizeInBits = SrcVT.getSizeInBits();
57185     APInt Constant = APInt::getZero(VT.getSizeInBits());
57186     for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
57187       auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
57188       if (!C) break;
57189       Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
57190       if (I == (E - 1)) {
57191         EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
57192         if (TLI.isTypeLegal(IntVT))
57193           return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
57194       }
57195     }
57196
57197     // Don't do anything else for i1 vectors.
57198     return SDValue();
57199   }
57200
57201   if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
57202     if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
57203                                            DCI, Subtarget))
57204       return R;
57205   }
57206
57207   return SDValue();
57208 }
57209
57210 static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
57211                                        TargetLowering::DAGCombinerInfo &DCI,
57212                                        const X86Subtarget &Subtarget) {
57213   if (DCI.isBeforeLegalizeOps())
57214     return SDValue();
57215
57216   MVT OpVT = N->getSimpleValueType(0);
57217
57218   bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
57219
57220   SDLoc dl(N);
57221   SDValue Vec = N->getOperand(0);
57222   SDValue SubVec = N->getOperand(1);
57223
57224   uint64_t IdxVal = N->getConstantOperandVal(2);
57225   MVT SubVecVT = SubVec.getSimpleValueType();
57226
57227   if (Vec.isUndef() && SubVec.isUndef())
57228     return DAG.getUNDEF(OpVT);
57229
57230   // Inserting undefs/zeros into zeros/undefs is a zero vector.
57231   if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
57232       (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
57233     return getZeroVector(OpVT, Subtarget, DAG, dl);
57234
57235   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
57236     // If we're inserting into a zero vector and then into a larger zero vector,
57237     // just insert into the larger zero vector directly.
57238     if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
57239         ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
57240       uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
57241       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
57242                          getZeroVector(OpVT, Subtarget, DAG, dl),
57243                          SubVec.getOperand(1),
57244                          DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
57245     }
57246
57247     // If we're inserting into a zero vector and our input was extracted from an
57248     // insert into a zero vector of the same type and the extraction was at
57249     // least as large as the original insertion. Just insert the original
57250     // subvector into a zero vector.
57251     if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
57252         isNullConstant(SubVec.getOperand(1)) &&
57253         SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
57254       SDValue Ins = SubVec.getOperand(0);
57255       if (isNullConstant(Ins.getOperand(2)) &&
57256           ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
57257           Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
57258               SubVecVT.getFixedSizeInBits())
57259           return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
57260                              getZeroVector(OpVT, Subtarget, DAG, dl),
57261                              Ins.getOperand(1), N->getOperand(2));
57262     }
57263   }
57264
57265   // Stop here if this is an i1 vector.
57266   if (IsI1Vector)
57267     return SDValue();
57268
57269   // Eliminate an intermediate vector widening:
57270   // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
57271   // insert_subvector X, Y, Idx
57272   // TODO: This is a more general version of a DAGCombiner fold, can we move it
57273   // there?
57274   if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
57275       SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
57276     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
57277                        SubVec.getOperand(1), N->getOperand(2));
57278
57279   // If this is an insert of an extract, combine to a shuffle. Don't do this
57280   // if the insert or extract can be represented with a subregister operation.
57281   if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
57282       SubVec.getOperand(0).getSimpleValueType() == OpVT &&
57283       (IdxVal != 0 ||
57284        !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
57285     int ExtIdxVal = SubVec.getConstantOperandVal(1);
57286     if (ExtIdxVal != 0) {
57287       int VecNumElts = OpVT.getVectorNumElements();
57288       int SubVecNumElts = SubVecVT.getVectorNumElements();
57289       SmallVector<int, 64> Mask(VecNumElts);
57290       // First create an identity shuffle mask.
57291       for (int i = 0; i != VecNumElts; ++i)
57292         Mask[i] = i;
57293       // Now insert the extracted portion.
57294       for (int i = 0; i != SubVecNumElts; ++i)
57295         Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
57296
57297       return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
57298     }
57299   }
57300
57301   // Match concat_vector style patterns.
57302   SmallVector<SDValue, 2> SubVectorOps;
57303   if (collectConcatOps(N, SubVectorOps, DAG)) {
57304     if (SDValue Fold =
57305             combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
57306       return Fold;
57307
57308     // If we're inserting all zeros into the upper half, change this to
57309     // a concat with zero. We will match this to a move
57310     // with implicit upper bit zeroing during isel.
57311     // We do this here because we don't want combineConcatVectorOps to
57312     // create INSERT_SUBVECTOR from CONCAT_VECTORS.
57313     if (SubVectorOps.size() == 2 &&
57314         ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
57315       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
57316                          getZeroVector(OpVT, Subtarget, DAG, dl),
57317                          SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
57318   }
57319
57320   // If this is a broadcast insert into an upper undef, use a larger broadcast.
57321   if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
57322     return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
57323
57324   // If this is a broadcast load inserted into an upper undef, use a larger
57325   // broadcast load.
57326   if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
57327       SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
57328     auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
57329     SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
57330     SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
57331     SDValue BcastLd =
57332         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
57333                                 MemIntr->getMemoryVT(),
57334                                 MemIntr->getMemOperand());
57335     DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
57336     return BcastLd;
57337   }
57338
57339   // If we're splatting the lower half subvector of a full vector load into the
57340   // upper half, attempt to create a subvector broadcast.
57341   if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
57342       Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
57343     auto *VecLd = dyn_cast<LoadSDNode>(Vec);
57344     auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
57345     if (VecLd && SubLd &&
57346         DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
57347                                            SubVec.getValueSizeInBits() / 8, 0))
57348       return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
57349                                SubLd, 0, DAG);
57350   }
57351
57352   return SDValue();
57353 }
57354
57355 /// If we are extracting a subvector of a vector select and the select condition
57356 /// is composed of concatenated vectors, try to narrow the select width. This
57357 /// is a common pattern for AVX1 integer code because 256-bit selects may be
57358 /// legal, but there is almost no integer math/logic available for 256-bit.
57359 /// This function should only be called with legal types (otherwise, the calls
57360 /// to get simple value types will assert).
57361 static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
57362   SDValue Sel = Ext->getOperand(0);
57363   if (Sel.getOpcode() != ISD::VSELECT ||
57364       !isFreeToSplitVector(Sel.getOperand(0).getNode(), DAG))
57365     return SDValue();
57366
57367   // Note: We assume simple value types because this should only be called with
57368   //       legal operations/types.
57369   // TODO: This can be extended to handle extraction to 256-bits.
57370   MVT VT = Ext->getSimpleValueType(0);
57371   if (!VT.is128BitVector())
57372     return SDValue();
57373
57374   MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
57375   if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
57376     return SDValue();
57377
57378   MVT WideVT = Ext->getOperand(0).getSimpleValueType();
57379   MVT SelVT = Sel.getSimpleValueType();
57380   assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
57381          "Unexpected vector type with legal operations");
57382
57383   unsigned SelElts = SelVT.getVectorNumElements();
57384   unsigned CastedElts = WideVT.getVectorNumElements();
57385   unsigned ExtIdx = Ext->getConstantOperandVal(1);
57386   if (SelElts % CastedElts == 0) {
57387     // The select has the same or more (narrower) elements than the extract
57388     // operand. The extraction index gets scaled by that factor.
57389     ExtIdx *= (SelElts / CastedElts);
57390   } else if (CastedElts % SelElts == 0) {
57391     // The select has less (wider) elements than the extract operand. Make sure
57392     // that the extraction index can be divided evenly.
57393     unsigned IndexDivisor = CastedElts / SelElts;
57394     if (ExtIdx % IndexDivisor != 0)
57395       return SDValue();
57396     ExtIdx /= IndexDivisor;
57397   } else {
57398     llvm_unreachable("Element count of simple vector types are not divisible?");
57399   }
57400
57401   unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
57402   unsigned NarrowElts = SelElts / NarrowingFactor;
57403   MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
57404   SDLoc DL(Ext);
57405   SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
57406   SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
57407   SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
57408   SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
57409   return DAG.getBitcast(VT, NarrowSel);
57410 }
57411
57412 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
57413                                         TargetLowering::DAGCombinerInfo &DCI,
57414                                         const X86Subtarget &Subtarget) {
57415   // For AVX1 only, if we are extracting from a 256-bit and+not (which will
57416   // eventually get combined/lowered into ANDNP) with a concatenated operand,
57417   // split the 'and' into 128-bit ops to avoid the concatenate and extract.
57418   // We let generic combining take over from there to simplify the
57419   // insert/extract and 'not'.
57420   // This pattern emerges during AVX1 legalization. We handle it before lowering
57421   // to avoid complications like splitting constant vector loads.
57422
57423   // Capture the original wide type in the likely case that we need to bitcast
57424   // back to this type.
57425   if (!N->getValueType(0).isSimple())
57426     return SDValue();
57427
57428   MVT VT = N->getSimpleValueType(0);
57429   SDValue InVec = N->getOperand(0);
57430   unsigned IdxVal = N->getConstantOperandVal(1);
57431   SDValue InVecBC = peekThroughBitcasts(InVec);
57432   EVT InVecVT = InVec.getValueType();
57433   unsigned SizeInBits = VT.getSizeInBits();
57434   unsigned InSizeInBits = InVecVT.getSizeInBits();
57435   unsigned NumSubElts = VT.getVectorNumElements();
57436   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57437
57438   if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
57439       TLI.isTypeLegal(InVecVT) &&
57440       InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
57441     auto isConcatenatedNot = [](SDValue V) {
57442       V = peekThroughBitcasts(V);
57443       if (!isBitwiseNot(V))
57444         return false;
57445       SDValue NotOp = V->getOperand(0);
57446       return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
57447     };
57448     if (isConcatenatedNot(InVecBC.getOperand(0)) ||
57449         isConcatenatedNot(InVecBC.getOperand(1))) {
57450       // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
57451       SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
57452       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
57453                          DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
57454     }
57455   }
57456
57457   if (DCI.isBeforeLegalizeOps())
57458     return SDValue();
57459
57460   if (SDValue V = narrowExtractedVectorSelect(N, DAG))
57461     return V;
57462
57463   if (ISD::isBuildVectorAllZeros(InVec.getNode()))
57464     return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
57465
57466   if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
57467     if (VT.getScalarType() == MVT::i1)
57468       return DAG.getConstant(1, SDLoc(N), VT);
57469     return getOnesVector(VT, DAG, SDLoc(N));
57470   }
57471
57472   if (InVec.getOpcode() == ISD::BUILD_VECTOR)
57473     return DAG.getBuildVector(VT, SDLoc(N),
57474                               InVec->ops().slice(IdxVal, NumSubElts));
57475
57476   // If we are extracting from an insert into a larger vector, replace with a
57477   // smaller insert if we don't access less than the original subvector. Don't
57478   // do this for i1 vectors.
57479   // TODO: Relax the matching indices requirement?
57480   if (VT.getVectorElementType() != MVT::i1 &&
57481       InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
57482       IdxVal == InVec.getConstantOperandVal(2) &&
57483       InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
57484     SDLoc DL(N);
57485     SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
57486                                  InVec.getOperand(0), N->getOperand(1));
57487     unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
57488     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
57489                        InVec.getOperand(1),
57490                        DAG.getVectorIdxConstant(NewIdxVal, DL));
57491   }
57492
57493   // If we're extracting an upper subvector from a broadcast we should just
57494   // extract the lowest subvector instead which should allow
57495   // SimplifyDemandedVectorElts do more simplifications.
57496   if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
57497                       InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
57498                       DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
57499     return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
57500
57501   // If we're extracting a broadcasted subvector, just use the lowest subvector.
57502   if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
57503       cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
57504     return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
57505
57506   // Attempt to extract from the source of a shuffle vector.
57507   if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
57508     SmallVector<int, 32> ShuffleMask;
57509     SmallVector<int, 32> ScaledMask;
57510     SmallVector<SDValue, 2> ShuffleInputs;
57511     unsigned NumSubVecs = InSizeInBits / SizeInBits;
57512     // Decode the shuffle mask and scale it so its shuffling subvectors.
57513     if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
57514         scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
57515       unsigned SubVecIdx = IdxVal / NumSubElts;
57516       if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
57517         return DAG.getUNDEF(VT);
57518       if (ScaledMask[SubVecIdx] == SM_SentinelZero)
57519         return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
57520       SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
57521       if (Src.getValueSizeInBits() == InSizeInBits) {
57522         unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
57523         unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
57524         return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
57525                                 SDLoc(N), SizeInBits);
57526       }
57527     }
57528   }
57529
57530   // If we're extracting the lowest subvector and we're the only user,
57531   // we may be able to perform this with a smaller vector width.
57532   unsigned InOpcode = InVec.getOpcode();
57533   if (InVec.hasOneUse()) {
57534     if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
57535       // v2f64 CVTDQ2PD(v4i32).
57536       if (InOpcode == ISD::SINT_TO_FP &&
57537           InVec.getOperand(0).getValueType() == MVT::v4i32) {
57538         return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
57539       }
57540       // v2f64 CVTUDQ2PD(v4i32).
57541       if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
57542           InVec.getOperand(0).getValueType() == MVT::v4i32) {
57543         return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
57544       }
57545       // v2f64 CVTPS2PD(v4f32).
57546       if (InOpcode == ISD::FP_EXTEND &&
57547           InVec.getOperand(0).getValueType() == MVT::v4f32) {
57548         return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
57549       }
57550     }
57551     if (IdxVal == 0 &&
57552         (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
57553         (SizeInBits == 128 || SizeInBits == 256) &&
57554         InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
57555       SDLoc DL(N);
57556       SDValue Ext = InVec.getOperand(0);
57557       if (Ext.getValueSizeInBits() > SizeInBits)
57558         Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
57559       unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
57560       return DAG.getNode(ExtOp, DL, VT, Ext);
57561     }
57562     if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
57563         InVec.getOperand(0).getValueType().is256BitVector() &&
57564         InVec.getOperand(1).getValueType().is256BitVector() &&
57565         InVec.getOperand(2).getValueType().is256BitVector()) {
57566       SDLoc DL(N);
57567       SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
57568       SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
57569       SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
57570       return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
57571     }
57572     if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
57573         (VT.is128BitVector() || VT.is256BitVector())) {
57574       SDLoc DL(N);
57575       SDValue InVecSrc = InVec.getOperand(0);
57576       unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
57577       SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
57578       return DAG.getNode(InOpcode, DL, VT, Ext);
57579     }
57580     if (InOpcode == X86ISD::MOVDDUP &&
57581         (VT.is128BitVector() || VT.is256BitVector())) {
57582       SDLoc DL(N);
57583       SDValue Ext0 =
57584           extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
57585       return DAG.getNode(InOpcode, DL, VT, Ext0);
57586     }
57587   }
57588
57589   // Always split vXi64 logical shifts where we're extracting the upper 32-bits
57590   // as this is very likely to fold into a shuffle/truncation.
57591   if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
57592       InVecVT.getScalarSizeInBits() == 64 &&
57593       InVec.getConstantOperandAPInt(1) == 32) {
57594     SDLoc DL(N);
57595     SDValue Ext =
57596         extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
57597     return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
57598   }
57599
57600   return SDValue();
57601 }
57602
57603 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
57604   EVT VT = N->getValueType(0);
57605   SDValue Src = N->getOperand(0);
57606   SDLoc DL(N);
57607
57608   // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
57609   // This occurs frequently in our masked scalar intrinsic code and our
57610   // floating point select lowering with AVX512.
57611   // TODO: SimplifyDemandedBits instead?
57612   if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
57613       isOneConstant(Src.getOperand(1)))
57614     return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
57615
57616   // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
57617   if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57618       Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
57619       Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
57620     if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
57621       if (C->isZero())
57622         return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
57623                            Src.getOperand(1));
57624
57625   // Reduce v2i64 to v4i32 if we don't need the upper bits or are known zero.
57626   // TODO: Move to DAGCombine/SimplifyDemandedBits?
57627   if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
57628     auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
57629       if (Op.getValueType() != MVT::i64)
57630         return SDValue();
57631       unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
57632       if (Op.getOpcode() == Opc &&
57633           Op.getOperand(0).getScalarValueSizeInBits() <= 32)
57634         return Op.getOperand(0);
57635       unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
57636       if (auto *Ld = dyn_cast<LoadSDNode>(Op))
57637         if (Ld->getExtensionType() == Ext &&
57638             Ld->getMemoryVT().getScalarSizeInBits() <= 32)
57639           return Op;
57640       if (IsZeroExt) {
57641         KnownBits Known = DAG.computeKnownBits(Op);
57642         if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
57643           return Op;
57644       }
57645       return SDValue();
57646     };
57647
57648     if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
57649       return DAG.getBitcast(
57650           VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57651                           DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
57652
57653     if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
57654       return DAG.getBitcast(
57655           VT,
57656           DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
57657                       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57658                                   DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
57659   }
57660
57661   // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
57662   if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
57663       Src.getOperand(0).getValueType() == MVT::x86mmx)
57664     return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
57665
57666   // See if we're broadcasting the scalar value, in which case just reuse that.
57667   // Ensure the same SDValue from the SDNode use is being used.
57668   if (VT.getScalarType() == Src.getValueType())
57669     for (SDNode *User : Src->uses())
57670       if (User->getOpcode() == X86ISD::VBROADCAST &&
57671           Src == User->getOperand(0)) {
57672         unsigned SizeInBits = VT.getFixedSizeInBits();
57673         unsigned BroadcastSizeInBits =
57674             User->getValueSizeInBits(0).getFixedValue();
57675         if (BroadcastSizeInBits == SizeInBits)
57676           return SDValue(User, 0);
57677         if (BroadcastSizeInBits > SizeInBits)
57678           return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
57679         // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
57680         // coverage.
57681       }
57682
57683   return SDValue();
57684 }
57685
57686 // Simplify PMULDQ and PMULUDQ operations.
57687 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
57688                              TargetLowering::DAGCombinerInfo &DCI,
57689                              const X86Subtarget &Subtarget) {
57690   SDValue LHS = N->getOperand(0);
57691   SDValue RHS = N->getOperand(1);
57692
57693   // Canonicalize constant to RHS.
57694   if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
57695       !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
57696     return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
57697
57698   // Multiply by zero.
57699   // Don't return RHS as it may contain UNDEFs.
57700   if (ISD::isBuildVectorAllZeros(RHS.getNode()))
57701     return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
57702
57703   // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
57704   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57705   if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
57706     return SDValue(N, 0);
57707
57708   // If the input is an extend_invec and the SimplifyDemandedBits call didn't
57709   // convert it to any_extend_invec, due to the LegalOperations check, do the
57710   // conversion directly to a vector shuffle manually. This exposes combine
57711   // opportunities missed by combineEXTEND_VECTOR_INREG not calling
57712   // combineX86ShufflesRecursively on SSE4.1 targets.
57713   // FIXME: This is basically a hack around several other issues related to
57714   // ANY_EXTEND_VECTOR_INREG.
57715   if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
57716       (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57717        LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57718       LHS.getOperand(0).getValueType() == MVT::v4i32) {
57719     SDLoc dl(N);
57720     LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
57721                                LHS.getOperand(0), { 0, -1, 1, -1 });
57722     LHS = DAG.getBitcast(MVT::v2i64, LHS);
57723     return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57724   }
57725   if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
57726       (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57727        RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57728       RHS.getOperand(0).getValueType() == MVT::v4i32) {
57729     SDLoc dl(N);
57730     RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
57731                                RHS.getOperand(0), { 0, -1, 1, -1 });
57732     RHS = DAG.getBitcast(MVT::v2i64, RHS);
57733     return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57734   }
57735
57736   return SDValue();
57737 }
57738
57739 // Simplify VPMADDUBSW/VPMADDWD operations.
57740 static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
57741                              TargetLowering::DAGCombinerInfo &DCI) {
57742   EVT VT = N->getValueType(0);
57743   SDValue LHS = N->getOperand(0);
57744   SDValue RHS = N->getOperand(1);
57745
57746   // Multiply by zero.
57747   // Don't return LHS/RHS as it may contain UNDEFs.
57748   if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
57749       ISD::isBuildVectorAllZeros(RHS.getNode()))
57750     return DAG.getConstant(0, SDLoc(N), VT);
57751
57752   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57753   APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57754   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57755     return SDValue(N, 0);
57756
57757   return SDValue();
57758 }
57759
57760 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
57761                                           TargetLowering::DAGCombinerInfo &DCI,
57762                                           const X86Subtarget &Subtarget) {
57763   EVT VT = N->getValueType(0);
57764   SDValue In = N->getOperand(0);
57765   unsigned Opcode = N->getOpcode();
57766   unsigned InOpcode = In.getOpcode();
57767   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57768   SDLoc DL(N);
57769
57770   // Try to merge vector loads and extend_inreg to an extload.
57771   if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
57772       In.hasOneUse()) {
57773     auto *Ld = cast<LoadSDNode>(In);
57774     if (Ld->isSimple()) {
57775       MVT SVT = In.getSimpleValueType().getVectorElementType();
57776       ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
57777                                  ? ISD::SEXTLOAD
57778                                  : ISD::ZEXTLOAD;
57779       EVT MemVT = VT.changeVectorElementType(SVT);
57780       if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
57781         SDValue Load = DAG.getExtLoad(
57782             Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
57783             MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
57784         DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
57785         return Load;
57786       }
57787     }
57788   }
57789
57790   // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
57791   if (Opcode == InOpcode)
57792     return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
57793
57794   // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
57795   // -> EXTEND_VECTOR_INREG(X).
57796   // TODO: Handle non-zero subvector indices.
57797   if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
57798       In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
57799       In.getOperand(0).getOperand(0).getValueSizeInBits() ==
57800           In.getValueSizeInBits())
57801     return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
57802
57803   // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
57804   // TODO: Move to DAGCombine?
57805   if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
57806       In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
57807       In.getValueSizeInBits() == VT.getSizeInBits()) {
57808     unsigned NumElts = VT.getVectorNumElements();
57809     unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
57810     EVT EltVT = In.getOperand(0).getValueType();
57811     SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
57812     for (unsigned I = 0; I != NumElts; ++I)
57813       Elts[I * Scale] = In.getOperand(I);
57814     return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
57815   }
57816
57817   // Attempt to combine as a shuffle on SSE41+ targets.
57818   if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
57819        Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) &&
57820       Subtarget.hasSSE41()) {
57821     SDValue Op(N, 0);
57822     if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
57823       if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
57824         return Res;
57825   }
57826
57827   return SDValue();
57828 }
57829
57830 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
57831                              TargetLowering::DAGCombinerInfo &DCI) {
57832   EVT VT = N->getValueType(0);
57833
57834   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
57835     return DAG.getConstant(0, SDLoc(N), VT);
57836
57837   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57838   APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57839   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57840     return SDValue(N, 0);
57841
57842   return SDValue();
57843 }
57844
57845 // Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
57846 // Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produce
57847 // extra instructions between the conversion due to going to scalar and back.
57848 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
57849                                  const X86Subtarget &Subtarget) {
57850   if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
57851     return SDValue();
57852
57853   if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
57854     return SDValue();
57855
57856   if (N->getValueType(0) != MVT::f32 ||
57857       N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
57858     return SDValue();
57859
57860   SDLoc dl(N);
57861   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
57862                             N->getOperand(0).getOperand(0));
57863   Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
57864                     DAG.getTargetConstant(4, dl, MVT::i32));
57865   Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
57866   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
57867                      DAG.getIntPtrConstant(0, dl));
57868 }
57869
57870 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
57871                                 const X86Subtarget &Subtarget) {
57872   if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57873     return SDValue();
57874
57875   if (Subtarget.hasFP16())
57876     return SDValue();
57877
57878   bool IsStrict = N->isStrictFPOpcode();
57879   EVT VT = N->getValueType(0);
57880   SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57881   EVT SrcVT = Src.getValueType();
57882
57883   if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
57884     return SDValue();
57885
57886   if (VT.getVectorElementType() != MVT::f32 &&
57887       VT.getVectorElementType() != MVT::f64)
57888     return SDValue();
57889
57890   unsigned NumElts = VT.getVectorNumElements();
57891   if (NumElts == 1 || !isPowerOf2_32(NumElts))
57892     return SDValue();
57893
57894   SDLoc dl(N);
57895
57896   // Convert the input to vXi16.
57897   EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
57898   Src = DAG.getBitcast(IntVT, Src);
57899
57900   // Widen to at least 8 input elements.
57901   if (NumElts < 8) {
57902     unsigned NumConcats = 8 / NumElts;
57903     SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
57904                                 : DAG.getConstant(0, dl, IntVT);
57905     SmallVector<SDValue, 4> Ops(NumConcats, Fill);
57906     Ops[0] = Src;
57907     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
57908   }
57909
57910   // Destination is vXf32 with at least 4 elements.
57911   EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
57912                                std::max(4U, NumElts));
57913   SDValue Cvt, Chain;
57914   if (IsStrict) {
57915     Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
57916                       {N->getOperand(0), Src});
57917     Chain = Cvt.getValue(1);
57918   } else {
57919     Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
57920   }
57921
57922   if (NumElts < 4) {
57923     assert(NumElts == 2 && "Unexpected size");
57924     Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
57925                       DAG.getIntPtrConstant(0, dl));
57926   }
57927
57928   if (IsStrict) {
57929     // Extend to the original VT if necessary.
57930     if (Cvt.getValueType() != VT) {
57931       Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
57932                         {Chain, Cvt});
57933       Chain = Cvt.getValue(1);
57934     }
57935     return DAG.getMergeValues({Cvt, Chain}, dl);
57936   }
57937
57938   // Extend to the original VT if necessary.
57939   return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
57940 }
57941
57942 // Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
57943 // from. Limit this to cases where the loads have the same input chain and the
57944 // output chains are unused. This avoids any memory ordering issues.
57945 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
57946                                      TargetLowering::DAGCombinerInfo &DCI) {
57947   assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
57948           N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
57949          "Unknown broadcast load type");
57950
57951   // Only do this if the chain result is unused.
57952   if (N->hasAnyUseOfValue(1))
57953     return SDValue();
57954
57955   auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
57956
57957   SDValue Ptr = MemIntrin->getBasePtr();
57958   SDValue Chain = MemIntrin->getChain();
57959   EVT VT = N->getSimpleValueType(0);
57960   EVT MemVT = MemIntrin->getMemoryVT();
57961
57962   // Look at other users of our base pointer and try to find a wider broadcast.
57963   // The input chain and the size of the memory VT must match.
57964   for (SDNode *User : Ptr->uses())
57965     if (User != N && User->getOpcode() == N->getOpcode() &&
57966         cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
57967         cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
57968         cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
57969             MemVT.getSizeInBits() &&
57970         !User->hasAnyUseOfValue(1) &&
57971         User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
57972       SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
57973                                          VT.getSizeInBits());
57974       Extract = DAG.getBitcast(VT, Extract);
57975       return DCI.CombineTo(N, Extract, SDValue(User, 1));
57976     }
57977
57978   return SDValue();
57979 }
57980
57981 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
57982                                const X86Subtarget &Subtarget) {
57983   if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57984     return SDValue();
57985
57986   bool IsStrict = N->isStrictFPOpcode();
57987   EVT VT = N->getValueType(0);
57988   SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57989   EVT SrcVT = Src.getValueType();
57990
57991   if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
57992       SrcVT.getVectorElementType() != MVT::f32)
57993     return SDValue();
57994
57995   SDLoc dl(N);
57996
57997   SDValue Cvt, Chain;
57998   unsigned NumElts = VT.getVectorNumElements();
57999   if (Subtarget.hasFP16()) {
58000     // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
58001     // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
58002     if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
58003       SDValue Cvt0, Cvt1;
58004       SDValue Op0 = Src.getOperand(0);
58005       SDValue Op1 = Src.getOperand(1);
58006       bool IsOp0Strict = Op0->isStrictFPOpcode();
58007       if (Op0.getOpcode() != Op1.getOpcode() ||
58008           Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
58009           Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
58010         return SDValue();
58011       }
58012       int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
58013       if (IsStrict) {
58014         assert(IsOp0Strict && "Op0 must be strict node");
58015         unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
58016                            ? X86ISD::STRICT_CVTSI2P
58017                            : X86ISD::STRICT_CVTUI2P;
58018         Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
58019                            {Op0.getOperand(0), Op0.getOperand(1)});
58020         Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
58021                            {Op1.getOperand(0), Op1.getOperand(1)});
58022         Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
58023         return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
58024       }
58025       unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
58026                                                         : X86ISD::CVTUI2P;
58027       Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
58028       Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
58029       return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
58030     }
58031     return SDValue();
58032   }
58033
58034   if (NumElts == 1 || !isPowerOf2_32(NumElts))
58035     return SDValue();
58036
58037   // Widen to at least 4 input elements.
58038   if (NumElts < 4)
58039     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
58040                       DAG.getConstantFP(0.0, dl, SrcVT));
58041
58042   // Destination is v8i16 with at least 8 elements.
58043   EVT CvtVT =
58044       EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
58045   SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
58046   if (IsStrict) {
58047     Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
58048                       {N->getOperand(0), Src, Rnd});
58049     Chain = Cvt.getValue(1);
58050   } else {
58051     Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
58052   }
58053
58054   // Extract down to real number of elements.
58055   if (NumElts < 8) {
58056     EVT IntVT = VT.changeVectorElementTypeToInteger();
58057     Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
58058                       DAG.getIntPtrConstant(0, dl));
58059   }
58060
58061   Cvt = DAG.getBitcast(VT, Cvt);
58062
58063   if (IsStrict)
58064     return DAG.getMergeValues({Cvt, Chain}, dl);
58065
58066   return Cvt;
58067 }
58068
58069 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
58070   SDValue Src = N->getOperand(0);
58071
58072   // Turn MOVDQ2Q+simple_load into an mmx load.
58073   if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
58074     LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
58075
58076     if (LN->isSimple()) {
58077       SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
58078                                   LN->getBasePtr(),
58079                                   LN->getPointerInfo(),
58080                                   LN->getOriginalAlign(),
58081                                   LN->getMemOperand()->getFlags());
58082       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
58083       return NewLd;
58084     }
58085   }
58086
58087   return SDValue();
58088 }
58089
58090 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
58091                            TargetLowering::DAGCombinerInfo &DCI) {
58092   unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
58093   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58094   if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
58095     return SDValue(N, 0);
58096
58097   return SDValue();
58098 }
58099
/// Entry point for X86-specific DAG combines.
///
/// Dispatches on the opcode of \p N to the matching static combine helper,
/// forwarding the SelectionDAG taken from \p DCI and, where the helper takes
/// them, \p DCI and the subtarget. Several opcodes share one helper; the
/// grouping of case labels below is what establishes that sharing.
///
/// \returns whatever the selected helper returns, or an empty SDValue for
/// opcodes that have no case here.
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  // Opcodes without a dedicated combine fall out of the switch.
  default: break;
  case ISD::SCALAR_TO_VECTOR:
    return combineScalarToVector(N, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
  case X86ISD::PEXTRW:
  case X86ISD::PEXTRB:
    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
  case ISD::CONCAT_VECTORS:
    return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
  case ISD::INSERT_SUBVECTOR:
    return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
  case ISD::EXTRACT_SUBVECTOR:
    return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::BLENDV:      return combineSelect(N, DAG, DCI, Subtarget);
  case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
  case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
  case X86ISD::CMP:         return combineCMP(N, DAG);
  case ISD::ADD:            return combineAdd(N, DAG, DCI, Subtarget);
  case ISD::SUB:            return combineSub(N, DAG, DCI, Subtarget);
  case X86ISD::ADD:
  case X86ISD::SUB:         return combineX86AddSub(N, DAG, DCI);
  case X86ISD::SBB:         return combineSBB(N, DAG);
  case X86ISD::ADC:         return combineADC(N, DAG, DCI);
  case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:            return combineShiftLeft(N, DAG);
  case ISD::SRA:            return combineShiftRightArithmetic(N, DAG, Subtarget);
  case ISD::SRL:            return combineShiftRightLogical(N, DAG, DCI, Subtarget);
  case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
  case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
  case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
  case X86ISD::BEXTR:
  case X86ISD::BEXTRI:      return combineBEXTR(N, DAG, DCI, Subtarget);
  case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
  case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
  case ISD::STORE:          return combineStore(N, DAG, DCI, Subtarget);
  case ISD::MSTORE:         return combineMaskedStore(N, DAG, DCI, Subtarget);
  case X86ISD::VEXTRACT_STORE:
    return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
  case ISD::SINT_TO_FP:
  case ISD::STRICT_SINT_TO_FP:
    return combineSIntToFP(N, DAG, DCI, Subtarget);
  case ISD::UINT_TO_FP:
  case ISD::STRICT_UINT_TO_FP:
    return combineUIntToFP(N, DAG, Subtarget);
  case ISD::FADD:
  case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
  case X86ISD::VFCMULC:
  case X86ISD::VFMULC:      return combineFMulcFCMulc(N, DAG, Subtarget);
  case ISD::FNEG:           return combineFneg(N, DAG, DCI, Subtarget);
  case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
  case X86ISD::VTRUNC:      return combineVTRUNC(N, DAG, DCI);
  case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
  case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
  case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return combineFOr(N, DAG, DCI, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
  case X86ISD::CVTSI2P:
  case X86ISD::CVTUI2P:     return combineX86INT_TO_FP(N, DAG, DCI);
  case X86ISD::CVTP2SI:
  case X86ISD::CVTP2UI:
  case X86ISD::STRICT_CVTTP2SI:
  case X86ISD::CVTTP2SI:
  case X86ISD::STRICT_CVTTP2UI:
  case X86ISD::CVTTP2UI:
                            return combineCVTP2I_CVTTP2I(N, DAG, DCI);
  case X86ISD::STRICT_CVTPH2PS:
  case X86ISD::CVTPH2PS:    return combineCVTPH2PS(N, DAG, DCI);
  case X86ISD::BT:          return combineBT(N, DAG, DCI);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
  case ISD::ANY_EXTEND_VECTOR_INREG:
  case ISD::SIGN_EXTEND_VECTOR_INREG:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
    return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
  case ISD::SETCC:          return combineSetCC(N, DAG, DCI, Subtarget);
  case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
  case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
  case X86ISD::PACKSS:
  case X86ISD::PACKUS:      return combineVectorPack(N, DAG, DCI, Subtarget);
  case X86ISD::HADD:
  case X86ISD::HSUB:
  case X86ISD::FHADD:
  case X86ISD::FHSUB:       return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
  case X86ISD::VSHL:
  case X86ISD::VSRA:
  case X86ISD::VSRL:
    return combineVectorShiftVar(N, DAG, DCI, Subtarget);
  case X86ISD::VSHLI:
  case X86ISD::VSRAI:
  case X86ISD::VSRLI:
    return combineVectorShiftImm(N, DAG, DCI, Subtarget);
  case ISD::INSERT_VECTOR_ELT:
  case X86ISD::PINSRB:
  case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
  case X86ISD::SHUFP:       // Handle all target specific shuffles
  case X86ISD::INSERTPS:
  case X86ISD::EXTRQI:
  case X86ISD::INSERTQI:
  case X86ISD::VALIGN:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::BLENDI:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::MOVSH:
  case X86ISD::VBROADCAST:
  case X86ISD::VPPERM:
  case X86ISD::VPERMI:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::SHUF128:
  case X86ISD::VZEXT_MOVL:
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
  case X86ISD::FMADD_RND:
  case X86ISD::FMSUB:
  case X86ISD::STRICT_FMSUB:
  case X86ISD::FMSUB_RND:
  case X86ISD::FNMADD:
  case X86ISD::STRICT_FNMADD:
  case X86ISD::FNMADD_RND:
  case X86ISD::FNMSUB:
  case X86ISD::STRICT_FNMSUB:
  case X86ISD::FNMSUB_RND:
  case ISD::FMA:
  case ISD::STRICT_FMA:     return combineFMA(N, DAG, DCI, Subtarget);
  case X86ISD::FMADDSUB_RND:
  case X86ISD::FMSUBADD_RND:
  case X86ISD::FMADDSUB:
  case X86ISD::FMSUBADD:    return combineFMADDSUB(N, DAG, DCI);
  case X86ISD::MOVMSK:      return combineMOVMSK(N, DAG, DCI, Subtarget);
  case X86ISD::TESTP:       return combineTESTP(N, DAG, DCI, Subtarget);
  case X86ISD::MGATHER:
  case X86ISD::MSCATTER:
    return combineX86GatherScatter(N, DAG, DCI, Subtarget);
  case ISD::MGATHER:
  case ISD::MSCATTER:       return combineGatherScatter(N, DAG, DCI);
  case X86ISD::PCMPEQ:
  case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
  case X86ISD::PMULDQ:
  case X86ISD::PMULUDQ:     return combinePMULDQ(N, DAG, DCI, Subtarget);
  case X86ISD::VPMADDUBSW:
  case X86ISD::VPMADDWD:    return combineVPMADD(N, DAG, DCI);
  case X86ISD::KSHIFTL:
  case X86ISD::KSHIFTR:     return combineKSHIFT(N, DAG, DCI);
  case ISD::FP16_TO_FP:     return combineFP16_TO_FP(N, DAG, Subtarget);
  case ISD::STRICT_FP_EXTEND:
  case ISD::FP_EXTEND:      return combineFP_EXTEND(N, DAG, Subtarget);
  case ISD::STRICT_FP_ROUND:
  case ISD::FP_ROUND:       return combineFP_ROUND(N, DAG, Subtarget);
  case X86ISD::VBROADCAST_LOAD:
  case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
  case X86ISD::MOVDQ2Q:     return combineMOVDQ2Q(N, DAG);
  case X86ISD::PDEP:        return combinePDEP(N, DAG, DCI);
  }

  // No combine was registered for this opcode.
  return SDValue();
}
58285
/// X86 answer to the preferABDSToABSWithNSW query: unconditionally false,
/// regardless of \p VT.
// NOTE(review): the exact contract of this hook is defined where it is
// declared, which is not visible here - confirm against that declaration.
bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
  return false;
}
58289
58290 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
58291   if (!isTypeLegal(VT))
58292     return false;
58293
58294   // There are no vXi8 shifts.
58295   if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
58296     return false;
58297
58298   // TODO: Almost no 8-bit ops are desirable because they have no actual
58299   //       size/speed advantages vs. 32-bit ops, but they do have a major
58300   //       potential disadvantage by causing partial register stalls.
58301   //
58302   // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
58303   // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
58304   // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
58305   // check for a constant operand to the multiply.
58306   if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
58307     return false;
58308
58309   // i16 instruction encodings are longer and some i16 instructions are slow,
58310   // so those are not desirable.
58311   if (VT == MVT::i16) {
58312     switch (Opc) {
58313     default:
58314       break;
58315     case ISD::LOAD:
58316     case ISD::SIGN_EXTEND:
58317     case ISD::ZERO_EXTEND:
58318     case ISD::ANY_EXTEND:
58319     case ISD::SHL:
58320     case ISD::SRA:
58321     case ISD::SRL:
58322     case ISD::SUB:
58323     case ISD::ADD:
58324     case ISD::MUL:
58325     case ISD::AND:
58326     case ISD::OR:
58327     case ISD::XOR:
58328       return false;
58329     }
58330   }
58331
58332   // Any legal type not explicitly accounted for above here is desirable.
58333   return true;
58334 }
58335
58336 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
58337                                                   SDValue Value, SDValue Addr,
58338                                                   SelectionDAG &DAG) const {
58339   const Module *M = DAG.getMachineFunction().getMMI().getModule();
58340   Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
58341   if (IsCFProtectionSupported) {
58342     // In case control-flow branch protection is enabled, we need to add
58343     // notrack prefix to the indirect branch.
58344     // In order to do that we create NT_BRIND SDNode.
58345     // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
58346     return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
58347   }
58348
58349   return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
58350 }
58351
58352 TargetLowering::AndOrSETCCFoldKind
58353 X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
58354     const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
58355   using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
58356   EVT VT = LogicOp->getValueType(0);
58357   EVT OpVT = SETCC0->getOperand(0).getValueType();
58358   if (!VT.isInteger())
58359     return AndOrSETCCFoldKind::None;
58360
58361   if (VT.isVector())
58362     return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
58363                               (isOperationLegal(ISD::ABS, OpVT)
58364                                    ? AndOrSETCCFoldKind::ABS
58365                                    : AndOrSETCCFoldKind::None));
58366
58367   // Don't use `NotAnd` as even though `not` is generally shorter code size than
58368   // `add`, `add` can lower to LEA which can save moves / spills. Any case where
58369   // `NotAnd` applies, `AddAnd` does as well.
58370   // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
58371   // if we change that to `andn Y, X` it may be worth prefering `NotAnd` here.
58372   return AndOrSETCCFoldKind::AddAnd;
58373 }
58374
/// Decide whether \p Op should be promoted to a wider type.
///
/// Only i16 operations and i8 multiplies by a constant are candidates. For
/// those, promotion is declined when it would get in the way of folding a load
/// into the operation or of forming a read-modify-write with a store to the
/// same base pointer (normal or atomic).
///
/// \param Op   The operation under consideration.
/// \param PVT  Set to MVT::i32 when the function returns true; left untouched
///             otherwise.
/// \returns true when promotion is desirable.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
                             isa<ConstantSDNode>(Op.getOperand(1));

  // i16 is legal, but undesirable since i16 instruction encodings are longer
  // and some i16 instructions are slow.
  // 8-bit multiply-by-constant can usually be expanded to something cheaper
  // using LEA and/or other ALU ops.
  if (VT != MVT::i16 && !Is8BitMulByConstant)
    return false;

  // True when Op's single user is a normal store whose base pointer equals
  // that of Load. Load is cast<> to LoadSDNode unconditionally, so every call
  // below is guarded by X86::mayFoldLoad on the same value first.
  // NOTE(review): this relies on mayFoldLoad only accepting load nodes -
  // confirm against its definition.
  auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
    if (!Op.hasOneUse())
      return false;
    SDNode *User = *Op->use_begin();
    if (!ISD::isNormalStore(User))
      return false;
    auto *Ld = cast<LoadSDNode>(Load);
    auto *St = cast<StoreSDNode>(User);
    return Ld->getBasePtr() == St->getBasePtr();
  };

  // Atomic counterpart: Load is a single-use ATOMIC_LOAD, Op's single user is
  // an ATOMIC_STORE, and both address the same base pointer.
  auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
    if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
      return false;
    if (!Op.hasOneUse())
      return false;
    SDNode *User = *Op->use_begin();
    if (User->getOpcode() != ISD::ATOMIC_STORE)
      return false;
    auto *Ld = cast<AtomicSDNode>(Load);
    auto *St = cast<AtomicSDNode>(User);
    return Ld->getBasePtr() == St->getBasePtr();
  };

  // Set for the commutative opcodes, where the second operand may also act as
  // the load side of the checks below.
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: return false;
  // Extensions are always promoted.
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    break;
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
      return false;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    [[fallthrough]];
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    // Avoid disabling potential load folding opportunities.
    // A foldable load in operand 1 blocks promotion unless the op is
    // commutative with a constant operand 0 and (for non-MUL) no
    // read-modify-write would be formed.
    if (X86::mayFoldLoad(N1, Subtarget) &&
        (!Commute || !isa<ConstantSDNode>(N0) ||
         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
      return false;
    // A foldable load in operand 0 blocks promotion when the op is commutative
    // with a non-constant operand 1, or when (for non-MUL) a
    // read-modify-write would be formed.
    if (X86::mayFoldLoad(N0, Subtarget) &&
        ((Commute && !isa<ConstantSDNode>(N1)) ||
         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
      return false;
    // Likewise keep atomic load/op/store sequences in their original width.
    if (IsFoldableAtomicRMW(N0, Op) ||
        (Commute && IsFoldableAtomicRMW(N1, Op)))
      return false;
  }
  }

  // Every path that reaches here wants promotion to i32.
  PVT = MVT::i32;
  return true;
}
58455
58456 //===----------------------------------------------------------------------===//
58457 //                           X86 Inline Assembly Support
58458 //===----------------------------------------------------------------------===//
58459
58460 // Helper to match a string separated by whitespace.
58461 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
58462   S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
58463
58464   for (StringRef Piece : Pieces) {
58465     if (!S.startswith(Piece)) // Check if the piece matches.
58466       return false;
58467
58468     S = S.substr(Piece.size());
58469     StringRef::size_type Pos = S.find_first_not_of(" \t");
58470     if (Pos == 0) // We matched a prefix.
58471       return false;
58472
58473     S = S.substr(Pos);
58474   }
58475
58476   return S.empty();
58477 }
58478
58479 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
58480
58481   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
58482     if (llvm::is_contained(AsmPieces, "~{cc}") &&
58483         llvm::is_contained(AsmPieces, "~{flags}") &&
58484         llvm::is_contained(AsmPieces, "~{fpsr}")) {
58485
58486       if (AsmPieces.size() == 3)
58487         return true;
58488       else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
58489         return true;
58490     }
58491   }
58492   return false;
58493 }
58494
58495 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
58496   InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
58497
58498   const std::string &AsmStr = IA->getAsmString();
58499
58500   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
58501   if (!Ty || Ty->getBitWidth() % 16 != 0)
58502     return false;
58503
58504   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
58505   SmallVector<StringRef, 4> AsmPieces;
58506   SplitString(AsmStr, AsmPieces, ";\n");
58507
58508   switch (AsmPieces.size()) {
58509   default: return false;
58510   case 1:
58511     // FIXME: this should verify that we are targeting a 486 or better.  If not,
58512     // we will turn this bswap into something that will be lowered to logical
58513     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
58514     // lower so don't worry about this.
58515     // bswap $0
58516     if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
58517         matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
58518         matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
58519         matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
58520         matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
58521         matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
58522       // No need to check constraints, nothing other than the equivalent of
58523       // "=r,0" would be valid here.
58524       return IntrinsicLowering::LowerToByteSwap(CI);
58525     }
58526
58527     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
58528     if (CI->getType()->isIntegerTy(16) &&
58529         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
58530         (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
58531          matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
58532       AsmPieces.clear();
58533       StringRef ConstraintsStr = IA->getConstraintString();
58534       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
58535       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
58536       if (clobbersFlagRegisters(AsmPieces))
58537         return IntrinsicLowering::LowerToByteSwap(CI);
58538     }
58539     break;
58540   case 3:
58541     if (CI->getType()->isIntegerTy(32) &&
58542         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
58543         matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
58544         matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
58545         matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
58546       AsmPieces.clear();
58547       StringRef ConstraintsStr = IA->getConstraintString();
58548       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
58549       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
58550       if (clobbersFlagRegisters(AsmPieces))
58551         return IntrinsicLowering::LowerToByteSwap(CI);
58552     }
58553
58554     if (CI->getType()->isIntegerTy(64)) {
58555       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
58556       if (Constraints.size() >= 2 &&
58557           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
58558           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
58559         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
58560         if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
58561             matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
58562             matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
58563           return IntrinsicLowering::LowerToByteSwap(CI);
58564       }
58565     }
58566     break;
58567   }
58568   return false;
58569 }
58570
58571 static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
58572   X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
58573                            .Case("{@cca}", X86::COND_A)
58574                            .Case("{@ccae}", X86::COND_AE)
58575                            .Case("{@ccb}", X86::COND_B)
58576                            .Case("{@ccbe}", X86::COND_BE)
58577                            .Case("{@ccc}", X86::COND_B)
58578                            .Case("{@cce}", X86::COND_E)
58579                            .Case("{@ccz}", X86::COND_E)
58580                            .Case("{@ccg}", X86::COND_G)
58581                            .Case("{@ccge}", X86::COND_GE)
58582                            .Case("{@ccl}", X86::COND_L)
58583                            .Case("{@ccle}", X86::COND_LE)
58584                            .Case("{@ccna}", X86::COND_BE)
58585                            .Case("{@ccnae}", X86::COND_B)
58586                            .Case("{@ccnb}", X86::COND_AE)
58587                            .Case("{@ccnbe}", X86::COND_A)
58588                            .Case("{@ccnc}", X86::COND_AE)
58589                            .Case("{@ccne}", X86::COND_NE)
58590                            .Case("{@ccnz}", X86::COND_NE)
58591                            .Case("{@ccng}", X86::COND_LE)
58592                            .Case("{@ccnge}", X86::COND_L)
58593                            .Case("{@ccnl}", X86::COND_GE)
58594                            .Case("{@ccnle}", X86::COND_G)
58595                            .Case("{@ccno}", X86::COND_NO)
58596                            .Case("{@ccnp}", X86::COND_NP)
58597                            .Case("{@ccns}", X86::COND_NS)
58598                            .Case("{@cco}", X86::COND_O)
58599                            .Case("{@ccp}", X86::COND_P)
58600                            .Case("{@ccs}", X86::COND_S)
58601                            .Default(X86::COND_INVALID);
58602   return Cond;
58603 }
58604
58605 /// Given a constraint letter, return the type of constraint for this target.
58606 X86TargetLowering::ConstraintType
58607 X86TargetLowering::getConstraintType(StringRef Constraint) const {
58608   if (Constraint.size() == 1) {
58609     switch (Constraint[0]) {
58610     case 'R':
58611     case 'q':
58612     case 'Q':
58613     case 'f':
58614     case 't':
58615     case 'u':
58616     case 'y':
58617     case 'x':
58618     case 'v':
58619     case 'l':
58620     case 'k': // AVX512 masking registers.
58621       return C_RegisterClass;
58622     case 'a':
58623     case 'b':
58624     case 'c':
58625     case 'd':
58626     case 'S':
58627     case 'D':
58628     case 'A':
58629       return C_Register;
58630     case 'I':
58631     case 'J':
58632     case 'K':
58633     case 'N':
58634     case 'G':
58635     case 'L':
58636     case 'M':
58637       return C_Immediate;
58638     case 'C':
58639     case 'e':
58640     case 'Z':
58641       return C_Other;
58642     default:
58643       break;
58644     }
58645   }
58646   else if (Constraint.size() == 2) {
58647     switch (Constraint[0]) {
58648     default:
58649       break;
58650     case 'Y':
58651       switch (Constraint[1]) {
58652       default:
58653         break;
58654       case 'z':
58655         return C_Register;
58656       case 'i':
58657       case 'm':
58658       case 'k':
58659       case 't':
58660       case '2':
58661         return C_RegisterClass;
58662       }
58663     }
58664   } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58665     return C_Other;
58666   return TargetLowering::getConstraintType(Constraint);
58667 }
58668
58669 /// Examine constraint type and operand type and determine a weight value.
58670 /// This object must already have been set up with the operand type
58671 /// and the current alternative constraint selected.
58672 TargetLowering::ConstraintWeight
58673   X86TargetLowering::getSingleConstraintMatchWeight(
58674     AsmOperandInfo &info, const char *constraint) const {
58675   ConstraintWeight weight = CW_Invalid;
58676   Value *CallOperandVal = info.CallOperandVal;
58677     // If we don't have a value, we can't do a match,
58678     // but allow it at the lowest weight.
58679   if (!CallOperandVal)
58680     return CW_Default;
58681   Type *type = CallOperandVal->getType();
58682   // Look at the constraint type.
58683   switch (*constraint) {
58684   default:
58685     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
58686     [[fallthrough]];
58687   case 'R':
58688   case 'q':
58689   case 'Q':
58690   case 'a':
58691   case 'b':
58692   case 'c':
58693   case 'd':
58694   case 'S':
58695   case 'D':
58696   case 'A':
58697     if (CallOperandVal->getType()->isIntegerTy())
58698       weight = CW_SpecificReg;
58699     break;
58700   case 'f':
58701   case 't':
58702   case 'u':
58703     if (type->isFloatingPointTy())
58704       weight = CW_SpecificReg;
58705     break;
58706   case 'y':
58707     if (type->isX86_MMXTy() && Subtarget.hasMMX())
58708       weight = CW_SpecificReg;
58709     break;
58710   case 'Y':
58711     if (StringRef(constraint).size() != 2)
58712       break;
58713     switch (constraint[1]) {
58714       default:
58715         return CW_Invalid;
58716       // XMM0
58717       case 'z':
58718         if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58719             ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
58720             ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
58721           return CW_SpecificReg;
58722         return CW_Invalid;
58723       // Conditional OpMask regs (AVX512)
58724       case 'k':
58725         if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58726           return CW_Register;
58727         return CW_Invalid;
58728       // Any MMX reg
58729       case 'm':
58730         if (type->isX86_MMXTy() && Subtarget.hasMMX())
58731           return weight;
58732         return CW_Invalid;
58733       // Any SSE reg when ISA >= SSE2, same as 'x'
58734       case 'i':
58735       case 't':
58736       case '2':
58737         if (!Subtarget.hasSSE2())
58738           return CW_Invalid;
58739         break;
58740     }
58741     break;
58742   case 'v':
58743     if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
58744       weight = CW_Register;
58745     [[fallthrough]];
58746   case 'x':
58747     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58748         ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
58749       weight = CW_Register;
58750     break;
58751   case 'k':
58752     // Enable conditional vector operations using %k<#> registers.
58753     if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58754       weight = CW_Register;
58755     break;
58756   case 'I':
58757     if (auto *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
58758       if (C->getZExtValue() <= 31)
58759         weight = CW_Constant;
58760     }
58761     break;
58762   case 'J':
58763     if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58764       if (C->getZExtValue() <= 63)
58765         weight = CW_Constant;
58766     }
58767     break;
58768   case 'K':
58769     if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58770       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
58771         weight = CW_Constant;
58772     }
58773     break;
58774   case 'L':
58775     if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58776       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
58777         weight = CW_Constant;
58778     }
58779     break;
58780   case 'M':
58781     if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58782       if (C->getZExtValue() <= 3)
58783         weight = CW_Constant;
58784     }
58785     break;
58786   case 'N':
58787     if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58788       if (C->getZExtValue() <= 0xff)
58789         weight = CW_Constant;
58790     }
58791     break;
58792   case 'G':
58793   case 'C':
58794     if (isa<ConstantFP>(CallOperandVal)) {
58795       weight = CW_Constant;
58796     }
58797     break;
58798   case 'e':
58799     if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58800       if ((C->getSExtValue() >= -0x80000000LL) &&
58801           (C->getSExtValue() <= 0x7fffffffLL))
58802         weight = CW_Constant;
58803     }
58804     break;
58805   case 'Z':
58806     if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58807       if (C->getZExtValue() <= 0xffffffff)
58808         weight = CW_Constant;
58809     }
58810     break;
58811   }
58812   return weight;
58813 }
58814
58815 /// Try to replace an X constraint, which matches anything, with another that
58816 /// has more specific requirements based on the type of the corresponding
58817 /// operand.
58818 const char *X86TargetLowering::
58819 LowerXConstraint(EVT ConstraintVT) const {
58820   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
58821   // 'f' like normal targets.
58822   if (ConstraintVT.isFloatingPoint()) {
58823     if (Subtarget.hasSSE1())
58824       return "x";
58825   }
58826
58827   return TargetLowering::LowerXConstraint(ConstraintVT);
58828 }
58829
58830 // Lower @cc targets via setcc.
58831 SDValue X86TargetLowering::LowerAsmOutputForConstraint(
58832     SDValue &Chain, SDValue &Glue, const SDLoc &DL,
58833     const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
58834   X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
58835   if (Cond == X86::COND_INVALID)
58836     return SDValue();
58837   // Check that return type is valid.
58838   if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
58839       OpInfo.ConstraintVT.getSizeInBits() < 8)
58840     report_fatal_error("Glue output operand is of invalid type");
58841
58842   // Get EFLAGS register. Only update chain when copyfrom is glued.
58843   if (Glue.getNode()) {
58844     Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
58845     Chain = Glue.getValue(1);
58846   } else
58847     Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
58848   // Extract CC code.
58849   SDValue CC = getSETCC(Cond, Glue, DL, DAG);
58850   // Extend to 32-bits
58851   SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
58852
58853   return Result;
58854 }
58855
58856 /// Lower the specified operand into the Ops vector.
58857 /// If it is invalid, don't add anything to Ops.
58858 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
58859                                                      std::string &Constraint,
58860                                                      std::vector<SDValue>&Ops,
58861                                                      SelectionDAG &DAG) const {
58862   SDValue Result;
58863
58864   // Only support length 1 constraints for now.
58865   if (Constraint.length() > 1) return;
58866
58867   char ConstraintLetter = Constraint[0];
58868   switch (ConstraintLetter) {
58869   default: break;
58870   case 'I':
58871     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58872       if (C->getZExtValue() <= 31) {
58873         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58874                                        Op.getValueType());
58875         break;
58876       }
58877     }
58878     return;
58879   case 'J':
58880     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58881       if (C->getZExtValue() <= 63) {
58882         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58883                                        Op.getValueType());
58884         break;
58885       }
58886     }
58887     return;
58888   case 'K':
58889     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58890       if (isInt<8>(C->getSExtValue())) {
58891         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58892                                        Op.getValueType());
58893         break;
58894       }
58895     }
58896     return;
58897   case 'L':
58898     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58899       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
58900           (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
58901         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
58902                                        Op.getValueType());
58903         break;
58904       }
58905     }
58906     return;
58907   case 'M':
58908     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58909       if (C->getZExtValue() <= 3) {
58910         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58911                                        Op.getValueType());
58912         break;
58913       }
58914     }
58915     return;
58916   case 'N':
58917     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58918       if (C->getZExtValue() <= 255) {
58919         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58920                                        Op.getValueType());
58921         break;
58922       }
58923     }
58924     return;
58925   case 'O':
58926     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58927       if (C->getZExtValue() <= 127) {
58928         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58929                                        Op.getValueType());
58930         break;
58931       }
58932     }
58933     return;
58934   case 'e': {
58935     // 32-bit signed value
58936     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58937       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58938                                            C->getSExtValue())) {
58939         // Widen to 64 bits here to get it sign extended.
58940         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
58941         break;
58942       }
58943     // FIXME gcc accepts some relocatable values here too, but only in certain
58944     // memory models; it's complicated.
58945     }
58946     return;
58947   }
58948   case 'Z': {
58949     // 32-bit unsigned value
58950     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58951       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58952                                            C->getZExtValue())) {
58953         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58954                                        Op.getValueType());
58955         break;
58956       }
58957     }
58958     // FIXME gcc accepts some relocatable values here too, but only in certain
58959     // memory models; it's complicated.
58960     return;
58961   }
58962   case 'i': {
58963     // Literal immediates are always ok.
58964     if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
58965       bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
58966       BooleanContent BCont = getBooleanContents(MVT::i64);
58967       ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
58968                                     : ISD::SIGN_EXTEND;
58969       int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
58970                                                   : CST->getSExtValue();
58971       Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
58972       break;
58973     }
58974
58975     // In any sort of PIC mode addresses need to be computed at runtime by
58976     // adding in a register or some sort of table lookup.  These can't
58977     // be used as immediates. BlockAddresses and BasicBlocks are fine though.
58978     if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
58979         !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
58980       return;
58981
58982     // If we are in non-pic codegen mode, we allow the address of a global (with
58983     // an optional displacement) to be used with 'i'.
58984     if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
58985       // If we require an extra load to get this address, as in PIC mode, we
58986       // can't accept it.
58987       if (isGlobalStubReference(
58988               Subtarget.classifyGlobalReference(GA->getGlobal())))
58989         return;
58990     break;
58991   }
58992   }
58993
58994   if (Result.getNode()) {
58995     Ops.push_back(Result);
58996     return;
58997   }
58998   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
58999 }
59000
59001 /// Check if \p RC is a general purpose register class.
59002 /// I.e., GR* or one of their variant.
59003 static bool isGRClass(const TargetRegisterClass &RC) {
59004   return RC.hasSuperClassEq(&X86::GR8RegClass) ||
59005          RC.hasSuperClassEq(&X86::GR16RegClass) ||
59006          RC.hasSuperClassEq(&X86::GR32RegClass) ||
59007          RC.hasSuperClassEq(&X86::GR64RegClass) ||
59008          RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
59009 }
59010
59011 /// Check if \p RC is a vector register class.
59012 /// I.e., FR* / VR* or one of their variant.
59013 static bool isFRClass(const TargetRegisterClass &RC) {
59014   return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
59015          RC.hasSuperClassEq(&X86::FR32XRegClass) ||
59016          RC.hasSuperClassEq(&X86::FR64XRegClass) ||
59017          RC.hasSuperClassEq(&X86::VR128XRegClass) ||
59018          RC.hasSuperClassEq(&X86::VR256XRegClass) ||
59019          RC.hasSuperClassEq(&X86::VR512RegClass);
59020 }
59021
59022 /// Check if \p RC is a mask register class.
59023 /// I.e., VK* or one of their variant.
59024 static bool isVKClass(const TargetRegisterClass &RC) {
59025   return RC.hasSuperClassEq(&X86::VK1RegClass) ||
59026          RC.hasSuperClassEq(&X86::VK2RegClass) ||
59027          RC.hasSuperClassEq(&X86::VK4RegClass) ||
59028          RC.hasSuperClassEq(&X86::VK8RegClass) ||
59029          RC.hasSuperClassEq(&X86::VK16RegClass) ||
59030          RC.hasSuperClassEq(&X86::VK32RegClass) ||
59031          RC.hasSuperClassEq(&X86::VK64RegClass);
59032 }
59033
59034 std::pair<unsigned, const TargetRegisterClass *>
59035 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
59036                                                 StringRef Constraint,
59037                                                 MVT VT) const {
59038   // First, see if this is a constraint that directly corresponds to an LLVM
59039   // register class.
59040   if (Constraint.size() == 1) {
59041     // GCC Constraint Letters
59042     switch (Constraint[0]) {
59043     default: break;
59044     // 'A' means [ER]AX + [ER]DX.
59045     case 'A':
59046       if (Subtarget.is64Bit())
59047         return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
59048       assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
59049              "Expecting 64, 32 or 16 bit subtarget");
59050       return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
59051
59052       // TODO: Slight differences here in allocation order and leaving
59053       // RIP in the class. Do they matter any more here than they do
59054       // in the normal allocation?
59055     case 'k':
59056       if (Subtarget.hasAVX512()) {
59057         if (VT == MVT::i1)
59058           return std::make_pair(0U, &X86::VK1RegClass);
59059         if (VT == MVT::i8)
59060           return std::make_pair(0U, &X86::VK8RegClass);
59061         if (VT == MVT::i16)
59062           return std::make_pair(0U, &X86::VK16RegClass);
59063       }
59064       if (Subtarget.hasBWI()) {
59065         if (VT == MVT::i32)
59066           return std::make_pair(0U, &X86::VK32RegClass);
59067         if (VT == MVT::i64)
59068           return std::make_pair(0U, &X86::VK64RegClass);
59069       }
59070       break;
59071     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
59072       if (Subtarget.is64Bit()) {
59073         if (VT == MVT::i8 || VT == MVT::i1)
59074           return std::make_pair(0U, &X86::GR8RegClass);
59075         if (VT == MVT::i16)
59076           return std::make_pair(0U, &X86::GR16RegClass);
59077         if (VT == MVT::i32 || VT == MVT::f32)
59078           return std::make_pair(0U, &X86::GR32RegClass);
59079         if (VT != MVT::f80 && !VT.isVector())
59080           return std::make_pair(0U, &X86::GR64RegClass);
59081         break;
59082       }
59083       [[fallthrough]];
59084       // 32-bit fallthrough
59085     case 'Q':   // Q_REGS
59086       if (VT == MVT::i8 || VT == MVT::i1)
59087         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
59088       if (VT == MVT::i16)
59089         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
59090       if (VT == MVT::i32 || VT == MVT::f32 ||
59091           (!VT.isVector() && !Subtarget.is64Bit()))
59092         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
59093       if (VT != MVT::f80 && !VT.isVector())
59094         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
59095       break;
59096     case 'r':   // GENERAL_REGS
59097     case 'l':   // INDEX_REGS
59098       if (VT == MVT::i8 || VT == MVT::i1)
59099         return std::make_pair(0U, &X86::GR8RegClass);
59100       if (VT == MVT::i16)
59101         return std::make_pair(0U, &X86::GR16RegClass);
59102       if (VT == MVT::i32 || VT == MVT::f32 ||
59103           (!VT.isVector() && !Subtarget.is64Bit()))
59104         return std::make_pair(0U, &X86::GR32RegClass);
59105       if (VT != MVT::f80 && !VT.isVector())
59106         return std::make_pair(0U, &X86::GR64RegClass);
59107       break;
59108     case 'R':   // LEGACY_REGS
59109       if (VT == MVT::i8 || VT == MVT::i1)
59110         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
59111       if (VT == MVT::i16)
59112         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
59113       if (VT == MVT::i32 || VT == MVT::f32 ||
59114           (!VT.isVector() && !Subtarget.is64Bit()))
59115         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
59116       if (VT != MVT::f80 && !VT.isVector())
59117         return std::make_pair(0U, &X86::GR64_NOREXRegClass);
59118       break;
59119     case 'f':  // FP Stack registers.
59120       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
59121       // value to the correct fpstack register class.
59122       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
59123         return std::make_pair(0U, &X86::RFP32RegClass);
59124       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
59125         return std::make_pair(0U, &X86::RFP64RegClass);
59126       if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
59127         return std::make_pair(0U, &X86::RFP80RegClass);
59128       break;
59129     case 'y':   // MMX_REGS if MMX allowed.
59130       if (!Subtarget.hasMMX()) break;
59131       return std::make_pair(0U, &X86::VR64RegClass);
59132     case 'v':
59133     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
59134       if (!Subtarget.hasSSE1()) break;
59135       bool VConstraint = (Constraint[0] == 'v');
59136
59137       switch (VT.SimpleTy) {
59138       default: break;
59139       // Scalar SSE types.
59140       case MVT::f16:
59141         if (VConstraint && Subtarget.hasFP16())
59142           return std::make_pair(0U, &X86::FR16XRegClass);
59143         break;
59144       case MVT::f32:
59145       case MVT::i32:
59146         if (VConstraint && Subtarget.hasVLX())
59147           return std::make_pair(0U, &X86::FR32XRegClass);
59148         return std::make_pair(0U, &X86::FR32RegClass);
59149       case MVT::f64:
59150       case MVT::i64:
59151         if (VConstraint && Subtarget.hasVLX())
59152           return std::make_pair(0U, &X86::FR64XRegClass);
59153         return std::make_pair(0U, &X86::FR64RegClass);
59154       case MVT::i128:
59155         if (Subtarget.is64Bit()) {
59156           if (VConstraint && Subtarget.hasVLX())
59157             return std::make_pair(0U, &X86::VR128XRegClass);
59158           return std::make_pair(0U, &X86::VR128RegClass);
59159         }
59160         break;
59161       // Vector types and fp128.
59162       case MVT::v8f16:
59163         if (!Subtarget.hasFP16())
59164           break;
59165         [[fallthrough]];
59166       case MVT::f128:
59167       case MVT::v16i8:
59168       case MVT::v8i16:
59169       case MVT::v4i32:
59170       case MVT::v2i64:
59171       case MVT::v4f32:
59172       case MVT::v2f64:
59173         if (VConstraint && Subtarget.hasVLX())
59174           return std::make_pair(0U, &X86::VR128XRegClass);
59175         return std::make_pair(0U, &X86::VR128RegClass);
59176       // AVX types.
59177       case MVT::v16f16:
59178         if (!Subtarget.hasFP16())
59179           break;
59180         [[fallthrough]];
59181       case MVT::v32i8:
59182       case MVT::v16i16:
59183       case MVT::v8i32:
59184       case MVT::v4i64:
59185       case MVT::v8f32:
59186       case MVT::v4f64:
59187         if (VConstraint && Subtarget.hasVLX())
59188           return std::make_pair(0U, &X86::VR256XRegClass);
59189         if (Subtarget.hasAVX())
59190           return std::make_pair(0U, &X86::VR256RegClass);
59191         break;
59192       case MVT::v32f16:
59193         if (!Subtarget.hasFP16())
59194           break;
59195         [[fallthrough]];
59196       case MVT::v64i8:
59197       case MVT::v32i16:
59198       case MVT::v8f64:
59199       case MVT::v16f32:
59200       case MVT::v16i32:
59201       case MVT::v8i64:
59202         if (!Subtarget.hasAVX512()) break;
59203         if (VConstraint)
59204           return std::make_pair(0U, &X86::VR512RegClass);
59205         return std::make_pair(0U, &X86::VR512_0_15RegClass);
59206       }
59207       break;
59208     }
59209   } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
59210     switch (Constraint[1]) {
59211     default:
59212       break;
59213     case 'i':
59214     case 't':
59215     case '2':
59216       return getRegForInlineAsmConstraint(TRI, "x", VT);
59217     case 'm':
59218       if (!Subtarget.hasMMX()) break;
59219       return std::make_pair(0U, &X86::VR64RegClass);
59220     case 'z':
59221       if (!Subtarget.hasSSE1()) break;
59222       switch (VT.SimpleTy) {
59223       default: break;
59224       // Scalar SSE types.
59225       case MVT::f16:
59226         if (!Subtarget.hasFP16())
59227           break;
59228         return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
59229       case MVT::f32:
59230       case MVT::i32:
59231         return std::make_pair(X86::XMM0, &X86::FR32RegClass);
59232       case MVT::f64:
59233       case MVT::i64:
59234         return std::make_pair(X86::XMM0, &X86::FR64RegClass);
59235       case MVT::v8f16:
59236         if (!Subtarget.hasFP16())
59237           break;
59238         [[fallthrough]];
59239       case MVT::f128:
59240       case MVT::v16i8:
59241       case MVT::v8i16:
59242       case MVT::v4i32:
59243       case MVT::v2i64:
59244       case MVT::v4f32:
59245       case MVT::v2f64:
59246         return std::make_pair(X86::XMM0, &X86::VR128RegClass);
59247       // AVX types.
59248       case MVT::v16f16:
59249         if (!Subtarget.hasFP16())
59250           break;
59251         [[fallthrough]];
59252       case MVT::v32i8:
59253       case MVT::v16i16:
59254       case MVT::v8i32:
59255       case MVT::v4i64:
59256       case MVT::v8f32:
59257       case MVT::v4f64:
59258         if (Subtarget.hasAVX())
59259           return std::make_pair(X86::YMM0, &X86::VR256RegClass);
59260         break;
59261       case MVT::v32f16:
59262         if (!Subtarget.hasFP16())
59263           break;
59264         [[fallthrough]];
59265       case MVT::v64i8:
59266       case MVT::v32i16:
59267       case MVT::v8f64:
59268       case MVT::v16f32:
59269       case MVT::v16i32:
59270       case MVT::v8i64:
59271         if (Subtarget.hasAVX512())
59272           return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
59273         break;
59274       }
59275       break;
59276     case 'k':
59277       // This register class doesn't allocate k0 for masked vector operation.
59278       if (Subtarget.hasAVX512()) {
59279         if (VT == MVT::i1)
59280           return std::make_pair(0U, &X86::VK1WMRegClass);
59281         if (VT == MVT::i8)
59282           return std::make_pair(0U, &X86::VK8WMRegClass);
59283         if (VT == MVT::i16)
59284           return std::make_pair(0U, &X86::VK16WMRegClass);
59285       }
59286       if (Subtarget.hasBWI()) {
59287         if (VT == MVT::i32)
59288           return std::make_pair(0U, &X86::VK32WMRegClass);
59289         if (VT == MVT::i64)
59290           return std::make_pair(0U, &X86::VK64WMRegClass);
59291       }
59292       break;
59293     }
59294   }
59295
59296   if (parseConstraintCode(Constraint) != X86::COND_INVALID)
59297     return std::make_pair(0U, &X86::GR32RegClass);
59298
59299   // Use the default implementation in TargetLowering to convert the register
59300   // constraint into a member of a register class.
59301   std::pair<Register, const TargetRegisterClass*> Res;
59302   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
59303
59304   // Not found as a standard register?
59305   if (!Res.second) {
59306     // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
59307     // to/from f80.
59308     if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
59309       // Map st(0) -> st(7) -> ST0
59310       if (Constraint.size() == 7 && Constraint[0] == '{' &&
59311           tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
59312           Constraint[3] == '(' &&
59313           (Constraint[4] >= '0' && Constraint[4] <= '7') &&
59314           Constraint[5] == ')' && Constraint[6] == '}') {
59315         // st(7) is not allocatable and thus not a member of RFP80. Return
59316         // singleton class in cases where we have a reference to it.
59317         if (Constraint[4] == '7')
59318           return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
59319         return std::make_pair(X86::FP0 + Constraint[4] - '0',
59320                               &X86::RFP80RegClass);
59321       }
59322
59323       // GCC allows "st(0)" to be called just plain "st".
59324       if (StringRef("{st}").equals_insensitive(Constraint))
59325         return std::make_pair(X86::FP0, &X86::RFP80RegClass);
59326     }
59327
59328     // flags -> EFLAGS
59329     if (StringRef("{flags}").equals_insensitive(Constraint))
59330       return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
59331
59332     // dirflag -> DF
59333     // Only allow for clobber.
59334     if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
59335         VT == MVT::Other)
59336       return std::make_pair(X86::DF, &X86::DFCCRRegClass);
59337
59338     // fpsr -> FPSW
59339     if (StringRef("{fpsr}").equals_insensitive(Constraint))
59340       return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
59341
59342     return Res;
59343   }
59344
59345   // Make sure it isn't a register that requires 64-bit mode.
59346   if (!Subtarget.is64Bit() &&
59347       (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
59348       TRI->getEncodingValue(Res.first) >= 8) {
59349     // Register requires REX prefix, but we're in 32-bit mode.
59350     return std::make_pair(0, nullptr);
59351   }
59352
59353   // Make sure it isn't a register that requires AVX512.
59354   if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
59355       TRI->getEncodingValue(Res.first) & 0x10) {
59356     // Register requires EVEX prefix.
59357     return std::make_pair(0, nullptr);
59358   }
59359
59360   // Otherwise, check to see if this is a register class of the wrong value
59361   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
59362   // turn into {ax},{dx}.
59363   // MVT::Other is used to specify clobber names.
59364   if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
59365     return Res;   // Correct type already, nothing to do.
59366
59367   // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
59368   // return "eax". This should even work for things like getting 64bit integer
59369   // registers when given an f64 type.
59370   const TargetRegisterClass *Class = Res.second;
59371   // The generic code will match the first register class that contains the
59372   // given register. Thus, based on the ordering of the tablegened file,
59373   // the "plain" GR classes might not come first.
59374   // Therefore, use a helper method.
59375   if (isGRClass(*Class)) {
59376     unsigned Size = VT.getSizeInBits();
59377     if (Size == 1) Size = 8;
59378     if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
59379       return std::make_pair(0, nullptr);
59380     Register DestReg = getX86SubSuperRegister(Res.first, Size);
59381     if (DestReg.isValid()) {
59382       bool is64Bit = Subtarget.is64Bit();
59383       const TargetRegisterClass *RC =
59384           Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
59385         : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
59386         : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
59387         : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
59388       if (Size == 64 && !is64Bit) {
59389         // Model GCC's behavior here and select a fixed pair of 32-bit
59390         // registers.
59391         switch (DestReg) {
59392         case X86::RAX:
59393           return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
59394         case X86::RDX:
59395           return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
59396         case X86::RCX:
59397           return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
59398         case X86::RBX:
59399           return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
59400         case X86::RSI:
59401           return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
59402         case X86::RDI:
59403           return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
59404         case X86::RBP:
59405           return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
59406         default:
59407           return std::make_pair(0, nullptr);
59408         }
59409       }
59410       if (RC && RC->contains(DestReg))
59411         return std::make_pair(DestReg, RC);
59412       return Res;
59413     }
59414     // No register found/type mismatch.
59415     return std::make_pair(0, nullptr);
59416   } else if (isFRClass(*Class)) {
59417     // Handle references to XMM physical registers that got mapped into the
59418     // wrong class.  This can happen with constraints like {xmm0} where the
59419     // target independent register mapper will just pick the first match it can
59420     // find, ignoring the required type.
59421
59422     // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
59423     if (VT == MVT::f16)
59424       Res.second = &X86::FR16XRegClass;
59425     else if (VT == MVT::f32 || VT == MVT::i32)
59426       Res.second = &X86::FR32XRegClass;
59427     else if (VT == MVT::f64 || VT == MVT::i64)
59428       Res.second = &X86::FR64XRegClass;
59429     else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
59430       Res.second = &X86::VR128XRegClass;
59431     else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
59432       Res.second = &X86::VR256XRegClass;
59433     else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
59434       Res.second = &X86::VR512RegClass;
59435     else {
59436       // Type mismatch and not a clobber: Return an error;
59437       Res.first = 0;
59438       Res.second = nullptr;
59439     }
59440   } else if (isVKClass(*Class)) {
59441     if (VT == MVT::i1)
59442       Res.second = &X86::VK1RegClass;
59443     else if (VT == MVT::i8)
59444       Res.second = &X86::VK8RegClass;
59445     else if (VT == MVT::i16)
59446       Res.second = &X86::VK16RegClass;
59447     else if (VT == MVT::i32)
59448       Res.second = &X86::VK32RegClass;
59449     else if (VT == MVT::i64)
59450       Res.second = &X86::VK64RegClass;
59451     else {
59452       // Type mismatch and not a clobber: Return an error;
59453       Res.first = 0;
59454       Res.second = nullptr;
59455     }
59456   }
59457
59458   return Res;
59459 }
59460
59461 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
59462   // Integer division on x86 is expensive. However, when aggressively optimizing
59463   // for code size, we prefer to use a div instruction, as it is usually smaller
59464   // than the alternative sequence.
59465   // The exception to this is vector division. Since x86 doesn't have vector
59466   // integer division, leaving the division as-is is a loss even in terms of
59467   // size, because it will have to be scalarized, while the alternative code
59468   // sequence can be performed in vector form.
59469   bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
59470   return OptSize && !VT.isVector();
59471 }
59472
59473 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
59474   if (!Subtarget.is64Bit())
59475     return;
59476
59477   // Update IsSplitCSR in X86MachineFunctionInfo.
59478   X86MachineFunctionInfo *AFI =
59479       Entry->getParent()->getInfo<X86MachineFunctionInfo>();
59480   AFI->setIsSplitCSR(true);
59481 }
59482
59483 void X86TargetLowering::insertCopiesSplitCSR(
59484     MachineBasicBlock *Entry,
59485     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
59486   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
59487   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
59488   if (!IStart)
59489     return;
59490
59491   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
59492   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
59493   MachineBasicBlock::iterator MBBI = Entry->begin();
59494   for (const MCPhysReg *I = IStart; *I; ++I) {
59495     const TargetRegisterClass *RC = nullptr;
59496     if (X86::GR64RegClass.contains(*I))
59497       RC = &X86::GR64RegClass;
59498     else
59499       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
59500
59501     Register NewVR = MRI->createVirtualRegister(RC);
59502     // Create copy from CSR to a virtual register.
59503     // FIXME: this currently does not emit CFI pseudo-instructions, it works
59504     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
59505     // nounwind. If we want to generalize this later, we may need to emit
59506     // CFI pseudo-instructions.
59507     assert(
59508         Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
59509         "Function should be nounwind in insertCopiesSplitCSR!");
59510     Entry->addLiveIn(*I);
59511     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
59512         .addReg(*I);
59513
59514     // Insert the copy-back instructions right before the terminator.
59515     for (auto *Exit : Exits)
59516       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
59517               TII->get(TargetOpcode::COPY), *I)
59518           .addReg(NewVR);
59519   }
59520 }
59521
59522 bool X86TargetLowering::supportSwiftError() const {
59523   return Subtarget.is64Bit();
59524 }
59525
59526 MachineInstr *
59527 X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
59528                                  MachineBasicBlock::instr_iterator &MBBI,
59529                                  const TargetInstrInfo *TII) const {
59530   assert(MBBI->isCall() && MBBI->getCFIType() &&
59531          "Invalid call instruction for a KCFI check");
59532
59533   MachineFunction &MF = *MBB.getParent();
59534   // If the call target is a memory operand, unfold it and use R11 for the
59535   // call, so KCFI_CHECK won't have to recompute the address.
59536   switch (MBBI->getOpcode()) {
59537   case X86::CALL64m:
59538   case X86::CALL64m_NT:
59539   case X86::TAILJMPm64:
59540   case X86::TAILJMPm64_REX: {
59541     MachineBasicBlock::instr_iterator OrigCall = MBBI;
59542     SmallVector<MachineInstr *, 2> NewMIs;
59543     if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
59544                                   /*UnfoldStore=*/false, NewMIs))
59545       report_fatal_error("Failed to unfold memory operand for a KCFI check");
59546     for (auto *NewMI : NewMIs)
59547       MBBI = MBB.insert(OrigCall, NewMI);
59548     assert(MBBI->isCall() &&
59549            "Unexpected instruction after memory operand unfolding");
59550     if (OrigCall->shouldUpdateCallSiteInfo())
59551       MF.moveCallSiteInfo(&*OrigCall, &*MBBI);
59552     MBBI->setCFIType(MF, OrigCall->getCFIType());
59553     OrigCall->eraseFromParent();
59554     break;
59555   }
59556   default:
59557     break;
59558   }
59559
59560   MachineOperand &Target = MBBI->getOperand(0);
59561   Register TargetReg;
59562   switch (MBBI->getOpcode()) {
59563   case X86::CALL64r:
59564   case X86::CALL64r_NT:
59565   case X86::TAILJMPr64:
59566   case X86::TAILJMPr64_REX:
59567     assert(Target.isReg() && "Unexpected target operand for an indirect call");
59568     Target.setIsRenamable(false);
59569     TargetReg = Target.getReg();
59570     break;
59571   case X86::CALL64pcrel32:
59572   case X86::TAILJMPd64:
59573     assert(Target.isSymbol() && "Unexpected target operand for a direct call");
59574     // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
59575     // 64-bit indirect thunk calls.
59576     assert(StringRef(Target.getSymbolName()).endswith("_r11") &&
59577            "Unexpected register for an indirect thunk call");
59578     TargetReg = X86::R11;
59579     break;
59580   default:
59581     llvm_unreachable("Unexpected CFI call opcode");
59582     break;
59583   }
59584
59585   return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(X86::KCFI_CHECK))
59586       .addReg(TargetReg)
59587       .addImm(MBBI->getCFIType())
59588       .getInstr();
59589 }
59590
59591 /// Returns true if stack probing through a function call is requested.
59592 bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
59593   return !getStackProbeSymbolName(MF).empty();
59594 }
59595
59596 /// Returns true if stack probing through inline assembly is requested.
59597 bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
59598
59599   // No inline stack probe for Windows, they have their own mechanism.
59600   if (Subtarget.isOSWindows() ||
59601       MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
59602     return false;
59603
59604   // If the function specifically requests inline stack probes, emit them.
59605   if (MF.getFunction().hasFnAttribute("probe-stack"))
59606     return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
59607            "inline-asm";
59608
59609   return false;
59610 }
59611
59612 /// Returns the name of the symbol used to emit stack probes or the empty
59613 /// string if not applicable.
59614 StringRef
59615 X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
59616   // Inline Stack probes disable stack probe call
59617   if (hasInlineStackProbe(MF))
59618     return "";
59619
59620   // If the function specifically requests stack probes, emit them.
59621   if (MF.getFunction().hasFnAttribute("probe-stack"))
59622     return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
59623
59624   // Generally, if we aren't on Windows, the platform ABI does not include
59625   // support for stack probes, so don't emit them.
59626   if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
59627       MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
59628     return "";
59629
59630   // We need a stack probe to conform to the Windows ABI. Choose the right
59631   // symbol.
59632   if (Subtarget.is64Bit())
59633     return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
59634   return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
59635 }
59636
59637 unsigned
59638 X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
59639   // The default stack probe size is 4096 if the function has no stackprobesize
59640   // attribute.
59641   return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
59642                                                         4096);
59643 }
59644
59645 Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
59646   if (ML->isInnermost() &&
59647       ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
59648     return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
59649   return TargetLowering::getPrefLoopAlignment();
59650 }