//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
14 #include "X86ISelLowering.h"
15 #include "MCTargetDesc/X86ShuffleDecode.h"
16 #include "X86.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
22 #include "X86TargetMachine.h"
23 #include "X86TargetObjectFile.h"
24 #include "llvm/ADT/SmallBitVector.h"
25 #include "llvm/ADT/SmallSet.h"
26 #include "llvm/ADT/Statistic.h"
27 #include "llvm/ADT/StringExtras.h"
28 #include "llvm/ADT/StringSwitch.h"
29 #include "llvm/Analysis/BlockFrequencyInfo.h"
30 #include "llvm/Analysis/ObjCARCUtil.h"
31 #include "llvm/Analysis/ProfileSummaryInfo.h"
32 #include "llvm/Analysis/VectorUtils.h"
33 #include "llvm/CodeGen/IntrinsicLowering.h"
34 #include "llvm/CodeGen/MachineFrameInfo.h"
35 #include "llvm/CodeGen/MachineFunction.h"
36 #include "llvm/CodeGen/MachineInstrBuilder.h"
37 #include "llvm/CodeGen/MachineJumpTableInfo.h"
38 #include "llvm/CodeGen/MachineLoopInfo.h"
39 #include "llvm/CodeGen/MachineModuleInfo.h"
40 #include "llvm/CodeGen/MachineRegisterInfo.h"
41 #include "llvm/CodeGen/TargetLowering.h"
42 #include "llvm/CodeGen/WinEHFuncInfo.h"
43 #include "llvm/IR/CallingConv.h"
44 #include "llvm/IR/Constants.h"
45 #include "llvm/IR/DerivedTypes.h"
46 #include "llvm/IR/EHPersonalities.h"
47 #include "llvm/IR/Function.h"
48 #include "llvm/IR/GlobalAlias.h"
49 #include "llvm/IR/GlobalVariable.h"
50 #include "llvm/IR/IRBuilder.h"
51 #include "llvm/IR/Instructions.h"
52 #include "llvm/IR/Intrinsics.h"
53 #include "llvm/IR/PatternMatch.h"
54 #include "llvm/MC/MCAsmInfo.h"
55 #include "llvm/MC/MCContext.h"
56 #include "llvm/MC/MCExpr.h"
57 #include "llvm/MC/MCSymbol.h"
58 #include "llvm/Support/CommandLine.h"
59 #include "llvm/Support/Debug.h"
60 #include "llvm/Support/ErrorHandling.h"
61 #include "llvm/Support/KnownBits.h"
62 #include "llvm/Support/MathExtras.h"
63 #include "llvm/Target/TargetOptions.h"
64 #include <algorithm>
65 #include <bitset>
66 #include <cctype>
67 #include <numeric>
68 using namespace llvm;
#define DEBUG_TYPE "x86-isel"

static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
    "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
    cl::desc(
        "Sets the preferable loop alignment for experiments (as log2 bytes) "
        "for innermost loops only. If specified, this option overrides "
        "alignment set by x86-experimental-pref-loop-alignment."),
    cl::Hidden);

static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);

static cl::opt<bool> ExperimentalUnorderedISEL(
    "x86-experimental-unordered-atomic-isel", cl::init(false),
    cl::desc("Use LoadSDNode and StoreSDNode instead of "
             "AtomicSDNode for unordered atomic loads and "
             "stores respectively."),
    cl::Hidden);
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
  // Setup Windows compiler runtime calls.
  if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
      { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
      { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
      { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
      { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }
  // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size
  // to 32 bits so the AtomicExpandPass will expand it so we don't need
  // cmpxchg8b.
  // FIXME: Should we be limiting the atomic size on other configs? Default is
  // 1024.
  if (!Subtarget.canUseCMPXCHG8B())
    setMaxAtomicSizeInBitsSupported(32);

  setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);

  setMaxLargeFPConvertBitWidthSupported(128);
  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  // SETOEQ and SETUNE require checking two conditions.
  for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
    setCondCodeAction(ISD::SETOEQ, VT, Expand);
    setCondCodeAction(ISD::SETUNE, VT, Expand);
  }

  // Integer absolute.
  if (Subtarget.canUseCMOV()) {
    setOperationAction(ISD::ABS, MVT::i16, Custom);
    setOperationAction(ISD::ABS, MVT::i32, Custom);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::ABS, MVT::i64, Custom);
  }

  // Absolute difference.
  for (auto Op : {ISD::ABDS, ISD::ABDU}) {
    setOperationAction(Op, MVT::i8, Custom);
    setOperationAction(Op, MVT::i16, Custom);
    setOperationAction(Op, MVT::i32, Custom);
    if (Subtarget.is64Bit())
      setOperationAction(Op, MVT::i64, Custom);
  }

  // Signed saturation subtraction.
  setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
  setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
  setOperationAction(ISD::SSUBSAT, MVT::i32, Custom);
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SSUBSAT, MVT::i64, Custom);

  // Funnel shifts.
  for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
    // For slow shld targets we only lower for code size.
    LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;

    setOperationAction(ShiftOp, MVT::i8, Custom);
    setOperationAction(ShiftOp, MVT::i16, Custom);
    setOperationAction(ShiftOp, MVT::i32, ShiftDoubleAction);
    if (Subtarget.is64Bit())
      setOperationAction(ShiftOp, MVT::i64, ShiftDoubleAction);
  }
  if (!Subtarget.useSoftFloat()) {
    // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
    // operation.
    setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);

    // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
    // this operation.
    setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
    // SSE has no i16 to fp conversion, only i32. We promote in the handler
    // to allow f80 to use i16 and f64 to use i16 with sse1 only
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
    // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);

    // Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
    // this operation.
    setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
    // FIXME: This doesn't generate invalid exception when it should. PR44019.
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);

    // Handle FP_TO_UINT by promoting the destination to a larger signed
    // conversion.
    setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
    // FIXME: This doesn't generate invalid exception when it should. PR44019.
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
    // FIXME: This doesn't generate invalid exception when it should. PR44019.
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);

    setOperationAction(ISD::LRINT, MVT::f32, Custom);
    setOperationAction(ISD::LRINT, MVT::f64, Custom);
    setOperationAction(ISD::LLRINT, MVT::f32, Custom);
    setOperationAction(ISD::LLRINT, MVT::f64, Custom);

    if (!Subtarget.is64Bit()) {
      setOperationAction(ISD::LRINT, MVT::i64, Custom);
      setOperationAction(ISD::LLRINT, MVT::i64, Custom);
    }
  }
  if (Subtarget.hasSSE2()) {
    // Custom lowering for saturating float to int conversions.
    // We handle promotion to larger result types manually.
    for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
      setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
      setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
    }
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
      setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
    }
  }
  // Handle address space casts between mixed sized pointers.
  setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
  setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!Subtarget.hasSSE2()) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
    setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
    setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
    setOperationAction(ISD::GET_FPENV_MEM, MVT::Other, Custom);
    setOperationAction(ISD::SET_FPENV_MEM, MVT::Other, Custom);
    setOperationAction(ISD::RESET_FPENV, MVT::Other, Custom);
  }
  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
  // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
  // promote that too.
  setOperationPromotedToType(ISD::CTTZ, MVT::i16, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);

  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
      if (VT == MVT::i64 && !Subtarget.is64Bit())
        continue;
      setOperationAction(ISD::CTLZ, VT, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
    }
  }
  for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
                  ISD::STRICT_FP_TO_FP16}) {
    // Special handling for half-precision floating point conversions.
    // If we don't have F16C support, then lower half float conversions
    // into library calls.
    setOperationAction(
        Op, MVT::f32,
        (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
    // There's never any support for operations beyond MVT::f32.
    setOperationAction(Op, MVT::f64, Expand);
    setOperationAction(Op, MVT::f80, Expand);
    setOperationAction(Op, MVT::f128, Expand);
  }

  for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
    setTruncStoreAction(VT, MVT::f16, Expand);
    setTruncStoreAction(VT, MVT::bf16, Expand);

    setOperationAction(ISD::BF16_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_BF16, VT, Custom);
  }
  setOperationAction(ISD::PARITY, MVT::i8, Custom);
  setOperationAction(ISD::PARITY, MVT::i16, Custom);
  setOperationAction(ISD::PARITY, MVT::i32, Custom);
  if (Subtarget.is64Bit())
    setOperationAction(ISD::PARITY, MVT::i64, Custom);
  if (Subtarget.hasPOPCNT()) {
    setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
    // popcntw is longer to encode than popcntl and also has a false dependency
    // on the dest that popcntl hasn't had since Cannon Lake.
    setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
    else
      setOperationAction(ISD::CTPOP, MVT::i64, Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }

  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
  setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
  // Darwin ABI issue.
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }
  if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
    setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (!Subtarget.is64Bit())
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);

  if (Subtarget.canUseCMPXCHG16B())
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  if (Subtarget.isTargetPS())
    setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
  else
    setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
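  // Helper that applies one legalization action to the long tail of FP
  // operations for a half-precision type; note that FCOPYSIGN is always
  // Expand and SELECT is always Custom here, regardless of the requested
  // action.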
  auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
    setOperationAction(ISD::FABS, VT, Action);
    setOperationAction(ISD::FNEG, VT, Action);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FREM, VT, Action);
    setOperationAction(ISD::FMA, VT, Action);
    setOperationAction(ISD::FMINNUM, VT, Action);
    setOperationAction(ISD::FMAXNUM, VT, Action);
    setOperationAction(ISD::FMINIMUM, VT, Action);
    setOperationAction(ISD::FMAXIMUM, VT, Action);
    setOperationAction(ISD::FSIN, VT, Action);
    setOperationAction(ISD::FCOS, VT, Action);
    setOperationAction(ISD::FSINCOS, VT, Action);
    setOperationAction(ISD::FSQRT, VT, Action);
    setOperationAction(ISD::FPOW, VT, Action);
    setOperationAction(ISD::FLOG, VT, Action);
    setOperationAction(ISD::FLOG2, VT, Action);
    setOperationAction(ISD::FLOG10, VT, Action);
    setOperationAction(ISD::FEXP, VT, Action);
    setOperationAction(ISD::FEXP2, VT, Action);
    setOperationAction(ISD::FEXP10, VT, Action);
    setOperationAction(ISD::FCEIL, VT, Action);
    setOperationAction(ISD::FFLOOR, VT, Action);
    setOperationAction(ISD::FNEARBYINT, VT, Action);
    setOperationAction(ISD::FRINT, VT, Action);
    setOperationAction(ISD::BR_CC, VT, Action);
    setOperationAction(ISD::SETCC, VT, Action);
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SELECT_CC, VT, Action);
    setOperationAction(ISD::FROUND, VT, Action);
    setOperationAction(ISD::FROUNDEVEN, VT, Action);
    setOperationAction(ISD::FTRUNC, VT, Action);
    setOperationAction(ISD::FLDEXP, VT, Action);
  };
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    // f16, f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
                                                     : &X86::FR16RegClass);
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    // Disable f32->f64 extload as we can only generate this in one instruction
    // under optsize. So it's easier to pattern match (fpext (load)) for that
    // case instead of needing to emit 2 instructions for extload in the
    // non-optsize case.
    setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // These might be better off as horizontal vector ops.
      setOperationAction(ISD::FADD, VT, Custom);
      setOperationAction(ISD::FSUB, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Half type will be promoted by default.
    setF16Action(MVT::f16, Promote);
    setOperationAction(ISD::FADD, MVT::f16, Promote);
    setOperationAction(ISD::FSUB, MVT::f16, Promote);
    setOperationAction(ISD::FMUL, MVT::f16, Promote);
    setOperationAction(ISD::FDIV, MVT::f16, Promote);
    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);

    setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FLDEXP, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);

    setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
    setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
  } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
             (UseX87 || Is64Bit)) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    if (UseX87)
      addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    if (UseX87)
      setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    if (UseX87)
      setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    if (UseX87) {
      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }
  }
  // Expand FP32 immediates into loads from the stack, save special cases.
  if (isTypeLegal(MVT::f32)) {
    if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
      addLegalFPImmediate(APFloat(+0.0f)); // FLD0
      addLegalFPImmediate(APFloat(+1.0f)); // FLD1
      addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
      addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
    } else // SSE immediates.
      addLegalFPImmediate(APFloat(+0.0f)); // xorps
  }
  // Expand FP64 immediates into loads from the stack, save special cases.
  if (isTypeLegal(MVT::f64)) {
    if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
      addLegalFPImmediate(APFloat(+0.0)); // FLD0
      addLegalFPImmediate(APFloat(+1.0)); // FLD1
      addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
      addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    } else // SSE immediates.
      addLegalFPImmediate(APFloat(+0.0)); // xorpd
  }
  // Support fp16 0 immediate.
  if (isTypeLegal(MVT::f16))
    addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
  // Handle constrained floating-point operations of scalar.
  setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);
  // f80 always uses X87.
  if (UseX87) {
    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN, MVT::f80, Expand);
    setOperationAction(ISD::FCOS, MVT::f80, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
    setOperationAction(ISD::LROUND, MVT::f80, Expand);
    setOperationAction(ISD::LLROUND, MVT::f80, Expand);
    setOperationAction(ISD::LRINT, MVT::f80, Custom);
    setOperationAction(ISD::LLRINT, MVT::f80, Custom);

    // Handle constrained floating-point operations of scalar.
    setOperationAction(ISD::STRICT_FADD, MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FSUB, MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FMUL, MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FDIV, MVT::f80, Legal);
    setOperationAction(ISD::STRICT_FSQRT, MVT::f80, Legal);
    if (isTypeLegal(MVT::f16)) {
      setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
    } else {
      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
    }
    // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
    // as Custom.
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
  }
  // f128 uses xmm registers, but most operations require libcalls.
  if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                   : &X86::VR128RegClass);

    addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps

    setOperationAction(ISD::FADD, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
    setOperationAction(ISD::FSUB, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
    setOperationAction(ISD::FDIV, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
    setOperationAction(ISD::FMUL, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
    setOperationAction(ISD::FMA, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);

    setOperationAction(ISD::FABS, MVT::f128, Custom);
    setOperationAction(ISD::FNEG, MVT::f128, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);

    setOperationAction(ISD::FSIN, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
    setOperationAction(ISD::FCOS, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
    setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
    // No STRICT_FSINCOS
    setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
    setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);

    setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
    // We need to custom handle any FP_ROUND with an f128 input, but
    // LegalizeDAG uses the result type to know when to run a custom handler.
    // So we have to list all legal floating point result types here.
    if (isTypeLegal(MVT::f32)) {
      setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
    }
    if (isTypeLegal(MVT::f64)) {
      setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
    }
    if (isTypeLegal(MVT::f80)) {
      setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
    }

    setOperationAction(ISD::SETCC, MVT::f128, Custom);

    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
    setTruncStoreAction(MVT::f128, MVT::f32, Expand);
    setTruncStoreAction(MVT::f128, MVT::f64, Expand);
    setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  }
  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FEXP10, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
                   MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FEXP10, VT, Expand);
  }
  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16 ||
          VT.getVectorElementType() == MVT::bf16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
    setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);

    setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
    setOperationAction(ISD::STORE, MVT::v2f32, Custom);

    setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::FMAXIMUM, VT, Custom);
      setOperationAction(ISD::FMINIMUM, VT, Custom);
    }

    for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
                     MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
      setOperationAction(ISD::SDIV, VT, Custom);
      setOperationAction(ISD::SREM, VT, Custom);
      setOperationAction(ISD::UDIV, VT, Custom);
      setOperationAction(ISD::UREM, VT, Custom);
    }

    setOperationAction(ISD::MUL, MVT::v2i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i8, Custom);
    setOperationAction(ISD::MUL, MVT::v8i8, Custom);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
    setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);

    setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
    setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
    setOperationAction(ISD::UMULO, MVT::v2i32, Custom);

    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
    }

    setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
    setOperationAction(ISD::ABDS, MVT::v16i8, Custom);
    setOperationAction(ISD::ABDU, MVT::v8i16, Custom);
    setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
    setOperationAction(ISD::ABDU, MVT::v4i32, Custom);
    setOperationAction(ISD::ABDS, MVT::v4i32, Custom);

    setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
    setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
    setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
    setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
    setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
    setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
    setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
    setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
    setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
    setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::ABS, VT, Custom);

      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
      // setcc all the way to isel and prefer SETGT in some isel patterns.
      setCondCodeAction(ISD::SETLT, VT, Custom);
      setCondCodeAction(ISD::SETLE, VT, Custom);
    }

    setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
    setOperationAction(ISD::STRICT_FSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::STRICT_FSETCC, MVT::v4f32, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::v2f64, Custom);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
    for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
    setF16Action(MVT::v8f16, Expand);
    setOperationAction(ISD::FADD, MVT::v8f16, Expand);
    setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
    setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
    setOperationAction(ISD::FDIV, MVT::v8f16, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
    setOperationAction(ISD::SELECT, MVT::v16i8, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);

    // Custom legalize these to avoid over promotion or custom promotion.
    for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
      setOperationAction(ISD::FP_TO_SINT, VT, Custom);
      setOperationAction(ISD::FP_TO_UINT, VT, Custom);
      setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
      setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
    }

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
    // We want to legalize this to an f64 load rather than an i64 load on
    // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
    // store.
    setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
    setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
    setOperationAction(ISD::STORE, MVT::v2i32, Custom);
    setOperationAction(ISD::STORE, MVT::v4i16, Custom);
    setOperationAction(ISD::STORE, MVT::v8i8, Custom);

    // Add 32-bit vector stores to help vectorization opportunities.
    setOperationAction(ISD::STORE, MVT::v2i16, Custom);
    setOperationAction(ISD::STORE, MVT::v4i8, Custom);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
    if (!Subtarget.hasAVX512())
      setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);

    setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v2i64, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i64, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      if (VT == MVT::v2i64) continue;
      setOperationAction(ISD::ROTL, VT, Custom);
      setOperationAction(ISD::ROTR, VT, Custom);
      setOperationAction(ISD::FSHL, VT, Custom);
      setOperationAction(ISD::FSHR, VT, Custom);
    }

    setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);

    // These might be better off as horizontal vector ops.
    setOperationAction(ISD::ADD, MVT::i16, Custom);
    setOperationAction(ISD::ADD, MVT::i32, Custom);
    setOperationAction(ISD::SUB, MVT::i16, Custom);
    setOperationAction(ISD::SUB, MVT::i32, Custom);
  }
1262 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1263 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1264 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1265 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1266 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1267 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1268 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1269 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1270 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1271 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1272 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1273 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1274 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1275 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1277 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1280 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1281 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1282 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1283 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1284 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1285 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1286 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1287 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1289 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1290 setOperationAction(ISD::ABDS, VT, Custom);
1291 setOperationAction(ISD::ABDU, VT, Custom);
1294 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1295 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1296 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1298 // FIXME: Do we need to handle scalar-to-vector here?
1299 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1300 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1302 // We directly match byte blends in the backend as they match the VSELECT
1303 // condition form.
1304 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1306 // SSE41 brings specific instructions for doing vector sign extend even in
1307 // cases where we don't have SRA.
1308 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1309 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1310 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1313 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
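// For example, a sign-extending load of <8 x i8> into v8i16 can be selected
// as a single memory-operand PMOVSXBW instead of a load plus shuffles.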
1314 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1315 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1316 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1317 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1318 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1319 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1320 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1323 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1324 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1325 // do the pre and post work in the vector domain.
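// Each i64 lane is extracted to a GPR and converted with the 64-bit form of
// (V)CVTSI2SS, which is why this path is gated on is64Bit(); only the
// unsigned-specific fix-up around the converts stays in the vector domain.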
1326 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1327 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1328 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1329 // so that DAG combine doesn't try to turn it into uint_to_fp.
1330 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1331 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1335 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1336 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1339 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1340 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1341 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1342 setOperationAction(ISD::ROTL, VT, Custom);
1343 setOperationAction(ISD::ROTR, VT, Custom);
1346 // XOP can efficiently perform BITREVERSE with VPPERM.
1347 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1348 setOperationAction(ISD::BITREVERSE, VT, Custom);
1350 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1351 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1352 setOperationAction(ISD::BITREVERSE, VT, Custom);
1355 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1356 bool HasInt256 = Subtarget.hasInt256();
1358 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1359 : &X86::VR256RegClass);
1360 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1361 : &X86::VR256RegClass);
1362 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1363 : &X86::VR256RegClass);
1364 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1365 : &X86::VR256RegClass);
1366 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1367 : &X86::VR256RegClass);
1368 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1369 : &X86::VR256RegClass);
1370 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1371 : &X86::VR256RegClass);
1373 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1374 setOperationAction(ISD::FFLOOR, VT, Legal);
1375 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1376 setOperationAction(ISD::FCEIL, VT, Legal);
1377 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1378 setOperationAction(ISD::FTRUNC, VT, Legal);
1379 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1380 setOperationAction(ISD::FRINT, VT, Legal);
1381 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1382 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1383 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1384 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1385 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1387 setOperationAction(ISD::FROUND, VT, Custom);
1389 setOperationAction(ISD::FNEG, VT, Custom);
1390 setOperationAction(ISD::FABS, VT, Custom);
1391 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1393 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1394 setOperationAction(ISD::FMINIMUM, VT, Custom);
1397 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1398 // even though v8i16 is a legal type.
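// Promotion here means (v8i16 (fp_to_sint v8f32 X)) is legalized as a v8i32
// fp_to_sint (a single VCVTTPS2DQ) followed by a truncate back to v8i16.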
1399 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1400 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1401 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1402 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1403 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
1404 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1405 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
1407 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
1408 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
1409 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1410 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
1411 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1412 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
1414 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1415 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1416 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1417 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1418 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1419 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1420 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1421 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1422 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1423 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1424 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1426 if (!Subtarget.hasAVX512())
1427 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1429 // In the customized shift lowering, the legal v8i32/v4i64 cases
1430 // in AVX2 will be recognized.
1431 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1432 setOperationAction(ISD::SRL, VT, Custom);
1433 setOperationAction(ISD::SHL, VT, Custom);
1434 setOperationAction(ISD::SRA, VT, Custom);
1435 setOperationAction(ISD::ABDS, VT, Custom);
1436 setOperationAction(ISD::ABDU, VT, Custom);
1437 if (VT == MVT::v4i64) continue;
1438 setOperationAction(ISD::ROTL, VT, Custom);
1439 setOperationAction(ISD::ROTR, VT, Custom);
1440 setOperationAction(ISD::FSHL, VT, Custom);
1441 setOperationAction(ISD::FSHR, VT, Custom);
1444 // These types need custom splitting if their input is a 128-bit vector.
1445 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1446 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1447 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1448 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1450 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1451 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1452 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1453 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1454 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1455 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1456 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1458 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1459 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1460 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1461 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1464 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1465 setOperationAction(ISD::TRUNCATE, MVT::v32i16, Custom);
1466 setOperationAction(ISD::TRUNCATE, MVT::v32i32, Custom);
1467 setOperationAction(ISD::TRUNCATE, MVT::v32i64, Custom);
1469 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1471 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1472 setOperationAction(ISD::SETCC, VT, Custom);
1473 setOperationAction(ISD::CTPOP, VT, Custom);
1474 setOperationAction(ISD::CTLZ, VT, Custom);
1476 // The condition codes aren't legal in SSE/AVX, and under AVX512 we use
1477 // setcc all the way to isel and prefer SETGT in some isel patterns.
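// For example, a v8i32 SETLT has its operands commuted into a SETGT so it
// can be emitted as (V)PCMPGTD, since SSE/AVX only provide greater-than and
// equality integer compares.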
1478 setCondCodeAction(ISD::SETLT, VT, Custom);
1479 setCondCodeAction(ISD::SETLE, VT, Custom);
1482 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1483 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1484 setOperationAction(ISD::STRICT_FSETCC, MVT::v4f64, Custom);
1485 setOperationAction(ISD::STRICT_FSETCC, MVT::v8f32, Custom);
1486 setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f64, Custom);
1487 setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f32, Custom);
1489 if (Subtarget.hasAnyFMA()) {
1490 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1491 MVT::v2f64, MVT::v4f64 }) {
1492 setOperationAction(ISD::FMA, VT, Legal);
1493 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1497 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1498 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1499 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1502 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1503 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1504 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1505 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1507 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1508 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1509 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1510 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1511 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1512 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1513 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1514 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1516 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1517 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1519 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1520 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1521 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1522 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1523 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1525 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1526 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1527 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1528 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1529 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1530 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1531 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1532 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1533 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1534 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1535 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1536 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1538 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1539 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1540 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1541 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1542 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1543 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1546 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1547 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1548 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1551 if (HasInt256) {
1552 // The custom lowering of UINT_TO_FP for v8i32 becomes profitable
1553 // once we have a 256-bit-wide blend with immediate.
1554 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1555 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1557 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1558 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1559 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1560 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1561 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1562 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1563 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1564 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1568 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1569 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1570 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1571 setOperationAction(ISD::MSTORE, VT, Legal);
1574 // Extract subvector is special because the value type
1575 // (result) is 128-bit but the source is 256-bit wide.
1576 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1577 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1578 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1581 // Custom lower several nodes for 256-bit types.
1582 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1583 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1584 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1585 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1586 setOperationAction(ISD::VSELECT, VT, Custom);
1587 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1588 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1589 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1590 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1591 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1592 setOperationAction(ISD::STORE, VT, Custom);
1594 setF16Action(MVT::v16f16, Expand);
1595 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1596 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1597 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1598 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1600 if (HasInt256) {
1601 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1603 // Custom legalize 2x32 to get a little better code.
1604 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1605 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1607 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1608 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1609 setOperationAction(ISD::MGATHER, VT, Custom);
1613 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1614 Subtarget.hasF16C()) {
1615 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1616 setOperationAction(ISD::FP_ROUND, VT, Custom);
1617 setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
1619 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1620 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1621 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
1623 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1624 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1625 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1629 // This block controls legalization of the mask vector sizes that are
1630 // available with AVX512. 512-bit vectors are in a separate block controlled
1631 // by useAVX512Regs.
1632 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1633 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1634 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1635 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1636 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1637 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1639 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1640 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1641 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1643 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1644 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1645 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1646 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1647 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1648 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1649 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1650 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1651 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1652 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1653 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1654 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1656 // There is no byte-sized k-register load or store without AVX512DQ.
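// Without DQ only KMOVW exists (KMOVB requires AVX512DQ), so these narrow
// mask loads and stores are custom lowered, roughly by going through a
// 16-bit mask move plus a byte-sized integer access.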
1657 if (!Subtarget.hasDQI()) {
1658 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1659 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1660 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1661 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1663 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1664 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1665 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1666 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1669 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1670 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1671 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1672 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1673 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1676 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1677 setOperationAction(ISD::VSELECT, VT, Expand);
1679 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1680 setOperationAction(ISD::SETCC, VT, Custom);
1681 setOperationAction(ISD::SELECT, VT, Custom);
1682 setOperationAction(ISD::TRUNCATE, VT, Custom);
1684 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1685 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1686 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1687 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1688 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1689 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1692 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1693 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1696 // This block controls legalization for 512-bit operations with 8/16/32/64-bit
1697 // elements. 512-bit operations can be disabled via the prefer-vector-width and
1698 // required-vector-width function attributes.
1699 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1700 bool HasBWI = Subtarget.hasBWI();
1702 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1703 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1704 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1705 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1706 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1707 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1708 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1710 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1711 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1712 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1713 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1714 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1715 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1716 if (HasBWI)
1717 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1720 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1721 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1722 setOperationAction(ISD::FMINIMUM, VT, Custom);
1723 setOperationAction(ISD::FNEG, VT, Custom);
1724 setOperationAction(ISD::FABS, VT, Custom);
1725 setOperationAction(ISD::FMA, VT, Legal);
1726 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1727 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1730 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1731 setOperationPromotedToType(ISD::FP_TO_SINT, VT, MVT::v16i32);
1732 setOperationPromotedToType(ISD::FP_TO_UINT, VT, MVT::v16i32);
1733 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1734 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1737 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1738 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1739 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1740 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1741 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1744 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
1745 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
1746 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
1747 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
1748 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1749 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
1751 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1752 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1753 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1754 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1755 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1756 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1757 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1758 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1759 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1760 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1761 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1763 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1764 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1765 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1766 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1767 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1768 if (HasBWI)
1769 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1771 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1772 // to 512-bit rather than use the AVX2 instructions so that we can use
1773 // k-masks.
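// For example, without VLX a masked v8f32 load is widened to v16f32 with the
// mask zero-extended to v16i1, so the 512-bit k-masked instruction can be
// used instead of the AVX2 VMASKMOVPS form.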
1774 if (!Subtarget.hasVLX()) {
1775 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1776 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1777 setOperationAction(ISD::MLOAD, VT, Custom);
1778 setOperationAction(ISD::MSTORE, VT, Custom);
1782 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1783 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1784 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1785 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1786 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1787 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1788 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1789 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1790 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1791 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1792 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1793 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1795 if (HasBWI) {
1796 // Extends from v64i1 masks to 512-bit vectors.
1797 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1798 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1799 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1802 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1803 setOperationAction(ISD::FFLOOR, VT, Legal);
1804 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1805 setOperationAction(ISD::FCEIL, VT, Legal);
1806 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1807 setOperationAction(ISD::FTRUNC, VT, Legal);
1808 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1809 setOperationAction(ISD::FRINT, VT, Legal);
1810 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1811 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1812 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1813 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1814 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1816 setOperationAction(ISD::FROUND, VT, Custom);
1819 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1820 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1821 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1824 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1825 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1826 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1827 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1829 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1830 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1831 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1832 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1834 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1835 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1836 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1837 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1838 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1839 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1840 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1841 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1843 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1844 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1846 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1848 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1849 setOperationAction(ISD::SRL, VT, Custom);
1850 setOperationAction(ISD::SHL, VT, Custom);
1851 setOperationAction(ISD::SRA, VT, Custom);
1852 setOperationAction(ISD::ROTL, VT, Custom);
1853 setOperationAction(ISD::ROTR, VT, Custom);
1854 setOperationAction(ISD::SETCC, VT, Custom);
1855 setOperationAction(ISD::ABDS, VT, Custom);
1856 setOperationAction(ISD::ABDU, VT, Custom);
1858 // The condition codes aren't legal in SSE/AVX, and under AVX512 we use
1859 // setcc all the way to isel and prefer SETGT in some isel patterns.
1860 setCondCodeAction(ISD::SETLT, VT, Custom);
1861 setCondCodeAction(ISD::SETLE, VT, Custom);
1864 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1865 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1866 setOperationAction(ISD::STRICT_FSETCC, MVT::v8f64, Custom);
1867 setOperationAction(ISD::STRICT_FSETCC, MVT::v16f32, Custom);
1868 setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f64, Custom);
1869 setOperationAction(ISD::STRICT_FSETCCS, MVT::v16f32, Custom);
1871 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1872 setOperationAction(ISD::SMAX, VT, Legal);
1873 setOperationAction(ISD::UMAX, VT, Legal);
1874 setOperationAction(ISD::SMIN, VT, Legal);
1875 setOperationAction(ISD::UMIN, VT, Legal);
1876 setOperationAction(ISD::ABS, VT, Legal);
1877 setOperationAction(ISD::CTPOP, VT, Custom);
1880 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1881 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1882 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1883 setOperationAction(ISD::CTLZ, VT, Custom);
1884 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1885 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1886 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1887 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1888 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1889 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1890 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1891 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1894 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1895 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1896 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1897 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1898 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1899 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1901 if (Subtarget.hasDQI()) {
1902 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1903 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
1904 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1905 setOperationAction(Opc, MVT::v8i64, Custom);
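// AVX512DQ also provides VPMULLQ, giving a native v8i64 multiply instead of
// the 32x32-bit multiply/shift/add expansion used without it.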
1906 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1909 if (Subtarget.hasCDI()) {
1910 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
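// CDI provides VPLZCNTD/VPLZCNTQ, so e.g. a v4i32 CTLZ without VLX is widened
// to v16i32, counted with VPLZCNTD, and the low lanes extracted back out.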
1911 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1912 setOperationAction(ISD::CTLZ, VT, Legal);
1914 } // Subtarget.hasCDI()
1916 if (Subtarget.hasVPOPCNTDQ()) {
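// VPOPCNTDQ adds VPOPCNTD/VPOPCNTQ, a native per-lane population count for
// 32/64-bit elements, so CTPOP no longer needs the byte-LUT expansion.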
1917 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1918 setOperationAction(ISD::CTPOP, VT, Legal);
1921 // Extract subvector is special because the value type
1922 // (result) is 256-bit but the source is 512-bit wide.
1923 // 128-bit was made Legal under AVX1.
1924 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1925 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1926 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1928 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1929 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1930 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1931 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1932 setOperationAction(ISD::SELECT, VT, Custom);
1933 setOperationAction(ISD::VSELECT, VT, Custom);
1934 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1935 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1936 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1937 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1938 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1940 setF16Action(MVT::v32f16, Expand);
1941 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
1942 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
1943 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
1944 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Custom);
1945 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1946 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1947 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
1950 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1951 setOperationAction(ISD::MLOAD, VT, Legal);
1952 setOperationAction(ISD::MSTORE, VT, Legal);
1953 setOperationAction(ISD::MGATHER, VT, Custom);
1954 setOperationAction(ISD::MSCATTER, VT, Custom);
1956 if (HasBWI) {
1957 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1958 setOperationAction(ISD::MLOAD, VT, Legal);
1959 setOperationAction(ISD::MSTORE, VT, Legal);
1961 } else {
1962 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1963 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1966 if (Subtarget.hasVBMI2()) {
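// VBMI2 adds the VPSHLDV/VPSHRDV family, which implement funnel shifts with
// per-element variable amounts for 16/32/64-bit lanes; the Custom lowering
// below can then select them directly.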
1967 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1968 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1969 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1970 setOperationAction(ISD::FSHL, VT, Custom);
1971 setOperationAction(ISD::FSHR, VT, Custom);
1974 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1975 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1976 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1977 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1979 } // useAVX512Regs
1981 // This block controls legalization for operations that don't have
1982 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1983 // narrower widths.
1984 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1985 // These operations are handled on non-VLX by artificially widening in
1986 // isel patterns.
1988 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
1989 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
1990 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1992 if (Subtarget.hasDQI()) {
1993 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1994 // v2f32 UINT_TO_FP is already custom under SSE2.
1995 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1996 isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1997 "Unexpected operation action!");
1998 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1999 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
2000 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
2001 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
2002 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
2005 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2006 setOperationAction(ISD::SMAX, VT, Legal);
2007 setOperationAction(ISD::UMAX, VT, Legal);
2008 setOperationAction(ISD::SMIN, VT, Legal);
2009 setOperationAction(ISD::UMIN, VT, Legal);
2010 setOperationAction(ISD::ABS, VT, Legal);
2013 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2014 setOperationAction(ISD::ROTL, VT, Custom);
2015 setOperationAction(ISD::ROTR, VT, Custom);
2018 // Custom legalize 2x32 to get a little better code.
2019 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2020 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2022 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2023 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2024 setOperationAction(ISD::MSCATTER, VT, Custom);
2026 if (Subtarget.hasDQI()) {
2027 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
2028 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
2029 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
2030 setOperationAction(Opc, MVT::v2i64, Custom);
2031 setOperationAction(Opc, MVT::v4i64, Custom);
2033 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2034 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2037 if (Subtarget.hasCDI()) {
2038 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2039 setOperationAction(ISD::CTLZ, VT, Legal);
2041 } // Subtarget.hasCDI()
2043 if (Subtarget.hasVPOPCNTDQ()) {
2044 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2045 setOperationAction(ISD::CTPOP, VT, Legal);
2049 // This block controls legalization of v32i1/v64i1, which are available with
2050 // AVX512BW.
2051 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2052 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2053 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2055 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2056 setOperationAction(ISD::VSELECT, VT, Expand);
2057 setOperationAction(ISD::TRUNCATE, VT, Custom);
2058 setOperationAction(ISD::SETCC, VT, Custom);
2059 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2060 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
2061 setOperationAction(ISD::SELECT, VT, Custom);
2062 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2063 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2064 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
2065 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
2068 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2069 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
2071 // Extends from v32i1 masks to 256-bit vectors.
2072 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
2073 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
2074 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
2076 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2077 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2078 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2081 // These operations are handled on non-VLX by artificially widening in
2082 // isel patterns.
2083 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2085 if (Subtarget.hasBITALG()) {
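// BITALG adds VPOPCNTB/VPOPCNTW, so byte and word CTPOP become single
// instructions for these types.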
2086 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2087 setOperationAction(ISD::CTPOP, VT, Legal);
2091 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2092 auto setGroup = [&] (MVT VT) {
2093 setOperationAction(ISD::FADD, VT, Legal);
2094 setOperationAction(ISD::STRICT_FADD, VT, Legal);
2095 setOperationAction(ISD::FSUB, VT, Legal);
2096 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
2097 setOperationAction(ISD::FMUL, VT, Legal);
2098 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
2099 setOperationAction(ISD::FDIV, VT, Legal);
2100 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
2101 setOperationAction(ISD::FSQRT, VT, Legal);
2102 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
2104 setOperationAction(ISD::FFLOOR, VT, Legal);
2105 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
2106 setOperationAction(ISD::FCEIL, VT, Legal);
2107 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
2108 setOperationAction(ISD::FTRUNC, VT, Legal);
2109 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
2110 setOperationAction(ISD::FRINT, VT, Legal);
2111 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
2112 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2113 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
2115 setOperationAction(ISD::FROUND, VT, Custom);
2117 setOperationAction(ISD::LOAD, VT, Legal);
2118 setOperationAction(ISD::STORE, VT, Legal);
2120 setOperationAction(ISD::FMA, VT, Legal);
2121 setOperationAction(ISD::STRICT_FMA, VT, Legal);
2122 setOperationAction(ISD::VSELECT, VT, Legal);
2123 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2124 setOperationAction(ISD::SELECT, VT, Custom);
2126 setOperationAction(ISD::FNEG, VT, Custom);
2127 setOperationAction(ISD::FABS, VT, Custom);
2128 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
2129 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2130 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2132 setOperationAction(ISD::SETCC, VT, Custom);
2133 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
2134 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
2137 // AVX512_FP16 scalar operations
2138 setGroup(MVT::f16);
2139 setOperationAction(ISD::FREM, MVT::f16, Promote);
2140 setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
2141 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
2142 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2143 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
2144 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2145 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
2146 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
2147 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
2148 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2149 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2150 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2151 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
2153 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
2154 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
2156 if (Subtarget.useAVX512Regs()) {
2157 setGroup(MVT::v32f16);
2158 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
2159 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
2160 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
2161 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
2162 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
2163 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2164 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
2165 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2166 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
2167 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
2168 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
2169 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
2171 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
2172 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
2173 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
2174 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
2175 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2176 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2177 MVT::v32i16);
2178 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2179 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2180 MVT::v32i16);
2181 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2182 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2183 MVT::v32i16);
2184 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2185 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2186 MVT::v32i16);
2188 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
2189 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2190 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2192 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2193 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2196 if (Subtarget.hasVLX()) {
2197 setGroup(MVT::v8f16);
2198 setGroup(MVT::v16f16);
2200 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2201 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2202 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2203 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2204 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2205 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2206 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2207 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2208 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2209 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2211 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2212 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2213 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2214 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2215 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
2216 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2217 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
2218 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2219 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
2220 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
2222 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2223 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2224 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2226 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2227 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2228 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2230 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2231 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2232 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2233 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2235 // Need to custom widen these to prevent scalarization.
2236 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2237 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2241 if (!Subtarget.useSoftFloat() &&
2242 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2243 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2244 : &X86::VR128RegClass);
2245 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2246 : &X86::VR256RegClass);
2247 // We set the type action of bf16 to TypeSoftPromoteHalf, but soft promotion
2248 // does not cover BUILD_VECTOR and INSERT_VECTOR_ELT, so mark them Custom
2249 // and perform the promotion in the custom lowering instead.
2250 setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom);
2251 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::bf16, Custom);
2252 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2253 setF16Action(VT, Expand);
2254 setOperationAction(ISD::FADD, VT, Expand);
2255 setOperationAction(ISD::FSUB, VT, Expand);
2256 setOperationAction(ISD::FMUL, VT, Expand);
2257 setOperationAction(ISD::FDIV, VT, Expand);
2258 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2259 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2261 setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
2262 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2265 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2266 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2267 setF16Action(MVT::v32bf16, Expand);
2268 setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
2269 setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
2270 setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
2271 setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
2272 setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
2273 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2274 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
2277 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2278 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2279 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2280 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2281 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2282 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2284 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2285 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2286 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2287 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2288 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2290 if (Subtarget.hasBWI()) {
2291 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2292 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2295 if (Subtarget.hasFP16()) {
2296 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2297 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2298 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2299 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2300 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2301 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2302 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2303 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2304 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2305 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2306 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2307 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2308 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2309 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2310 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2311 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2312 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2313 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2314 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2315 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2316 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2317 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2318 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2319 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2320 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2321 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2322 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2323 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2327 if (Subtarget.hasAMXTILE()) {
2328 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2331 // We want to custom lower some of our intrinsics.
2332 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2333 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2334 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2335 if (!Subtarget.is64Bit()) {
2336 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2339 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2340 // handle type legalization for these operations here.
2342 // FIXME: We really should do custom legalization for addition and
2343 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2344 // than generic legalization for 64-bit multiplication-with-overflow, though.
2345 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2346 if (VT == MVT::i64 && !Subtarget.is64Bit())
2347 continue;
2348 // Add/Sub/Mul with overflow operations are custom lowered.
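// Roughly, an i32 SADDO becomes an ADD that also defines EFLAGS, with the
// overflow bit read back via SETO, rather than the generic expansion.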
2349 setOperationAction(ISD::SADDO, VT, Custom);
2350 setOperationAction(ISD::UADDO, VT, Custom);
2351 setOperationAction(ISD::SSUBO, VT, Custom);
2352 setOperationAction(ISD::USUBO, VT, Custom);
2353 setOperationAction(ISD::SMULO, VT, Custom);
2354 setOperationAction(ISD::UMULO, VT, Custom);
2356 // Support carry-in as a value rather than glue.
2357 setOperationAction(ISD::UADDO_CARRY, VT, Custom);
2358 setOperationAction(ISD::USUBO_CARRY, VT, Custom);
2359 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2360 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2361 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2364 if (!Subtarget.is64Bit()) {
2366 // These libcalls are not available in 32-bit mode.
2366 setLibcallName(RTLIB::SHL_I128, nullptr);
2367 setLibcallName(RTLIB::SRL_I128, nullptr);
2368 setLibcallName(RTLIB::SRA_I128, nullptr);
2369 setLibcallName(RTLIB::MUL_I128, nullptr);
2370 // The MULO libcall is not part of libgcc, only compiler-rt.
2371 setLibcallName(RTLIB::MULO_I64, nullptr);
2373 // The MULO libcall is not part of libgcc, only compiler-rt.
2374 setLibcallName(RTLIB::MULO_I128, nullptr);
2376 // Combine sin / cos into _sincos_stret if it is available.
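// For example, on Darwin separate calls to sinf(x) and cosf(x) with the same
// argument can be merged into one __sincosf_stret call returning both values.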
2377 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2378 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2379 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2380 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2383 if (Subtarget.isTargetWin64()) {
2384 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2385 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2386 setOperationAction(ISD::SREM, MVT::i128, Custom);
2387 setOperationAction(ISD::UREM, MVT::i128, Custom);
2388 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2389 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2390 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2391 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2392 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2393 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2394 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2395 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2398 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2399 // is. We should promote the value to 64 bits to solve this.
2400 // This is what the CRT headers do - `fmodf` is an inline header
2401 // function casting to f64 and calling `fmod`.
2402 if (Subtarget.is32Bit() &&
2403 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2404 for (ISD::NodeType Op :
2405 {ISD::FCEIL, ISD::STRICT_FCEIL,
2406 ISD::FCOS, ISD::STRICT_FCOS,
2407 ISD::FEXP, ISD::STRICT_FEXP,
2408 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2409 ISD::FREM, ISD::STRICT_FREM,
2410 ISD::FLOG, ISD::STRICT_FLOG,
2411 ISD::FLOG10, ISD::STRICT_FLOG10,
2412 ISD::FPOW, ISD::STRICT_FPOW,
2413 ISD::FSIN, ISD::STRICT_FSIN})
2414 if (isOperationExpand(Op, MVT::f32))
2415 setOperationAction(Op, MVT::f32, Promote);
2417 // We have target-specific dag combine patterns for the following nodes:
2418 setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2419 ISD::SCALAR_TO_VECTOR,
2420 ISD::INSERT_VECTOR_ELT,
2421 ISD::EXTRACT_VECTOR_ELT,
2422 ISD::CONCAT_VECTORS,
2423 ISD::INSERT_SUBVECTOR,
2424 ISD::EXTRACT_SUBVECTOR,
2425 ISD::BITCAST,
2426 ISD::VSELECT,
2427 ISD::SELECT,
2428 ISD::SHL,
2429 ISD::SRA,
2430 ISD::SRL,
2431 ISD::OR,
2432 ISD::AND,
2433 ISD::ADD,
2434 ISD::FADD,
2435 ISD::FSUB,
2436 ISD::FNEG,
2437 ISD::FMA,
2438 ISD::STRICT_FMA,
2439 ISD::FMINNUM,
2440 ISD::FMAXNUM,
2441 ISD::SUB,
2442 ISD::LOAD,
2443 ISD::MLOAD,
2444 ISD::STORE,
2445 ISD::MSTORE,
2446 ISD::TRUNCATE,
2447 ISD::ZERO_EXTEND,
2448 ISD::ANY_EXTEND,
2449 ISD::SIGN_EXTEND,
2450 ISD::SIGN_EXTEND_INREG,
2451 ISD::ANY_EXTEND_VECTOR_INREG,
2452 ISD::SIGN_EXTEND_VECTOR_INREG,
2453 ISD::ZERO_EXTEND_VECTOR_INREG,
2454 ISD::SINT_TO_FP,
2455 ISD::UINT_TO_FP,
2456 ISD::STRICT_SINT_TO_FP,
2457 ISD::STRICT_UINT_TO_FP,
2458 ISD::SETCC,
2459 ISD::MUL,
2460 ISD::XOR,
2461 ISD::MSCATTER,
2462 ISD::MGATHER,
2463 ISD::FP16_TO_FP,
2464 ISD::FP_EXTEND,
2465 ISD::STRICT_FP_EXTEND,
2466 ISD::FP_ROUND,
2467 ISD::STRICT_FP_ROUND});
2469 computeRegisterProperties(Subtarget.getRegisterInfo());
2471 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2472 MaxStoresPerMemsetOptSize = 8;
2473 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2474 MaxStoresPerMemcpyOptSize = 4;
2475 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2476 MaxStoresPerMemmoveOptSize = 4;
2478 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2479 // that needs to be benchmarked and balanced with the potential use of vector
2480 // load/store types (PR33329, PR33914).
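// With a limit of 2 loads, an equality check like memcmp(a, b, 16) == 0 can
// still be expanded inline into two 8-byte load pairs combined with XOR/OR,
// rather than a libcall.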
2481 MaxLoadsPerMemcmp = 2;
2482 MaxLoadsPerMemcmpOptSize = 2;
2484 // Default loop alignment, which can be overridden by -align-loops.
2485 setPrefLoopAlignment(Align(16));
2487 // An out-of-order CPU can speculatively execute past a predictable branch,
2488 // but a conditional move could be stalled by an expensive earlier operation.
2489 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2490 EnableExtLdPromotion = true;
2491 setPrefFunctionAlignment(Align(16));
2493 verifyIntrinsicTables();
2495 // Default to having -disable-strictnode-mutation on
2496 IsStrictFPEnabled = true;
2499 // This has so far only been implemented for 64-bit MachO.
2500 bool X86TargetLowering::useLoadStackGuardNode() const {
2501 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2504 bool X86TargetLowering::useStackGuardXorFP() const {
2505 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2506 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2509 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2510 const SDLoc &DL) const {
2511 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2512 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2513 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2514 return SDValue(Node, 0);
2517 TargetLoweringBase::LegalizeTypeAction
2518 X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2519 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2520 !Subtarget.hasBWI())
2521 return TypeSplitVector;
2523 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2524 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2525 return TypeSplitVector;
2527 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2528 VT.getVectorElementType() != MVT::i1)
2529 return TypeWidenVector;
2531 return TargetLoweringBase::getPreferredVectorAction(VT);
2534 FastISel *
2535 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2536 const TargetLibraryInfo *libInfo) const {
2537 return X86::createFastISel(funcInfo, libInfo);
2540 //===----------------------------------------------------------------------===//
2541 // Other Lowering Hooks
2542 //===----------------------------------------------------------------------===//
2544 bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
2545 bool AssumeSingleUse) {
2546 if (!AssumeSingleUse && !Op.hasOneUse())
2547 return false;
2548 if (!ISD::isNormalLoad(Op.getNode()))
2549 return false;
2551 // If this is an unaligned vector, make sure the target supports folding it.
2552 auto *Ld = cast<LoadSDNode>(Op.getNode());
2553 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2554 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2555 return false;
2557 // TODO: If this is a non-temporal load and the target has an instruction
2558 // for it, it should not be folded. See "useNonTemporalLoad()".
2560 return true;
2563 bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
2564 const X86Subtarget &Subtarget,
2565 bool AssumeSingleUse) {
2566 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2567 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2568 return false;
2570 // We cannot replace a wide volatile load with a broadcast-from-memory,
2571 // because that would narrow the load, which isn't legal for volatiles.
2572 auto *Ld = cast<LoadSDNode>(Op.getNode());
2573 return !Ld->isVolatile() ||
2574 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2577 bool X86::mayFoldIntoStore(SDValue Op) {
2578 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2581 bool X86::mayFoldIntoZeroExtend(SDValue Op) {
2582 if (Op.hasOneUse()) {
2583 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
2584 return (ISD::ZERO_EXTEND == Opcode);
2586 return false;
2589 static bool isTargetShuffle(unsigned Opcode) {
2590 switch(Opcode) {
2591 default: return false;
2592 case X86ISD::BLENDI:
2593 case X86ISD::PSHUFB:
2594 case X86ISD::PSHUFD:
2595 case X86ISD::PSHUFHW:
2596 case X86ISD::PSHUFLW:
2597 case X86ISD::SHUFP:
2598 case X86ISD::INSERTPS:
2599 case X86ISD::EXTRQI:
2600 case X86ISD::INSERTQI:
2601 case X86ISD::VALIGN:
2602 case X86ISD::PALIGNR:
2603 case X86ISD::VSHLDQ:
2604 case X86ISD::VSRLDQ:
2605 case X86ISD::MOVLHPS:
2606 case X86ISD::MOVHLPS:
2607 case X86ISD::MOVSHDUP:
2608 case X86ISD::MOVSLDUP:
2609 case X86ISD::MOVDDUP:
2610 case X86ISD::MOVSS:
2611 case X86ISD::MOVSD:
2612 case X86ISD::MOVSH:
2613 case X86ISD::UNPCKL:
2614 case X86ISD::UNPCKH:
2615 case X86ISD::VBROADCAST:
2616 case X86ISD::VPERMILPI:
2617 case X86ISD::VPERMILPV:
2618 case X86ISD::VPERM2X128:
2619 case X86ISD::SHUF128:
2620 case X86ISD::VPERMIL2:
2621 case X86ISD::VPERMI:
2622 case X86ISD::VPPERM:
2623 case X86ISD::VPERMV:
2624 case X86ISD::VPERMV3:
2625 case X86ISD::VZEXT_MOVL:
2626 return true;
2630 static bool isTargetShuffleVariableMask(unsigned Opcode) {
2631 switch (Opcode) {
2632 default: return false;
2633 // Target Shuffles.
2634 case X86ISD::PSHUFB:
2635 case X86ISD::VPERMILPV:
2636 case X86ISD::VPERMIL2:
2637 case X86ISD::VPPERM:
2638 case X86ISD::VPERMV:
2639 case X86ISD::VPERMV3:
2640 return true;
2641 // 'Faux' Target Shuffles.
2642 case ISD::OR:
2643 case ISD::AND:
2644 case X86ISD::ANDNP:
2645 return true;
2649 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
2650 MachineFunction &MF = DAG.getMachineFunction();
2651 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2652 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2653 int ReturnAddrIndex = FuncInfo->getRAIndex();
2655 if (ReturnAddrIndex == 0) {
2656 // Set up a frame object for the return address.
2657 unsigned SlotSize = RegInfo->getSlotSize();
2658 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2659 -(int64_t)SlotSize,
2660 false);
2661 FuncInfo->setRAIndex(ReturnAddrIndex);
2664 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2667 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2668 bool hasSymbolicDisplacement) {
2669 // Offset should fit into 32 bit immediate field.
2670 if (!isInt<32>(Offset))
2671 return false;
2673 // If we don't have a symbolic displacement, we don't have any extra
2674 // restrictions.
2675 if (!hasSymbolicDisplacement)
2676 return true;
2678 // FIXME: Some tweaks might be needed for medium code model.
2679 if (M != CodeModel::Small && M != CodeModel::Kernel)
2680 return false;
2682 // For the small code model we assume that the last object ends 16MB before the
2683 // 31-bit boundary. We may also accept fairly large negative constants, knowing
2684 // that all objects are in the positive half of the address space.
2685 if (M == CodeModel::Small && Offset < 16*1024*1024)
2686 return true;
2688 // For the kernel code model we know that all objects reside in the negative half
2689 // of the 32-bit address space, so we must not accept negative offsets (they may
2690 // be just out of range), but we may accept fairly large positive ones.
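// For example, with the kernel code model an offset of 16*1024*1024 is accepted
// (positive and fits in 32 bits), while an offset of -8 is rejected below.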
2691 if (M == CodeModel::Kernel && Offset >= 0)
2692 return true;
2694 return false;
2697 /// Return true if the condition is a signed comparison operation.
2698 static bool isX86CCSigned(unsigned X86CC) {
2699 switch (X86CC) {
2700 default:
2701 llvm_unreachable("Invalid integer condition!");
2702 case X86::COND_E:
2703 case X86::COND_NE:
2704 case X86::COND_B:
2705 case X86::COND_A:
2706 case X86::COND_BE:
2707 case X86::COND_AE:
2708 return false;
2709 case X86::COND_G:
2710 case X86::COND_GE:
2711 case X86::COND_L:
2712 case X86::COND_LE:
2713 return true;
2717 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
2718 switch (SetCCOpcode) {
2719 default: llvm_unreachable("Invalid integer condition!");
2720 case ISD::SETEQ: return X86::COND_E;
2721 case ISD::SETGT: return X86::COND_G;
2722 case ISD::SETGE: return X86::COND_GE;
2723 case ISD::SETLT: return X86::COND_L;
2724 case ISD::SETLE: return X86::COND_LE;
2725 case ISD::SETNE: return X86::COND_NE;
2726 case ISD::SETULT: return X86::COND_B;
2727 case ISD::SETUGT: return X86::COND_A;
2728 case ISD::SETULE: return X86::COND_BE;
2729 case ISD::SETUGE: return X86::COND_AE;
2733 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2734 /// condition code, returning the condition code and the LHS/RHS of the
2735 /// comparison to make.
2736 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
2737 bool isFP, SDValue &LHS, SDValue &RHS,
2738 SelectionDAG &DAG) {
2739 if (!isFP) {
2740 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2741 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2742 // X > -1 -> X == 0, jump !sign.
2743 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2744 return X86::COND_NS;
2746 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2747 // X < 0 -> X == 0, jump on sign.
2748 return X86::COND_S;
2750 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
2751 // X >= 0 -> X == 0, jump on !sign.
2752 return X86::COND_NS;
2754 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
2755 // X < 1 -> X <= 0
2756 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2757 return X86::COND_LE;
2761 return TranslateIntegerX86CC(SetCCOpcode);
2764 // First determine if it is required or is profitable to flip the operands.
2766 // If LHS is a foldable load, but RHS is not, flip the condition.
2767 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
2768 !ISD::isNON_EXTLoad(RHS.getNode())) {
2769 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2770 std::swap(LHS, RHS);
2773 switch (SetCCOpcode) {
2774 default: break;
2775 case ISD::SETOLT:
2776 case ISD::SETOLE:
2777 case ISD::SETUGT:
2778 case ISD::SETUGE:
2779 std::swap(LHS, RHS);
2780 break;
2783 // On a floating point condition, the flags are set as follows:
2784 // ZF PF CF op
2785 // 0 | 0 | 0 | X > Y
2786 // 0 | 0 | 1 | X < Y
2787 // 1 | 0 | 0 | X == Y
2788 // 1 | 1 | 1 | unordered
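// For example, ISD::SETOLT had its operands swapped above, so it is handled as
// an ordered greater-than compare and maps to X86::COND_A (ZF == 0 && CF == 0).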
2789 switch (SetCCOpcode) {
2790 default: llvm_unreachable("Condcode should be pre-legalized away");
2791 case ISD::SETUEQ:
2792 case ISD::SETEQ: return X86::COND_E;
2793 case ISD::SETOLT: // flipped
2794 case ISD::SETOGT:
2795 case ISD::SETGT: return X86::COND_A;
2796 case ISD::SETOLE: // flipped
2797 case ISD::SETOGE:
2798 case ISD::SETGE: return X86::COND_AE;
2799 case ISD::SETUGT: // flipped
2800 case ISD::SETULT:
2801 case ISD::SETLT: return X86::COND_B;
2802 case ISD::SETUGE: // flipped
2803 case ISD::SETULE:
2804 case ISD::SETLE: return X86::COND_BE;
2805 case ISD::SETONE:
2806 case ISD::SETNE: return X86::COND_NE;
2807 case ISD::SETUO: return X86::COND_P;
2808 case ISD::SETO: return X86::COND_NP;
2809 case ISD::SETOEQ:
2810 case ISD::SETUNE: return X86::COND_INVALID;
2814 /// Is there a floating point cmov for the specific X86 condition code?
2815 /// The current x86 ISA includes the following FP cmov instructions:
2816 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2817 static bool hasFPCMov(unsigned X86CC) {
2818 switch (X86CC) {
2819 default:
2820 return false;
2821 case X86::COND_B:
2822 case X86::COND_BE:
2823 case X86::COND_E:
2824 case X86::COND_P:
2825 case X86::COND_A:
2826 case X86::COND_AE:
2827 case X86::COND_NE:
2828 case X86::COND_NP:
2829 return true;
2833 static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
2834 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
2835 VT.is512BitVector();
2838 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
2839 const CallInst &I,
2840 MachineFunction &MF,
2841 unsigned Intrinsic) const {
2842 Info.flags = MachineMemOperand::MONone;
2843 Info.offset = 0;
2845 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
2846 if (!IntrData) {
2847 switch (Intrinsic) {
2848 case Intrinsic::x86_aesenc128kl:
2849 case Intrinsic::x86_aesdec128kl:
2850 Info.opc = ISD::INTRINSIC_W_CHAIN;
2851 Info.ptrVal = I.getArgOperand(1);
2852 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2853 Info.align = Align(1);
2854 Info.flags |= MachineMemOperand::MOLoad;
2855 return true;
2856 case Intrinsic::x86_aesenc256kl:
2857 case Intrinsic::x86_aesdec256kl:
2858 Info.opc = ISD::INTRINSIC_W_CHAIN;
2859 Info.ptrVal = I.getArgOperand(1);
2860 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2861 Info.align = Align(1);
2862 Info.flags |= MachineMemOperand::MOLoad;
2863 return true;
2864 case Intrinsic::x86_aesencwide128kl:
2865 case Intrinsic::x86_aesdecwide128kl:
2866 Info.opc = ISD::INTRINSIC_W_CHAIN;
2867 Info.ptrVal = I.getArgOperand(0);
2868 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2869 Info.align = Align(1);
2870 Info.flags |= MachineMemOperand::MOLoad;
2871 return true;
2872 case Intrinsic::x86_aesencwide256kl:
2873 case Intrinsic::x86_aesdecwide256kl:
2874 Info.opc = ISD::INTRINSIC_W_CHAIN;
2875 Info.ptrVal = I.getArgOperand(0);
2876 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2877 Info.align = Align(1);
2878 Info.flags |= MachineMemOperand::MOLoad;
2879 return true;
2880 case Intrinsic::x86_cmpccxadd32:
2881 case Intrinsic::x86_cmpccxadd64:
2882 case Intrinsic::x86_atomic_bts:
2883 case Intrinsic::x86_atomic_btc:
2884 case Intrinsic::x86_atomic_btr: {
2885 Info.opc = ISD::INTRINSIC_W_CHAIN;
2886 Info.ptrVal = I.getArgOperand(0);
2887 unsigned Size = I.getType()->getScalarSizeInBits();
2888 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2889 Info.align = Align(Size);
2890 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
2891 MachineMemOperand::MOVolatile;
2892 return true;
2894 case Intrinsic::x86_atomic_bts_rm:
2895 case Intrinsic::x86_atomic_btc_rm:
2896 case Intrinsic::x86_atomic_btr_rm: {
2897 Info.opc = ISD::INTRINSIC_W_CHAIN;
2898 Info.ptrVal = I.getArgOperand(0);
2899 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
2900 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2901 Info.align = Align(Size);
2902 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
2903 MachineMemOperand::MOVolatile;
2904 return true;
2906 case Intrinsic::x86_aadd32:
2907 case Intrinsic::x86_aadd64:
2908 case Intrinsic::x86_aand32:
2909 case Intrinsic::x86_aand64:
2910 case Intrinsic::x86_aor32:
2911 case Intrinsic::x86_aor64:
2912 case Intrinsic::x86_axor32:
2913 case Intrinsic::x86_axor64:
2914 case Intrinsic::x86_atomic_add_cc:
2915 case Intrinsic::x86_atomic_sub_cc:
2916 case Intrinsic::x86_atomic_or_cc:
2917 case Intrinsic::x86_atomic_and_cc:
2918 case Intrinsic::x86_atomic_xor_cc: {
2919 Info.opc = ISD::INTRINSIC_W_CHAIN;
2920 Info.ptrVal = I.getArgOperand(0);
2921 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
2922 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2923 Info.align = Align(Size);
2924 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
2925 MachineMemOperand::MOVolatile;
2926 return true;
2929 return false;
2932 switch (IntrData->Type) {
2933 case TRUNCATE_TO_MEM_VI8:
2934 case TRUNCATE_TO_MEM_VI16:
2935 case TRUNCATE_TO_MEM_VI32: {
2936 Info.opc = ISD::INTRINSIC_VOID;
2937 Info.ptrVal = I.getArgOperand(0);
2938 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
2939 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
2940 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
2941 ScalarVT = MVT::i8;
2942 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
2943 ScalarVT = MVT::i16;
2944 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
2945 ScalarVT = MVT::i32;
2947 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
2948 Info.align = Align(1);
2949 Info.flags |= MachineMemOperand::MOStore;
2950 break;
2952 case GATHER:
2953 case GATHER_AVX2: {
2954 Info.opc = ISD::INTRINSIC_W_CHAIN;
2955 Info.ptrVal = nullptr;
2956 MVT DataVT = MVT::getVT(I.getType());
2957 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
2958 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
2959 IndexVT.getVectorNumElements());
2960 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
2961 Info.align = Align(1);
2962 Info.flags |= MachineMemOperand::MOLoad;
2963 break;
2965 case SCATTER: {
2966 Info.opc = ISD::INTRINSIC_VOID;
2967 Info.ptrVal = nullptr;
2968 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
2969 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
2970 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
2971 IndexVT.getVectorNumElements());
2972 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
2973 Info.align = Align(1);
2974 Info.flags |= MachineMemOperand::MOStore;
2975 break;
2977 default:
2978 return false;
2981 return true;
2984 /// Returns true if the target can instruction select the
2985 /// specified FP immediate natively. If false, the legalizer will
2986 /// materialize the FP immediate as a load from a constant pool.
2987 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
2988 bool ForCodeSize) const {
2989 for (const APFloat &FPImm : LegalFPImmediates)
2990 if (Imm.bitwiseIsEqual(FPImm))
2991 return true;
2992 return false;
2995 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
2996 ISD::LoadExtType ExtTy,
2997 EVT NewVT) const {
2998 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3000 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3001 // relocations target a movq or addq instruction: don't let the load shrink.
3002 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3003 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3004 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3005 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3007 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
3008 // those uses are extracted directly into a store, then the extract + store
3009 // can be store-folded. Therefore, it's probably not worth splitting the load.
3010 EVT VT = Load->getValueType(0);
3011 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3012 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
3013 // Skip uses of the chain value. Result 0 of the node is the load value.
3014 if (UI.getUse().getResNo() != 0)
3015 continue;
3017 // If this use is not an extract + store, it's probably worth splitting.
3018 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
3019 UI->use_begin()->getOpcode() != ISD::STORE)
3020 return true;
3022 // All non-chain uses are extract + store.
3023 return false;
3026 return true;
3029 /// Returns true if it is beneficial to convert a load of a constant
3030 /// to just the constant itself.
3031 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3032 Type *Ty) const {
3033 assert(Ty->isIntegerTy());
3035 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3036 if (BitSize == 0 || BitSize > 64)
3037 return false;
3038 return true;
3041 bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
3042 // If we are using XMM registers in the ABI and the condition of the select is
3043 // a floating-point compare and we have blendv or conditional move, then it is
3044 // cheaper to select instead of doing a cross-register move and creating a
3045 // load that depends on the compare result.
3046 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3047 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3050 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
3051 // TODO: It might be a win to ease or lift this restriction, but the generic
3052 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3053 if (VT.isVector() && Subtarget.hasAVX512())
3054 return false;
3056 return true;
3059 bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
3060 SDValue C) const {
3061 // TODO: We handle scalars using custom code, but generic combining could make
3062 // that unnecessary.
3063 APInt MulC;
3064 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3065 return false;
3067 // Find the type this will be legalized to. Otherwise we might prematurely
3068 // convert this to shl+add/sub and then still have to type legalize those ops.
3069 // Another choice would be to defer the decision for illegal types until
3070 // after type legalization. But constant splat vectors of i64 can't make it
3071 // through type legalization on 32-bit targets so we would need to special
3072 // case vXi64.
3073 while (getTypeAction(Context, VT) != TypeLegal)
3074 VT = getTypeToTransformTo(Context, VT);
3076 // If vector multiply is legal, assume that's faster than shl + add/sub.
3077 // Multiply is a complex op with higher latency and lower throughput in
3078 // most implementations: sub-vXi32 vector multiplies are always fast,
3079 // vXi32 must not have a slow PMULLD implementation, and anything larger (vXi64)
3080 // is always going to be slow.
3081 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3082 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3083 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3084 return false;
3086 // shl+add, shl+sub, shl+add+neg
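// For example, MulC values of 9 (8 + 1), 7 (8 - 1), -7 (1 - 8) and -9 (-(8 + 1))
// all qualify.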
3087 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3088 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3091 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
3092 unsigned Index) const {
3093 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3094 return false;
3096 // Mask vectors support all subregister combinations and operations that
3097 // extract half of the vector.
3098 if (ResVT.getVectorElementType() == MVT::i1)
3099 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3100 (Index == ResVT.getVectorNumElements()));
3102 return (Index % ResVT.getVectorNumElements()) == 0;
3105 bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
3106 unsigned Opc = VecOp.getOpcode();
3108 // Assume target opcodes can't be scalarized.
3109 // TODO - do we have any exceptions?
3110 if (Opc >= ISD::BUILTIN_OP_END)
3111 return false;
3113 // If the vector op is not supported, try to convert to scalar.
3114 EVT VecVT = VecOp.getValueType();
3115 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
3116 return true;
3118 // If the vector op is supported, but the scalar op is not, the transform may
3119 // not be worthwhile.
3120 EVT ScalarVT = VecVT.getScalarType();
3121 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3124 bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
3125 bool) const {
3126 // TODO: Allow vectors?
3127 if (VT.isVector())
3128 return false;
3129 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3132 bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
3133 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
3134 return Subtarget.hasBMI() ||
3135 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
3138 bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
3139 // Speculate ctlz only if we can directly use LZCNT.
3140 return Subtarget.hasLZCNT();
3143 bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
3144 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3145 // expensive than a straight movsd. On the other hand, it's important to
3146 // shrink long double FP constants since fldt is very slow.
3147 return !Subtarget.hasSSE2() || VT == MVT::f80;
3150 bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
3151 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3152 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3155 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
3156 const SelectionDAG &DAG,
3157 const MachineMemOperand &MMO) const {
3158 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3159 BitcastVT.getVectorElementType() == MVT::i1)
3160 return false;
3162 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3163 return false;
3165 // If both types are legal vectors, it's always ok to convert them.
3166 if (LoadVT.isVector() && BitcastVT.isVector() &&
3167 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3168 return true;
3170 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3173 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
3174 const MachineFunction &MF) const {
3175 // Do not merge to float value size (128 bits) if no implicit
3176 // float attribute is set.
3177 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3179 if (NoFloat) {
3180 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3181 return (MemVT.getSizeInBits() <= MaxIntSize);
3183 // Make sure we don't merge greater than our preferred vector
3184 // width.
3185 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3186 return false;
3188 return true;
3191 bool X86TargetLowering::isCtlzFast() const {
3192 return Subtarget.hasFastLZCNT();
3195 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
3196 const Instruction &AndI) const {
3197 return true;
3200 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
3201 EVT VT = Y.getValueType();
3203 if (VT.isVector())
3204 return false;
3206 if (!Subtarget.hasBMI())
3207 return false;
3209 // There are only 32-bit and 64-bit forms for 'andn'.
3210 if (VT != MVT::i32 && VT != MVT::i64)
3211 return false;
3213 return !isa<ConstantSDNode>(Y);
3216 bool X86TargetLowering::hasAndNot(SDValue Y) const {
3217 EVT VT = Y.getValueType();
3219 if (!VT.isVector())
3220 return hasAndNotCompare(Y);
3222 // Vector.
3224 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3225 return false;
3227 if (VT == MVT::v4i32)
3228 return true;
3230 return Subtarget.hasSSE2();
3233 bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
3234 return X.getValueType().isScalarInteger(); // 'bt'
3237 bool X86TargetLowering::
3238 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
3239 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
3240 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3241 SelectionDAG &DAG) const {
3242 // Does the baseline recommend not performing the fold by default?
3243 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
3244 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3245 return false;
3246 // For scalars this transform is always beneficial.
3247 if (X.getValueType().isScalarInteger())
3248 return true;
3249 // If all the shift amounts are identical, then the transform is beneficial even
3250 // with rudimentary SSE2 shifts.
3251 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3252 return true;
3253 // If we have AVX2 with its powerful shift operations, then it's also good.
3254 if (Subtarget.hasAVX2())
3255 return true;
3256 // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
3257 return NewShiftOpcode == ISD::SHL;
3260 bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
3261 return N->getOpcode() != ISD::FP_EXTEND;
3264 bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
3265 const SDNode *N, CombineLevel Level) const {
3266 assert(((N->getOpcode() == ISD::SHL &&
3267 N->getOperand(0).getOpcode() == ISD::SRL) ||
3268 (N->getOpcode() == ISD::SRL &&
3269 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3270 "Expected shift-shift mask");
3271 // TODO: Should we always create i64 masks? Or only folded immediates?
3272 EVT VT = N->getValueType(0);
3273 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3274 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3275 // Only fold if the shift values are equal - so it folds to AND.
3276 // TODO - we should fold if either is a non-uniform vector but we don't do
3277 // the fold for non-splats yet.
3278 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3280 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
3283 bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
3284 EVT VT = Y.getValueType();
3286 // For vectors, we don't have a preference, but we probably want a mask.
3287 if (VT.isVector())
3288 return false;
3290 // 64-bit shifts on 32-bit targets produce extremely bloated code.
3291 if (VT == MVT::i64 && !Subtarget.is64Bit())
3292 return false;
3294 return true;
3297 TargetLowering::ShiftLegalizationStrategy
3298 X86TargetLowering::preferredShiftLegalizationStrategy(
3299 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3300 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
3301 !Subtarget.isOSWindows())
3302 return ShiftLegalizationStrategy::LowerToLibcall;
3303 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
3304 ExpansionFactor);
3307 bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
3308 // Any legal vector type can be splatted more efficiently than
3309 // loading/spilling from memory.
3310 return isTypeLegal(VT);
3313 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
3314 MVT VT = MVT::getIntegerVT(NumBits);
3315 if (isTypeLegal(VT))
3316 return VT;
3318 // PMOVMSKB can handle this.
3319 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3320 return MVT::v16i8;
3322 // VPMOVMSKB can handle this.
3323 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3324 return MVT::v32i8;
3326 // TODO: Allow 64-bit type for 32-bit target.
3327 // TODO: 512-bit types should be allowed, but make sure that those
3328 // cases are handled in combineVectorSizedSetCCEquality().
3330 return MVT::INVALID_SIMPLE_VALUE_TYPE;
3333 /// Val is the undef sentinel value or equal to the specified value.
3334 static bool isUndefOrEqual(int Val, int CmpVal) {
3335 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3338 /// Return true if every element in Mask is the undef sentinel value or equal to
3339 /// the specified value.
3340 static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3341 return llvm::all_of(Mask, [CmpVal](int M) {
3342 return (M == SM_SentinelUndef) || (M == CmpVal);
3346 /// Return true if every element in Mask, beginning from position Pos and ending
3347 /// in Pos+Size is the undef sentinel value or equal to the specified value.
3348 static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3349 unsigned Size) {
3350 return llvm::all_of(Mask.slice(Pos, Size),
3351 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3354 /// Val is either the undef or zero sentinel value.
3355 static bool isUndefOrZero(int Val) {
3356 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3359 /// Return true if every element in Mask, beginning from position Pos and ending
3360 /// in Pos+Size is the undef sentinel value.
3361 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3362 return llvm::all_of(Mask.slice(Pos, Size),
3363 [](int M) { return M == SM_SentinelUndef; });
3366 /// Return true if the mask creates a vector whose lower half is undefined.
3367 static bool isUndefLowerHalf(ArrayRef<int> Mask) {
3368 unsigned NumElts = Mask.size();
3369 return isUndefInRange(Mask, 0, NumElts / 2);
3372 /// Return true if the mask creates a vector whose upper half is undefined.
3373 static bool isUndefUpperHalf(ArrayRef<int> Mask) {
3374 unsigned NumElts = Mask.size();
3375 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3378 /// Return true if Val falls within the specified range [Low, Hi).
3379 static bool isInRange(int Val, int Low, int Hi) {
3380 return (Val >= Low && Val < Hi);
3383 /// Return true if the value of any element in Mask falls within the specified
3384 /// range [Low, Hi).
3385 static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3386 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3389 /// Return true if the value of any element in Mask is the zero sentinel value.
3390 static bool isAnyZero(ArrayRef<int> Mask) {
3391 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3394 /// Return true if the value of any element in Mask is the zero or undef
3395 /// sentinel values.
3396 static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
3397 return llvm::any_of(Mask, [](int M) {
3398 return M == SM_SentinelZero || M == SM_SentinelUndef;
3402 /// Return true if Val is undef or if its value falls within the
3403 /// specified range [Low, Hi).
3404 static bool isUndefOrInRange(int Val, int Low, int Hi) {
3405 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3408 /// Return true if every element in Mask is undef or if its value
3409 /// falls within the specified range [Low, Hi).
3410 static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3411 return llvm::all_of(
3412 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3415 /// Return true if Val is undef, zero or if its value falls within the
3416 /// specified range [Low, Hi).
3417 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3418 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3421 /// Return true if every element in Mask is undef, zero or if its value
3422 /// falls within the specified range [Low, Hi).
3423 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3424 return llvm::all_of(
3425 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3428 /// Return true if every element in Mask, beginning
3429 /// from position Pos and ending in Pos + Size, falls within the specified
3430 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
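/// For example, Mask = <4, -1, 6, 7> with Pos = 0, Size = 4, Low = 4 returns
/// true, since the undef element (-1) matches any value.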
3431 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3432 unsigned Size, int Low, int Step = 1) {
3433 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3434 if (!isUndefOrEqual(Mask[i], Low))
3435 return false;
3436 return true;
3439 /// Return true if every element in Mask, beginning
3440 /// from position Pos and ending in Pos+Size, falls within the specified
3441 /// sequential range [Low, Low + Size), or is undef or is zero.
3442 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3443 unsigned Size, int Low,
3444 int Step = 1) {
3445 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3446 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3447 return false;
3448 return true;
3451 /// Return true if every element in Mask, beginning
3452 /// from position Pos and ending in Pos+Size is undef or is zero.
3453 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3454 unsigned Size) {
3455 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3458 /// Helper function to test whether a shuffle mask could be
3459 /// simplified by widening the elements being shuffled.
3461 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3462 /// leaves it in an unspecified state.
3464 /// NOTE: This must handle normal vector shuffle masks and *target* vector
3465 /// shuffle masks. The latter have the special property of a '-2' representing
3466 /// a zero-ed lane of a vector.
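/// For example, the mask <0, 1, 6, 7> widens to <0, 3>, while <1, 2, 4, 5>
/// cannot be widened because <1, 2> is not an aligned pair.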
3467 static bool canWidenShuffleElements(ArrayRef<int> Mask,
3468 SmallVectorImpl<int> &WidenedMask) {
3469 WidenedMask.assign(Mask.size() / 2, 0);
3470 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3471 int M0 = Mask[i];
3472 int M1 = Mask[i + 1];
3474 // If both elements are undef, it's trivial.
3475 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3476 WidenedMask[i / 2] = SM_SentinelUndef;
3477 continue;
3480 // Check for an undef mask and a mask value properly aligned to fit with
3481 // a pair of values. If we find such a case, use the non-undef mask's value.
3482 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3483 WidenedMask[i / 2] = M1 / 2;
3484 continue;
3486 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3487 WidenedMask[i / 2] = M0 / 2;
3488 continue;
3491 // When zeroing, we need to spread the zeroing across both lanes to widen.
3492 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3493 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3494 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
3495 WidenedMask[i / 2] = SM_SentinelZero;
3496 continue;
3498 return false;
3501 // Finally check if the two mask values are adjacent and aligned with
3502 // a pair.
3503 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3504 WidenedMask[i / 2] = M0 / 2;
3505 continue;
3508 // Otherwise we can't safely widen the elements used in this shuffle.
3509 return false;
3511 assert(WidenedMask.size() == Mask.size() / 2 &&
3512 "Incorrect size of mask after widening the elements!");
3514 return true;
3517 static bool canWidenShuffleElements(ArrayRef<int> Mask,
3518 const APInt &Zeroable,
3519 bool V2IsZero,
3520 SmallVectorImpl<int> &WidenedMask) {
3521 // Create an alternative mask with info about zeroable elements.
3522 // Here we do not set undef elements as zeroable.
3523 SmallVector<int, 64> ZeroableMask(Mask);
3524 if (V2IsZero) {
3525 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3526 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3527 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3528 ZeroableMask[i] = SM_SentinelZero;
3530 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3533 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
3534 SmallVector<int, 32> WidenedMask;
3535 return canWidenShuffleElements(Mask, WidenedMask);
3538 // Attempt to narrow/widen shuffle mask until it matches the target number of
3539 // elements.
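// For example, the 4-element mask <0, 1, 2, 3> scaled to 8 elements becomes
// <0, 1, 2, 3, 4, 5, 6, 7>, and scaled to 2 elements it widens to <0, 1>.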
3540 static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3541 SmallVectorImpl<int> &ScaledMask) {
3542 unsigned NumSrcElts = Mask.size();
3543 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3544 "Illegal shuffle scale factor");
3546 // Narrowing is guaranteed to work.
3547 if (NumDstElts >= NumSrcElts) {
3548 int Scale = NumDstElts / NumSrcElts;
3549 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3550 return true;
3553 // We have to repeat the widening until we reach the target size, but we can
3554 // split out the first widening as it sets up ScaledMask for us.
3555 if (canWidenShuffleElements(Mask, ScaledMask)) {
3556 while (ScaledMask.size() > NumDstElts) {
3557 SmallVector<int, 16> WidenedMask;
3558 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3559 return false;
3560 ScaledMask = std::move(WidenedMask);
3562 return true;
3565 return false;
3568 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
3569 bool X86::isZeroNode(SDValue Elt) {
3570 return isNullConstant(Elt) || isNullFPConstant(Elt);
3573 // Build a vector of constants.
3574 // Use an UNDEF node if MaskElt == -1.
3575 // Split 64-bit constants in 32-bit mode.
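// For example, on a 32-bit target (where i64 is not legal) a v2i64 constant is
// built as a v4i32 BUILD_VECTOR and bitcast back to v2i64.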
3576 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
3577 const SDLoc &dl, bool IsMask = false) {
3579 SmallVector<SDValue, 32> Ops;
3580 bool Split = false;
3582 MVT ConstVecVT = VT;
3583 unsigned NumElts = VT.getVectorNumElements();
3584 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3585 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3586 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3587 Split = true;
3590 MVT EltVT = ConstVecVT.getVectorElementType();
3591 for (unsigned i = 0; i < NumElts; ++i) {
3592 bool IsUndef = Values[i] < 0 && IsMask;
3593 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
3594 DAG.getConstant(Values[i], dl, EltVT);
3595 Ops.push_back(OpNode);
3596 if (Split)
3597 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
3598 DAG.getConstant(0, dl, EltVT));
3600 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3601 if (Split)
3602 ConstsNode = DAG.getBitcast(VT, ConstsNode);
3603 return ConstsNode;
3606 static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
3607 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
3608 assert(Bits.size() == Undefs.getBitWidth() &&
3609 "Unequal constant and undef arrays");
3610 SmallVector<SDValue, 32> Ops;
3611 bool Split = false;
3613 MVT ConstVecVT = VT;
3614 unsigned NumElts = VT.getVectorNumElements();
3615 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3616 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3617 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3618 Split = true;
3621 MVT EltVT = ConstVecVT.getVectorElementType();
3622 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
3623 if (Undefs[i]) {
3624 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
3625 continue;
3627 const APInt &V = Bits[i];
3628 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
3629 if (Split) {
3630 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
3631 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
3632 } else if (EltVT == MVT::f32) {
3633 APFloat FV(APFloat::IEEEsingle(), V);
3634 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3635 } else if (EltVT == MVT::f64) {
3636 APFloat FV(APFloat::IEEEdouble(), V);
3637 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3638 } else {
3639 Ops.push_back(DAG.getConstant(V, dl, EltVT));
3643 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3644 return DAG.getBitcast(VT, ConstsNode);
3647 static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
3648 SelectionDAG &DAG, const SDLoc &dl) {
3649 APInt Undefs = APInt::getZero(Bits.size());
3650 return getConstVector(Bits, Undefs, VT, DAG, dl);
3653 /// Returns a vector of specified type with all zero elements.
3654 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
3655 SelectionDAG &DAG, const SDLoc &dl) {
3656 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
3657 VT.getVectorElementType() == MVT::i1) &&
3658 "Unexpected vector type");
3660 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
3661 // type. This ensures they get CSE'd. But if the integer type is not
3662 // available, use a floating-point +0.0 instead.
3663 SDValue Vec;
3664 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
3665 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
3666 } else if (VT.isFloatingPoint()) {
3667 Vec = DAG.getConstantFP(+0.0, dl, VT);
3668 } else if (VT.getVectorElementType() == MVT::i1) {
3669 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
3670 "Unexpected vector type");
3671 Vec = DAG.getConstant(0, dl, VT);
3672 } else {
3673 unsigned Num32BitElts = VT.getSizeInBits() / 32;
3674 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
3676 return DAG.getBitcast(VT, Vec);
3679 // Helper to determine if the ops are all extracted subvectors that come from a
3680 // single source. If we allow commuting, they don't have to be in order (Lo/Hi).
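// For example, LHS = extract_subvector(X, 0) and RHS = extract_subvector(X, NumElts)
// (the upper half of X) return X.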
3681 static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
3682 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3683 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3684 LHS.getValueType() != RHS.getValueType() ||
3685 LHS.getOperand(0) != RHS.getOperand(0))
3686 return SDValue();
3688 SDValue Src = LHS.getOperand(0);
3689 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
3690 return SDValue();
3692 unsigned NumElts = LHS.getValueType().getVectorNumElements();
3693 if ((LHS.getConstantOperandAPInt(1) == 0 &&
3694 RHS.getConstantOperandAPInt(1) == NumElts) ||
3695 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
3696 LHS.getConstantOperandAPInt(1) == NumElts))
3697 return Src;
3699 return SDValue();
3702 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
3703 const SDLoc &dl, unsigned vectorWidth) {
3704 EVT VT = Vec.getValueType();
3705 EVT ElVT = VT.getVectorElementType();
3706 unsigned Factor = VT.getSizeInBits() / vectorWidth;
3707 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
3708 VT.getVectorNumElements() / Factor);
3710 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
3711 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
3712 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3714 // This is the index of the first element of the vectorWidth-bit chunk
3715 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
3716 IdxVal &= ~(ElemsPerChunk - 1);
3718 // If the input is a buildvector just emit a smaller one.
3719 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
3720 return DAG.getBuildVector(ResultVT, dl,
3721 Vec->ops().slice(IdxVal, ElemsPerChunk));
3723 // Check if we're extracting the upper undef of a widening pattern.
3724 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
3725 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
3726 isNullConstant(Vec.getOperand(2)))
3727 return DAG.getUNDEF(ResultVT);
3729 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3730 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
3733 /// Generate a DAG to grab 128 bits from a vector > 128 bits. This
3734 /// sets things up to match an AVX VEXTRACTF128 / VEXTRACTI128
3735 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
3736 /// instruction or a simple subregister reference. Idx is an index in the
3737 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
3738 /// lowering EXTRACT_VECTOR_ELT operations easier.
3739 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
3740 SelectionDAG &DAG, const SDLoc &dl) {
3741 assert((Vec.getValueType().is256BitVector() ||
3742 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
3743 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
3746 /// Generate a DAG to grab 256-bits from a 512-bit vector.
3747 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
3748 SelectionDAG &DAG, const SDLoc &dl) {
3749 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
3750 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
3753 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
3754 SelectionDAG &DAG, const SDLoc &dl,
3755 unsigned vectorWidth) {
3756 assert((vectorWidth == 128 || vectorWidth == 256) &&
3757 "Unsupported vector width");
3758 // Inserting UNDEF just returns Result unchanged.
3759 if (Vec.isUndef())
3760 return Result;
3761 EVT VT = Vec.getValueType();
3762 EVT ElVT = VT.getVectorElementType();
3763 EVT ResultVT = Result.getValueType();
3765 // Insert the relevant vectorWidth bits.
3766 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
3767 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3769 // This is the index of the first element of the vectorWidth-bit chunk
3770 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
3771 IdxVal &= ~(ElemsPerChunk - 1);
3773 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3774 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
3777 /// Generate a DAG to put 128 bits into a vector > 128 bits. This
3778 /// sets things up to match an AVX VINSERTF128/VINSERTI128 or
3779 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instruction or a
3780 /// simple superregister reference. Idx is an index in the 128 bits
3781 /// we want. It need not be aligned to a 128-bit boundary. That makes
3782 /// lowering INSERT_VECTOR_ELT operations easier.
3783 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
3784 SelectionDAG &DAG, const SDLoc &dl) {
3785 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
3786 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
3789 /// Widen a vector to a larger size with the same scalar type, with the new
3790 /// elements either zero or undef.
3791 static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
3792 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3793 const SDLoc &dl) {
3794 assert(Vec.getValueSizeInBits().getFixedValue() <= VT.getFixedSizeInBits() &&
3795 Vec.getValueType().getScalarType() == VT.getScalarType() &&
3796 "Unsupported vector widening type");
3797 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
3798 : DAG.getUNDEF(VT);
3799 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
3800 DAG.getIntPtrConstant(0, dl));
3803 /// Widen a vector to a larger size with the same scalar type, with the new
3804 /// elements either zero or undef.
3805 static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
3806 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3807 const SDLoc &dl, unsigned WideSizeInBits) {
3808 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
3809 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
3810 "Unsupported vector widening type");
3811 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
3812 MVT SVT = Vec.getSimpleValueType().getScalarType();
3813 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
3814 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
3817 /// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
3818 /// and bitcast with integer types.
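/// For example, v4i1 widens to v8i1 when DQI is available and to v16i1 otherwise;
/// v8i1 is only widened (to v16i1) when DQI is unavailable.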
3819 static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
3820 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
3821 unsigned NumElts = VT.getVectorNumElements();
3822 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
3823 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
3824 return VT;
3827 /// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
3828 /// bitcast with integer types.
3829 static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
3830 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3831 const SDLoc &dl) {
3832 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
3833 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
3836 // Helper function to collect subvector ops that are concatenated together,
3837 // either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
3838 // The subvectors in Ops are guaranteed to be the same type.
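// For example, concat_vectors(x, y) and
// insert_subvector(insert_subvector(undef, x, 0), y, NumElts/2) both collect to {x, y}.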
3839 static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
3840 SelectionDAG &DAG) {
3841 assert(Ops.empty() && "Expected an empty ops vector");
3843 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
3844 Ops.append(N->op_begin(), N->op_end());
3845 return true;
3848 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
3849 SDValue Src = N->getOperand(0);
3850 SDValue Sub = N->getOperand(1);
3851 const APInt &Idx = N->getConstantOperandAPInt(2);
3852 EVT VT = Src.getValueType();
3853 EVT SubVT = Sub.getValueType();
3855 // TODO - Handle more general insert_subvector chains.
3856 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
3857 // insert_subvector(undef, x, lo)
3858 if (Idx == 0 && Src.isUndef()) {
3859 Ops.push_back(Sub);
3860 Ops.push_back(DAG.getUNDEF(SubVT));
3861 return true;
3863 if (Idx == (VT.getVectorNumElements() / 2)) {
3864 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
3865 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
3866 Src.getOperand(1).getValueType() == SubVT &&
3867 isNullConstant(Src.getOperand(2))) {
3868 Ops.push_back(Src.getOperand(1));
3869 Ops.push_back(Sub);
3870 return true;
3872 // insert_subvector(x, extract_subvector(x, lo), hi)
3873 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3874 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
3875 Ops.append(2, Sub);
3876 return true;
3878 // insert_subvector(undef, x, hi)
3879 if (Src.isUndef()) {
3880 Ops.push_back(DAG.getUNDEF(SubVT));
3881 Ops.push_back(Sub);
3882 return true;
3888 return false;
3891 // Helper to check if \p V can be split into subvectors and the upper subvectors
3892 // are all undef, in which case return the lower half of \p V.
3893 static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
3894 SelectionDAG &DAG) {
3895 SmallVector<SDValue> SubOps;
3896 if (!collectConcatOps(V.getNode(), SubOps, DAG))
3897 return SDValue();
3899 unsigned NumSubOps = SubOps.size();
3900 unsigned HalfNumSubOps = NumSubOps / 2;
3901 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
3903 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
3904 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
3905 return SDValue();
3907 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
3908 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
3909 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
3912 // Helper to check if we can access all the constituent subvectors without any
3913 // extract ops.
3914 static bool isFreeToSplitVector(SDNode *N, SelectionDAG &DAG) {
3915 SmallVector<SDValue> Ops;
3916 return collectConcatOps(N, Ops, DAG);
3919 static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
3920 const SDLoc &dl) {
3921 EVT VT = Op.getValueType();
3922 unsigned NumElems = VT.getVectorNumElements();
3923 unsigned SizeInBits = VT.getSizeInBits();
3924 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
3925 "Can't split odd sized vector");
3927 // If this is a splat value (with no undefs) then use the lower subvector,
3928 // which should be a free extraction.
3929 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
3930 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
3931 return std::make_pair(Lo, Lo);
3933 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
3934 return std::make_pair(Lo, Hi);
3937 /// Break an operation into 2 half sized ops and then concatenate the results.
3938 static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
3939 unsigned NumOps = Op.getNumOperands();
3940 EVT VT = Op.getValueType();
3941 SDLoc dl(Op);
3943 // Split each vector operand into Lo/Hi halves; scalar operands are reused for both halves.
3944 SmallVector<SDValue> LoOps(NumOps, SDValue());
3945 SmallVector<SDValue> HiOps(NumOps, SDValue());
3946 for (unsigned I = 0; I != NumOps; ++I) {
3947 SDValue SrcOp = Op.getOperand(I);
3948 if (!SrcOp.getValueType().isVector()) {
3949 LoOps[I] = HiOps[I] = SrcOp;
3950 continue;
3952 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
3955 EVT LoVT, HiVT;
3956 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
3957 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
3958 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
3959 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
3962 /// Break a unary integer operation into 2 half sized ops and then
3963 /// concatenate the result back.
3964 static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
3965 // Make sure we only try to split 256/512-bit types to avoid creating
3966 // narrow vectors.
3967 EVT VT = Op.getValueType();
3968 (void)VT;
3969 assert((Op.getOperand(0).getValueType().is256BitVector() ||
3970 Op.getOperand(0).getValueType().is512BitVector()) &&
3971 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
3972 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
3973 VT.getVectorNumElements() &&
3974 "Unexpected VTs!");
3975 return splitVectorOp(Op, DAG);
3978 /// Break a binary integer operation into 2 half sized ops and then
3979 /// concatenate the result back.
3980 static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
3981 // Assert that all the types match.
3982 EVT VT = Op.getValueType();
3983 (void)VT;
3984 assert(Op.getOperand(0).getValueType() == VT &&
3985 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
3986 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
3987 return splitVectorOp(Op, DAG);
3990 // Helper for splitting the operands of an operation into legal target-sized parts
3991 // and applying a function on each part.
3992 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
3993 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
3994 // deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
3995 // The argument Builder is a function that will be applied on each split part:
3996 // SDValue Builder(SelectionDAG &G, SDLoc, ArrayRef<SDValue>)
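// For example, with CheckBWI enabled on an AVX2-only target, a 512-bit op is split
// into two 256-bit parts, Builder is invoked on each, and the results are concatenated.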
3997 template <typename F>
3998 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
3999 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4000 F Builder, bool CheckBWI = true) {
4001 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4002 unsigned NumSubs = 1;
4003 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4004 (!CheckBWI && Subtarget.useAVX512Regs())) {
4005 if (VT.getSizeInBits() > 512) {
4006 NumSubs = VT.getSizeInBits() / 512;
4007 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4009 } else if (Subtarget.hasAVX2()) {
4010 if (VT.getSizeInBits() > 256) {
4011 NumSubs = VT.getSizeInBits() / 256;
4012 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4014 } else {
4015 if (VT.getSizeInBits() > 128) {
4016 NumSubs = VT.getSizeInBits() / 128;
4017 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4021 if (NumSubs == 1)
4022 return Builder(DAG, DL, Ops);
4024 SmallVector<SDValue, 4> Subs;
4025 for (unsigned i = 0; i != NumSubs; ++i) {
4026 SmallVector<SDValue, 2> SubOps;
4027 for (SDValue Op : Ops) {
4028 EVT OpVT = Op.getValueType();
4029 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4030 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4031 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4033 Subs.push_back(Builder(DAG, DL, SubOps));
4035 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4038 // Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4039 // targets.
4040 static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4041 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4042 const X86Subtarget &Subtarget) {
4043 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4044 MVT SVT = VT.getScalarType();
4046 // If we have a 32/64 splatted constant, splat it to DstTy to
4047 // encourage a foldable broadcast'd operand.
4048 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4049 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4050 // AVX512 broadcasts 32/64-bit operands.
4051 // TODO: Support float once getAVX512Node is used by fp-ops.
4052 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4053 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
4054 return SDValue();
4055 // If we're not widening, don't bother if we're not bitcasting.
4056 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4057 return SDValue();
4058 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4059 APInt SplatValue, SplatUndef;
4060 unsigned SplatBitSize;
4061 bool HasAnyUndefs;
4062 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4063 HasAnyUndefs, OpEltSizeInBits) &&
4064 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4065 return DAG.getConstant(SplatValue, DL, DstVT);
4067 return SDValue();
4070 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4072 MVT DstVT = VT;
4073 if (Widen)
4074 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4076 // Canonicalize src operands.
4077 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
4078 for (SDValue &Op : SrcOps) {
4079 MVT OpVT = Op.getSimpleValueType();
4080 // Just pass through scalar operands.
4081 if (!OpVT.isVector())
4082 continue;
4083 assert(OpVT == VT && "Vector type mismatch");
4085 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4086 Op = BroadcastOp;
4087 continue;
4090 // Just widen the subvector by inserting into an undef wide vector.
4091 if (Widen)
4092 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4095 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4097 // Perform the 512-bit op, then extract the bottom subvector.
4098 if (Widen)
4099 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4100 return Res;
4103 /// Insert an i1 subvector into an i1 vector.
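/// Informal example (bit layout only, not code): inserting a v2i1 subvector
/// <s1, s0> at index 2 of a v8i1 vector <v7..v0> yields
///   v7 v6 v5 v4 s1 s0 v1 v0
/// which the code below builds by masking/shifting away the destination bits
/// of Vec, shifting SubVec into position within a widened legal mask type,
/// OR'ing the pieces, and extracting the original width again.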
4104 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4105 const X86Subtarget &Subtarget) {
4107 SDLoc dl(Op);
4108 SDValue Vec = Op.getOperand(0);
4109 SDValue SubVec = Op.getOperand(1);
4110 SDValue Idx = Op.getOperand(2);
4111 unsigned IdxVal = Op.getConstantOperandVal(2);
4113 // Inserting undef is a nop. We can just return the original vector.
4114 if (SubVec.isUndef())
4115 return Vec;
4117 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4118 return Op;
4120 MVT OpVT = Op.getSimpleValueType();
4121 unsigned NumElems = OpVT.getVectorNumElements();
4122 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
4124 // Extend to natively supported kshift.
4125 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4127 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4128 // if necessary.
4129 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4130 // May need to promote to a legal type.
4131 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4132 DAG.getConstant(0, dl, WideOpVT),
4133 SubVec, Idx);
4134 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4137 MVT SubVecVT = SubVec.getSimpleValueType();
4138 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4139 assert(IdxVal + SubVecNumElems <= NumElems &&
4140 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4141 "Unexpected index value in INSERT_SUBVECTOR");
4143 SDValue Undef = DAG.getUNDEF(WideOpVT);
4145 if (IdxVal == 0) {
4146 // Zero the lower bits of Vec.
4147 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4148 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4149 ZeroIdx);
4150 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4151 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4152 // Merge them together; SubVec should be zero-extended.
4153 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4154 DAG.getConstant(0, dl, WideOpVT),
4155 SubVec, ZeroIdx);
4156 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4157 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4160 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4161 Undef, SubVec, ZeroIdx);
4163 if (Vec.isUndef()) {
4164 assert(IdxVal != 0 && "Unexpected index");
4165 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4166 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4167 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4170 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4171 assert(IdxVal != 0 && "Unexpected index");
4172 // If upper elements of Vec are known undef, then just shift into place.
4173 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4174 [](SDValue V) { return V.isUndef(); })) {
4175 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4176 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4177 } else {
4178 NumElems = WideOpVT.getVectorNumElements();
4179 unsigned ShiftLeft = NumElems - SubVecNumElems;
4180 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4181 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4182 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4183 if (ShiftRight != 0)
4184 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4185 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4187 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4190 // Simple case when we put the subvector in the upper part.
4191 if (IdxVal + SubVecNumElems == NumElems) {
4192 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4193 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4194 if (SubVecNumElems * 2 == NumElems) {
4195 // Special case: use a legal zero-extending insert_subvector. This allows
4196 // isel to optimize when bits are known zero.
4197 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4198 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4199 DAG.getConstant(0, dl, WideOpVT),
4200 Vec, ZeroIdx);
4201 } else {
4202 // Otherwise use explicit shifts to zero the bits.
4203 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4204 Undef, Vec, ZeroIdx);
4205 NumElems = WideOpVT.getVectorNumElements();
4206 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4207 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4208 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4210 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4211 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4214 // Inserting into the middle is more complicated.
4216 NumElems = WideOpVT.getVectorNumElements();
4218 // Widen the vector if needed.
4219 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4221 unsigned ShiftLeft = NumElems - SubVecNumElems;
4222 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4224 // Do an optimization for the most frequently used types.
4225 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4226 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4227 Mask0.flipAllBits();
4228 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4229 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4230 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4231 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4232 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4233 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4234 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4235 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4237 // Reduce to original width if needed.
4238 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4241 // Clear the upper bits of the subvector and move it to its insert position.
4242 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4243 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4244 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4245 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4247 // Isolate the bits below the insertion point.
4248 unsigned LowShift = NumElems - IdxVal;
4249 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4250 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4251 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4252 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4254 // Isolate the bits after the last inserted bit.
4255 unsigned HighShift = IdxVal + SubVecNumElems;
4256 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4257 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4258 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4259 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4261 // Now OR all 3 pieces together.
4262 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4263 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4265 // Reduce to original width if needed.
4266 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4269 static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4270 const SDLoc &dl) {
4271 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4272 EVT SubVT = V1.getValueType();
4273 EVT SubSVT = SubVT.getScalarType();
4274 unsigned SubNumElts = SubVT.getVectorNumElements();
4275 unsigned SubVectorWidth = SubVT.getSizeInBits();
4276 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4277 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4278 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4281 /// Returns a vector of the specified type with all bits set.
4282 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4283 /// Then bitcast to their original type, ensuring they get CSE'd.
4284 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4285 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4286 "Expected a 128/256/512-bit vector type");
4288 APInt Ones = APInt::getAllOnes(32);
4289 unsigned NumElts = VT.getSizeInBits() / 32;
4290 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
4291 return DAG.getBitcast(VT, Vec);
4294 static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4295 SDValue In, SelectionDAG &DAG) {
4296 EVT InVT = In.getValueType();
4297 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4298 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
4299 ISD::ZERO_EXTEND == Opcode) &&
4300 "Unknown extension opcode");
4302 // For 256-bit vectors, we only need the lower (128-bit) input half.
4303 // For 512-bit vectors, we only need the lower input half or quarter.
4304 if (InVT.getSizeInBits() > 128) {
4305 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4306 "Expected VTs to be the same size!");
4307 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4308 In = extractSubVector(In, 0, DAG, DL,
4309 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4310 InVT = In.getValueType();
4313 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4314 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4316 return DAG.getNode(Opcode, DL, VT, In);
4319 // Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4320 static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4321 SDValue Mask, SelectionDAG &DAG) {
4322 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4323 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4324 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
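// Example of the mask created below (illustrative): for VT = v8i16, Lo = true
// and Unary = false this produces <0, 8, 1, 9, 2, 10, 3, 11> (the PUNPCKLWD
// pattern); with Unary = true both halves index the first input, giving
// <0, 0, 1, 1, 2, 2, 3, 3>.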
4327 void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4328 bool Lo, bool Unary) {
4329 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4330 "Illegal vector type to unpack");
4331 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4332 int NumElts = VT.getVectorNumElements();
4333 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4334 for (int i = 0; i < NumElts; ++i) {
4335 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4336 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4337 Pos += (Unary ? 0 : NumElts * (i % 2));
4338 Pos += (Lo ? 0 : NumEltsInLane / 2);
4339 Mask.push_back(Pos);
4343 /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4344 /// imposed by AVX and specific to the unary pattern. Example:
4345 /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4346 /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4347 void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4348 bool Lo) {
4349 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4350 int NumElts = VT.getVectorNumElements();
4351 for (int i = 0; i < NumElts; ++i) {
4352 int Pos = i / 2;
4353 Pos += (Lo ? 0 : NumElts / 2);
4354 Mask.push_back(Pos);
4358 // Attempt to constant fold; otherwise just create a VECTOR_SHUFFLE.
4359 static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4360 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4361 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
4362 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4363 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4364 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4365 int M = Mask[I];
4366 if (M < 0)
4367 continue;
4368 SDValue V = (M < NumElts) ? V1 : V2;
4369 if (V.isUndef())
4370 continue;
4371 Ops[I] = V.getOperand(M % NumElts);
4373 return DAG.getBuildVector(VT, dl, Ops);
4376 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4379 /// Returns a vector_shuffle node for an unpackl operation.
4380 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4381 SDValue V1, SDValue V2) {
4382 SmallVector<int, 8> Mask;
4383 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4384 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4387 /// Returns a vector_shuffle node for an unpackh operation.
4388 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4389 SDValue V1, SDValue V2) {
4390 SmallVector<int, 8> Mask;
4391 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4392 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4395 /// Returns a node that packs the LHS + RHS nodes together at half width.
4396 /// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4397 /// TODO: Add subvector splitting if/when we have a need for it.
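/// Illustrative call (a sketch; the operands are hypothetical): packing two
/// v8i16 values into a v16i8 result with
///   getPack(DAG, Subtarget, dl, MVT::v16i8, LHS, RHS)
/// emits PACKUS when the upper byte of every i16 element is known zero,
/// PACKSS when the inputs have enough sign bits, and otherwise masks or
/// shifts the requested half into place before packing.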
4398 static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4399 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4400 bool PackHiHalf = false) {
4401 MVT OpVT = LHS.getSimpleValueType();
4402 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4403 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4404 assert(OpVT == RHS.getSimpleValueType() &&
4405 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4406 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4407 "Unexpected PACK operand types");
4408 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4409 "Unexpected PACK result type");
4411 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4412 if (EltSizeInBits == 32) {
4413 SmallVector<int> PackMask;
4414 int Offset = PackHiHalf ? 1 : 0;
4415 int NumElts = VT.getVectorNumElements();
4416 for (int I = 0; I != NumElts; I += 4) {
4417 PackMask.push_back(I + Offset);
4418 PackMask.push_back(I + Offset + 2);
4419 PackMask.push_back(I + Offset + NumElts);
4420 PackMask.push_back(I + Offset + NumElts + 2);
4422 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4423 DAG.getBitcast(VT, RHS), PackMask);
4426 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4427 if (!PackHiHalf) {
4428 if (UsePackUS &&
4429 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4430 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4431 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4433 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4434 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4435 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4438 // Fall back to sign/zero-extending the requested half, then pack.
4439 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4440 if (UsePackUS) {
4441 if (PackHiHalf) {
4442 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4443 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4444 } else {
4445 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4446 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4447 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4449 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4452 if (!PackHiHalf) {
4453 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4454 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4456 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4457 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4458 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4461 /// Return a vector_shuffle of the specified vector with a zero or undef vector.
4462 /// This produces a shuffle where the low element of V2 is swizzled into the
4463 /// zero/undef vector, landing at element Idx.
4464 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4465 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4466 bool IsZero,
4467 const X86Subtarget &Subtarget,
4468 SelectionDAG &DAG) {
4469 MVT VT = V2.getSimpleValueType();
4470 SDValue V1 = IsZero
4471 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4472 int NumElems = VT.getVectorNumElements();
4473 SmallVector<int, 16> MaskVec(NumElems);
4474 for (int i = 0; i != NumElems; ++i)
4475 // If this is the insertion idx, put the low elt of V2 here.
4476 MaskVec[i] = (i == Idx) ? NumElems : i;
4477 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4480 static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4481 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4482 Ptr.getOpcode() == X86ISD::WrapperRIP)
4483 Ptr = Ptr.getOperand(0);
4485 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
4486 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4487 return nullptr;
4489 return CNode->getConstVal();
4492 static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4493 if (!Load || !ISD::isNormalLoad(Load))
4494 return nullptr;
4495 return getTargetConstantFromBasePtr(Load->getBasePtr());
4498 static const Constant *getTargetConstantFromNode(SDValue Op) {
4499 Op = peekThroughBitcasts(Op);
4500 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4503 const Constant *
4504 X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4505 assert(LD && "Unexpected null LoadSDNode");
4506 return getTargetConstantFromNode(LD);
4509 // Extract raw constant bits from constant pools.
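// Example of the re-bucketing performed here (illustrative constant values):
// requesting 64-bit elements of a v4i32 build vector <1, 2, undef, 4> yields
//   EltBits   = { 0x0000000200000001, 0x0000000400000000 }
//   UndefElts = 0 (no 64-bit element is wholly undef)
// with the partially-undef element treated as zero, which requires
// AllowPartialUndefs to be true.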
4510 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4511 APInt &UndefElts,
4512 SmallVectorImpl<APInt> &EltBits,
4513 bool AllowWholeUndefs = true,
4514 bool AllowPartialUndefs = true) {
4515 assert(EltBits.empty() && "Expected an empty EltBits vector");
4517 Op = peekThroughBitcasts(Op);
4519 EVT VT = Op.getValueType();
4520 unsigned SizeInBits = VT.getSizeInBits();
4521 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
4522 unsigned NumElts = SizeInBits / EltSizeInBits;
4524 // Bitcast a source array of element bits to the target size.
4525 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
4526 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
4527 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
4528 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
4529 "Constant bit sizes don't match");
4531 // Don't split if we don't allow undef bits.
4532 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
4533 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
4534 return false;
4536 // If we're already the right size, don't bother bitcasting.
4537 if (NumSrcElts == NumElts) {
4538 UndefElts = UndefSrcElts;
4539 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
4540 return true;
4543 // Extract all the undef/constant element data and pack into single bitsets.
4544 APInt UndefBits(SizeInBits, 0);
4545 APInt MaskBits(SizeInBits, 0);
4547 for (unsigned i = 0; i != NumSrcElts; ++i) {
4548 unsigned BitOffset = i * SrcEltSizeInBits;
4549 if (UndefSrcElts[i])
4550 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
4551 MaskBits.insertBits(SrcEltBits[i], BitOffset);
4554 // Split the undef/constant single bitset data into the target elements.
4555 UndefElts = APInt(NumElts, 0);
4556 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
4558 for (unsigned i = 0; i != NumElts; ++i) {
4559 unsigned BitOffset = i * EltSizeInBits;
4560 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
4562 // Only treat an element as UNDEF if all bits are UNDEF.
4563 if (UndefEltBits.isAllOnes()) {
4564 if (!AllowWholeUndefs)
4565 return false;
4566 UndefElts.setBit(i);
4567 continue;
4570 // If only some bits are UNDEF, treat them as zero (or bail if partial
4571 // undefs are not supported).
4572 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
4573 return false;
4575 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
4577 return true;
4580 // Collect constant bits and insert into mask/undef bit masks.
4581 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
4582 unsigned UndefBitIndex) {
4583 if (!Cst)
4584 return false;
4585 if (isa<UndefValue>(Cst)) {
4586 Undefs.setBit(UndefBitIndex);
4587 return true;
4589 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
4590 Mask = CInt->getValue();
4591 return true;
4593 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
4594 Mask = CFP->getValueAPF().bitcastToAPInt();
4595 return true;
4597 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
4598 Type *Ty = CDS->getType();
4599 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
4600 Type *EltTy = CDS->getElementType();
4601 bool IsInteger = EltTy->isIntegerTy();
4602 bool IsFP =
4603 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4604 if (!IsInteger && !IsFP)
4605 return false;
4606 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4607 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4608 if (IsInteger)
4609 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4610 else
4611 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
4612 I * EltBits);
4613 return true;
4615 return false;
4618 // Handle UNDEFs.
4619 if (Op.isUndef()) {
4620 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
4621 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
4622 return CastBitData(UndefSrcElts, SrcEltBits);
4625 // Extract scalar constant bits.
4626 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
4627 APInt UndefSrcElts = APInt::getZero(1);
4628 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
4629 return CastBitData(UndefSrcElts, SrcEltBits);
4631 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
4632 APInt UndefSrcElts = APInt::getZero(1);
4633 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
4634 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
4635 return CastBitData(UndefSrcElts, SrcEltBits);
4638 // Extract constant bits from build vector.
4639 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
4640 BitVector Undefs;
4641 SmallVector<APInt> SrcEltBits;
4642 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4643 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
4644 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
4645 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
4646 if (Undefs[I])
4647 UndefSrcElts.setBit(I);
4648 return CastBitData(UndefSrcElts, SrcEltBits);
4652 // Extract constant bits from constant pool vector.
4653 if (auto *Cst = getTargetConstantFromNode(Op)) {
4654 Type *CstTy = Cst->getType();
4655 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4656 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
4657 return false;
4659 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
4660 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4662 APInt UndefSrcElts(NumSrcElts, 0);
4663 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
4664 for (unsigned i = 0; i != NumSrcElts; ++i)
4665 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
4666 UndefSrcElts, i))
4667 return false;
4669 return CastBitData(UndefSrcElts, SrcEltBits);
4672 // Extract constant bits from a broadcasted constant pool scalar.
4673 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
4674 EltSizeInBits <= VT.getScalarSizeInBits()) {
4675 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4676 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
4677 return false;
4679 SDValue Ptr = MemIntr->getBasePtr();
4680 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
4681 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4682 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4684 APInt UndefSrcElts(NumSrcElts, 0);
4685 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
4686 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
4687 if (UndefSrcElts[0])
4688 UndefSrcElts.setBits(0, NumSrcElts);
4689 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
4690 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
4691 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
4692 return CastBitData(UndefSrcElts, SrcEltBits);
4697 // Extract constant bits from a subvector broadcast.
4698 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
4699 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4700 SDValue Ptr = MemIntr->getBasePtr();
4701 // The source constant may be larger than the subvector broadcast, so
4702 // ensure we extract the correct subvector constants.
4703 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
4704 Type *CstTy = Cst->getType();
4705 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4706 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
4707 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
4708 (SizeInBits % SubVecSizeInBits) != 0)
4709 return false;
4710 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
4711 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
4712 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
4713 APInt UndefSubElts(NumSubElts, 0);
4714 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
4715 APInt(CstEltSizeInBits, 0));
4716 for (unsigned i = 0; i != NumSubElts; ++i) {
4717 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
4718 UndefSubElts, i))
4719 return false;
4720 for (unsigned j = 1; j != NumSubVecs; ++j)
4721 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
4723 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
4724 UndefSubElts);
4725 return CastBitData(UndefSubElts, SubEltBits);
4729 // Extract a rematerialized scalar constant insertion.
4730 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
4731 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
4732 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
4733 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4734 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4736 APInt UndefSrcElts(NumSrcElts, 0);
4737 SmallVector<APInt, 64> SrcEltBits;
4738 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
4739 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
4740 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
4741 return CastBitData(UndefSrcElts, SrcEltBits);
4744 // Insert constant bits from the base and subvector sources.
4745 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
4746 // If we bitcast to larger elements we might lose track of undefs, so to be
4747 // safe don't allow any.
4748 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4749 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
4751 APInt UndefSrcElts, UndefSubElts;
4752 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
4753 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
4754 UndefSubElts, EltSubBits,
4755 AllowWholeUndefs && AllowUndefs,
4756 AllowPartialUndefs && AllowUndefs) &&
4757 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
4758 UndefSrcElts, EltSrcBits,
4759 AllowWholeUndefs && AllowUndefs,
4760 AllowPartialUndefs && AllowUndefs)) {
4761 unsigned BaseIdx = Op.getConstantOperandVal(2);
4762 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
4763 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
4764 EltSrcBits[BaseIdx + i] = EltSubBits[i];
4765 return CastBitData(UndefSrcElts, EltSrcBits);
4769 // Extract constant bits from a subvector's source.
4770 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4771 // TODO - support extract_subvector through bitcasts.
4772 if (EltSizeInBits != VT.getScalarSizeInBits())
4773 return false;
4775 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
4776 UndefElts, EltBits, AllowWholeUndefs,
4777 AllowPartialUndefs)) {
4778 EVT SrcVT = Op.getOperand(0).getValueType();
4779 unsigned NumSrcElts = SrcVT.getVectorNumElements();
4780 unsigned NumSubElts = VT.getVectorNumElements();
4781 unsigned BaseIdx = Op.getConstantOperandVal(1);
4782 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
4783 if ((BaseIdx + NumSubElts) != NumSrcElts)
4784 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
4785 if (BaseIdx != 0)
4786 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
4787 return true;
4791 // Extract constant bits from shuffle node sources.
4792 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
4793 // TODO - support shuffle through bitcasts.
4794 if (EltSizeInBits != VT.getScalarSizeInBits())
4795 return false;
4797 ArrayRef<int> Mask = SVN->getMask();
4798 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
4799 llvm::any_of(Mask, [](int M) { return M < 0; }))
4800 return false;
4802 APInt UndefElts0, UndefElts1;
4803 SmallVector<APInt, 32> EltBits0, EltBits1;
4804 if (isAnyInRange(Mask, 0, NumElts) &&
4805 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
4806 UndefElts0, EltBits0, AllowWholeUndefs,
4807 AllowPartialUndefs))
4808 return false;
4809 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
4810 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
4811 UndefElts1, EltBits1, AllowWholeUndefs,
4812 AllowPartialUndefs))
4813 return false;
4815 UndefElts = APInt::getZero(NumElts);
4816 for (int i = 0; i != (int)NumElts; ++i) {
4817 int M = Mask[i];
4818 if (M < 0) {
4819 UndefElts.setBit(i);
4820 EltBits.push_back(APInt::getZero(EltSizeInBits));
4821 } else if (M < (int)NumElts) {
4822 if (UndefElts0[M])
4823 UndefElts.setBit(i);
4824 EltBits.push_back(EltBits0[M]);
4825 } else {
4826 if (UndefElts1[M - NumElts])
4827 UndefElts.setBit(i);
4828 EltBits.push_back(EltBits1[M - NumElts]);
4831 return true;
4834 return false;
4837 namespace llvm {
4838 namespace X86 {
4839 bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
4840 APInt UndefElts;
4841 SmallVector<APInt, 16> EltBits;
4842 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
4843 UndefElts, EltBits, true,
4844 AllowPartialUndefs)) {
4845 int SplatIndex = -1;
4846 for (int i = 0, e = EltBits.size(); i != e; ++i) {
4847 if (UndefElts[i])
4848 continue;
4849 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
4850 SplatIndex = -1;
4851 break;
4853 SplatIndex = i;
4855 if (0 <= SplatIndex) {
4856 SplatVal = EltBits[SplatIndex];
4857 return true;
4861 return false;
4863 } // namespace X86
4864 } // namespace llvm
4866 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
4867 unsigned MaskEltSizeInBits,
4868 SmallVectorImpl<uint64_t> &RawMask,
4869 APInt &UndefElts) {
4870 // Extract the raw target constant bits.
4871 SmallVector<APInt, 64> EltBits;
4872 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
4873 EltBits, /* AllowWholeUndefs */ true,
4874 /* AllowPartialUndefs */ false))
4875 return false;
4877 // Insert the extracted elements into the mask.
4878 for (const APInt &Elt : EltBits)
4879 RawMask.push_back(Elt.getZExtValue());
4881 return true;
4884 // Match not(xor X, -1) -> X.
4885 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
4886 // Match not(extract_subvector(xor X, -1)) -> extract_subvector(X).
4887 // Match not(concat_vectors(xor X, -1, xor Y, -1)) -> concat_vectors(X, Y).
4888 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
4889 V = peekThroughBitcasts(V);
4890 if (V.getOpcode() == ISD::XOR &&
4891 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
4892 isAllOnesConstant(V.getOperand(1))))
4893 return V.getOperand(0);
4894 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4895 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
4896 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
4897 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
4898 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
4899 Not, V.getOperand(1));
4902 if (V.getOpcode() == X86ISD::PCMPGT &&
4903 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
4904 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
4905 V.getOperand(0).hasOneUse()) {
4906 APInt UndefElts;
4907 SmallVector<APInt> EltBits;
4908 if (getTargetConstantBitsFromNode(V.getOperand(0),
4909 V.getScalarValueSizeInBits(), UndefElts,
4910 EltBits)) {
4911 // Don't fold min_signed_value -> (min_signed_value - 1)
4912 bool MinSigned = false;
4913 for (APInt &Elt : EltBits) {
4914 MinSigned |= Elt.isMinSignedValue();
4915 Elt -= 1;
4917 if (!MinSigned) {
4918 SDLoc DL(V);
4919 MVT VT = V.getSimpleValueType();
4920 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
4921 getConstVector(EltBits, UndefElts, VT, DAG, DL));
4925 SmallVector<SDValue, 2> CatOps;
4926 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
4927 for (SDValue &CatOp : CatOps) {
4928 SDValue NotCat = IsNOT(CatOp, DAG);
4929 if (!NotCat) return SDValue();
4930 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
4932 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
4934 return SDValue();
4937 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
4938 /// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
4939 /// Note: This ignores saturation, so inputs must be checked first.
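/// Example (illustrative): for VT = v16i8, Unary = false and NumStages = 1 the
/// mask is <0, 2, 4, ..., 14, 16, 18, ..., 30>, i.e. the even bytes of the two
/// v16i8-viewed inputs, matching a single PACKSSWB/PACKUSWB truncation.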
4940 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4941 bool Unary, unsigned NumStages = 1) {
4942 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4943 unsigned NumElts = VT.getVectorNumElements();
4944 unsigned NumLanes = VT.getSizeInBits() / 128;
4945 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
4946 unsigned Offset = Unary ? 0 : NumElts;
4947 unsigned Repetitions = 1u << (NumStages - 1);
4948 unsigned Increment = 1u << NumStages;
4949 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
4951 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
4952 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
4953 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
4954 Mask.push_back(Elt + (Lane * NumEltsPerLane));
4955 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
4956 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
4961 // Split the demanded elts of a PACKSS/PACKUS node between its operands.
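// Example (illustrative): for a v16i8 PACK result, demanding only result
// element 3 sets bit 3 of DemandedLHS and nothing in DemandedRHS, since
// result elements 0-7 of each 128-bit lane come from the LHS and elements
// 8-15 from the RHS.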
4962 static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
4963 APInt &DemandedLHS, APInt &DemandedRHS) {
4964 int NumLanes = VT.getSizeInBits() / 128;
4965 int NumElts = DemandedElts.getBitWidth();
4966 int NumInnerElts = NumElts / 2;
4967 int NumEltsPerLane = NumElts / NumLanes;
4968 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
4970 DemandedLHS = APInt::getZero(NumInnerElts);
4971 DemandedRHS = APInt::getZero(NumInnerElts);
4973 // Map DemandedElts to the packed operands.
4974 for (int Lane = 0; Lane != NumLanes; ++Lane) {
4975 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
4976 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
4977 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
4978 if (DemandedElts[OuterIdx])
4979 DemandedLHS.setBit(InnerIdx);
4980 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
4981 DemandedRHS.setBit(InnerIdx);
4986 // Split the demanded elts of a HADD/HSUB node between its operands.
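// Example (illustrative): for a v4i32 HADD, demanding only result element 0
// sets bits 0 and 1 of DemandedLHS (result[0] = LHS[0] + LHS[1]) and leaves
// DemandedRHS empty.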
4987 static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
4988 APInt &DemandedLHS, APInt &DemandedRHS) {
4989 int NumLanes = VT.getSizeInBits() / 128;
4990 int NumElts = DemandedElts.getBitWidth();
4991 int NumEltsPerLane = NumElts / NumLanes;
4992 int HalfEltsPerLane = NumEltsPerLane / 2;
4994 DemandedLHS = APInt::getZero(NumElts);
4995 DemandedRHS = APInt::getZero(NumElts);
4997 // Map DemandedElts to the horizontal operands.
4998 for (int Idx = 0; Idx != NumElts; ++Idx) {
4999 if (!DemandedElts[Idx])
5000 continue;
5001 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
5002 int LocalIdx = Idx % NumEltsPerLane;
5003 if (LocalIdx < HalfEltsPerLane) {
5004 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
5005 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
5006 } else {
5007 LocalIdx -= HalfEltsPerLane;
5008 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
5009 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
5014 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5015 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5016 /// operands in \p Ops, and returns true.
5017 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5018 /// IsUnary for shuffles which use a single input multiple times, and in those
5019 /// cases it will adjust the mask to only have indices within that single input.
5020 /// It is an error to call this with non-empty Mask/Ops vectors.
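/// Example decode (illustrative node, not taken from a test): a v4f32
/// X86ISD::SHUFP with immediate 0x1B yields
///   Ops  = { N->getOperand(0), N->getOperand(1) }
///   Mask = { 3, 2, 5, 4 }
/// with IsUnary = false (unless both operands are the same node).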
5021 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5022 SmallVectorImpl<SDValue> &Ops,
5023 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5024 unsigned NumElems = VT.getVectorNumElements();
5025 unsigned MaskEltSize = VT.getScalarSizeInBits();
5026 SmallVector<uint64_t, 32> RawMask;
5027 APInt RawUndefs;
5028 uint64_t ImmN;
5030 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5031 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5033 IsUnary = false;
5034 bool IsFakeUnary = false;
5035 switch (N->getOpcode()) {
5036 case X86ISD::BLENDI:
5037 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5038 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5039 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5040 DecodeBLENDMask(NumElems, ImmN, Mask);
5041 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5042 break;
5043 case X86ISD::SHUFP:
5044 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5045 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5046 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5047 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5048 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5049 break;
5050 case X86ISD::INSERTPS:
5051 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5052 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5053 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5054 DecodeINSERTPSMask(ImmN, Mask);
5055 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5056 break;
5057 case X86ISD::EXTRQI:
5058 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5059 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5060 isa<ConstantSDNode>(N->getOperand(2))) {
5061 int BitLen = N->getConstantOperandVal(1);
5062 int BitIdx = N->getConstantOperandVal(2);
5063 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5064 IsUnary = true;
5066 break;
5067 case X86ISD::INSERTQI:
5068 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5069 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5070 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5071 isa<ConstantSDNode>(N->getOperand(3))) {
5072 int BitLen = N->getConstantOperandVal(2);
5073 int BitIdx = N->getConstantOperandVal(3);
5074 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5075 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5077 break;
5078 case X86ISD::UNPCKH:
5079 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5080 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5081 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5082 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5083 break;
5084 case X86ISD::UNPCKL:
5085 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5086 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5087 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5088 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5089 break;
5090 case X86ISD::MOVHLPS:
5091 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5092 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5093 DecodeMOVHLPSMask(NumElems, Mask);
5094 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5095 break;
5096 case X86ISD::MOVLHPS:
5097 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5098 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5099 DecodeMOVLHPSMask(NumElems, Mask);
5100 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5101 break;
5102 case X86ISD::VALIGN:
5103 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5104 "Only 32-bit and 64-bit elements are supported!");
5105 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5106 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5107 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5108 DecodeVALIGNMask(NumElems, ImmN, Mask);
5109 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5110 Ops.push_back(N->getOperand(1));
5111 Ops.push_back(N->getOperand(0));
5112 break;
5113 case X86ISD::PALIGNR:
5114 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5115 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5116 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5117 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5118 DecodePALIGNRMask(NumElems, ImmN, Mask);
5119 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5120 Ops.push_back(N->getOperand(1));
5121 Ops.push_back(N->getOperand(0));
5122 break;
5123 case X86ISD::VSHLDQ:
5124 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5125 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5126 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5127 DecodePSLLDQMask(NumElems, ImmN, Mask);
5128 IsUnary = true;
5129 break;
5130 case X86ISD::VSRLDQ:
5131 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5132 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5133 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5134 DecodePSRLDQMask(NumElems, ImmN, Mask);
5135 IsUnary = true;
5136 break;
5137 case X86ISD::PSHUFD:
5138 case X86ISD::VPERMILPI:
5139 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5140 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5141 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5142 IsUnary = true;
5143 break;
5144 case X86ISD::PSHUFHW:
5145 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5146 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5147 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5148 IsUnary = true;
5149 break;
5150 case X86ISD::PSHUFLW:
5151 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5152 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5153 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5154 IsUnary = true;
5155 break;
5156 case X86ISD::VZEXT_MOVL:
5157 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5158 DecodeZeroMoveLowMask(NumElems, Mask);
5159 IsUnary = true;
5160 break;
5161 case X86ISD::VBROADCAST:
5162 // We only decode broadcasts of same-sized vectors; peeking through to
5163 // extracted subvectors is likely to cause hasOneUse issues with
5164 // SimplifyDemandedBits etc.
5165 if (N->getOperand(0).getValueType() == VT) {
5166 DecodeVectorBroadcast(NumElems, Mask);
5167 IsUnary = true;
5168 break;
5170 return false;
5171 case X86ISD::VPERMILPV: {
5172 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5173 IsUnary = true;
5174 SDValue MaskNode = N->getOperand(1);
5175 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5176 RawUndefs)) {
5177 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5178 break;
5180 return false;
5182 case X86ISD::PSHUFB: {
5183 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5184 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5185 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5186 IsUnary = true;
5187 SDValue MaskNode = N->getOperand(1);
5188 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5189 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5190 break;
5192 return false;
5194 case X86ISD::VPERMI:
5195 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5196 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5197 DecodeVPERMMask(NumElems, ImmN, Mask);
5198 IsUnary = true;
5199 break;
5200 case X86ISD::MOVSS:
5201 case X86ISD::MOVSD:
5202 case X86ISD::MOVSH:
5203 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5204 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5205 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5206 break;
5207 case X86ISD::VPERM2X128:
5208 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5209 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5210 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5211 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5212 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5213 break;
5214 case X86ISD::SHUF128:
5215 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5216 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5217 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
5218 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5219 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5220 break;
5221 case X86ISD::MOVSLDUP:
5222 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5223 DecodeMOVSLDUPMask(NumElems, Mask);
5224 IsUnary = true;
5225 break;
5226 case X86ISD::MOVSHDUP:
5227 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5228 DecodeMOVSHDUPMask(NumElems, Mask);
5229 IsUnary = true;
5230 break;
5231 case X86ISD::MOVDDUP:
5232 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5233 DecodeMOVDDUPMask(NumElems, Mask);
5234 IsUnary = true;
5235 break;
5236 case X86ISD::VPERMIL2: {
5237 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5238 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5239 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5240 SDValue MaskNode = N->getOperand(2);
5241 SDValue CtrlNode = N->getOperand(3);
5242 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5243 unsigned CtrlImm = CtrlOp->getZExtValue();
5244 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5245 RawUndefs)) {
5246 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5247 Mask);
5248 break;
5251 return false;
5253 case X86ISD::VPPERM: {
5254 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5255 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5256 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5257 SDValue MaskNode = N->getOperand(2);
5258 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5259 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5260 break;
5262 return false;
5264 case X86ISD::VPERMV: {
5265 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5266 IsUnary = true;
5267 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5268 Ops.push_back(N->getOperand(1));
5269 SDValue MaskNode = N->getOperand(0);
5270 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5271 RawUndefs)) {
5272 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5273 break;
5275 return false;
5277 case X86ISD::VPERMV3: {
5278 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5279 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5280 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5281 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5282 Ops.push_back(N->getOperand(0));
5283 Ops.push_back(N->getOperand(2));
5284 SDValue MaskNode = N->getOperand(1);
5285 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5286 RawUndefs)) {
5287 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5288 break;
5290 return false;
5292 default: llvm_unreachable("unknown target shuffle node");
5295 // Empty mask indicates the decode failed.
5296 if (Mask.empty())
5297 return false;
5299 // Check if we're getting a shuffle mask with zeroed elements.
5300 if (!AllowSentinelZero && isAnyZero(Mask))
5301 return false;
5303 // If we have a fake unary shuffle, the shuffle mask is spread across two
5304 // inputs that are actually the same node. Re-map the mask to always point
5305 // into the first input.
5306 if (IsFakeUnary)
5307 for (int &M : Mask)
5308 if (M >= (int)Mask.size())
5309 M -= Mask.size();
5311 // If we didn't already add operands in the opcode-specific code, default to
5312 // adding 1 or 2 operands starting at 0.
5313 if (Ops.empty()) {
5314 Ops.push_back(N->getOperand(0));
5315 if (!IsUnary || IsFakeUnary)
5316 Ops.push_back(N->getOperand(1));
5319 return true;
5322 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
5323 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5324 SmallVectorImpl<SDValue> &Ops,
5325 SmallVectorImpl<int> &Mask) {
5326 bool IsUnary;
5327 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
5330 /// Compute whether each element of a shuffle is zeroable.
5332 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
5333 /// Either it is an undef element in the shuffle mask, the element of the input
5334 /// referenced is undef, or the element of the input referenced is known to be
5335 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5336 /// as many lanes with this technique as possible to simplify the remaining
5337 /// shuffle.
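/// Small illustration (made-up operands): for Mask = <0, 4, -1, 7> with
/// V1 = build_vector(a, b, c, d) and V2 an all-zeros build vector, element 2
/// is marked KnownUndef and elements 1 and 3 are marked KnownZero.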
5338 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5339 SDValue V1, SDValue V2,
5340 APInt &KnownUndef, APInt &KnownZero) {
5341 int Size = Mask.size();
5342 KnownUndef = KnownZero = APInt::getZero(Size);
5344 V1 = peekThroughBitcasts(V1);
5345 V2 = peekThroughBitcasts(V2);
5347 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5348 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5350 int VectorSizeInBits = V1.getValueSizeInBits();
5351 int ScalarSizeInBits = VectorSizeInBits / Size;
5352 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5354 for (int i = 0; i < Size; ++i) {
5355 int M = Mask[i];
5356 // Handle the easy cases.
5357 if (M < 0) {
5358 KnownUndef.setBit(i);
5359 continue;
5361 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5362 KnownZero.setBit(i);
5363 continue;
5366 // Determine shuffle input and normalize the mask.
5367 SDValue V = M < Size ? V1 : V2;
5368 M %= Size;
5370 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5371 if (V.getOpcode() != ISD::BUILD_VECTOR)
5372 continue;
5374 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5375 // the (larger) source element must be UNDEF/ZERO.
5376 if ((Size % V.getNumOperands()) == 0) {
5377 int Scale = Size / V->getNumOperands();
5378 SDValue Op = V.getOperand(M / Scale);
5379 if (Op.isUndef())
5380 KnownUndef.setBit(i);
5381 if (X86::isZeroNode(Op))
5382 KnownZero.setBit(i);
5383 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5384 APInt Val = Cst->getAPIntValue();
5385 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5386 if (Val == 0)
5387 KnownZero.setBit(i);
5388 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5389 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5390 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5391 if (Val == 0)
5392 KnownZero.setBit(i);
5394 continue;
5397 // If the BUILD_VECTOR has more elements, then all of the (smaller) source
5398 // elements must be UNDEF or ZERO.
5399 if ((V.getNumOperands() % Size) == 0) {
5400 int Scale = V->getNumOperands() / Size;
5401 bool AllUndef = true;
5402 bool AllZero = true;
5403 for (int j = 0; j < Scale; ++j) {
5404 SDValue Op = V.getOperand((M * Scale) + j);
5405 AllUndef &= Op.isUndef();
5406 AllZero &= X86::isZeroNode(Op);
5408 if (AllUndef)
5409 KnownUndef.setBit(i);
5410 if (AllZero)
5411 KnownZero.setBit(i);
5412 continue;
5417 /// Decode a target shuffle mask and inputs and see if any values are
5418 /// known to be undef or zero from their inputs.
5419 /// Returns true if the target shuffle mask was decoded.
5420 /// FIXME: Merge this with computeZeroableShuffleElements?
5421 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5422 SmallVectorImpl<SDValue> &Ops,
5423 APInt &KnownUndef, APInt &KnownZero) {
5424 bool IsUnary;
5425 if (!isTargetShuffle(N.getOpcode()))
5426 return false;
5428 MVT VT = N.getSimpleValueType();
5429 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5430 return false;
5432 int Size = Mask.size();
5433 SDValue V1 = Ops[0];
5434 SDValue V2 = IsUnary ? V1 : Ops[1];
5435 KnownUndef = KnownZero = APInt::getZero(Size);
5437 V1 = peekThroughBitcasts(V1);
5438 V2 = peekThroughBitcasts(V2);
5440 assert((VT.getSizeInBits() % Size) == 0 &&
5441 "Illegal split of shuffle value type");
5442 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5444 // Extract known constant input data.
5445 APInt UndefSrcElts[2];
5446 SmallVector<APInt, 32> SrcEltBits[2];
5447 bool IsSrcConstant[2] = {
5448 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5449 SrcEltBits[0], true, false),
5450 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5451 SrcEltBits[1], true, false)};
5453 for (int i = 0; i < Size; ++i) {
5454 int M = Mask[i];
5456 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5457 if (M < 0) {
5458 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5459 if (SM_SentinelUndef == M)
5460 KnownUndef.setBit(i);
5461 if (SM_SentinelZero == M)
5462 KnownZero.setBit(i);
5463 continue;
5466 // Determine shuffle input and normalize the mask.
5467 unsigned SrcIdx = M / Size;
5468 SDValue V = M < Size ? V1 : V2;
5469 M %= Size;
5471 // We are referencing an UNDEF input.
5472 if (V.isUndef()) {
5473 KnownUndef.setBit(i);
5474 continue;
5477 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5478 // TODO: We currently only set UNDEF for integer types - floats use the same
5479 // registers as vectors and many of the scalar folded loads rely on the
5480 // SCALAR_TO_VECTOR pattern.
5481 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5482 (Size % V.getValueType().getVectorNumElements()) == 0) {
5483 int Scale = Size / V.getValueType().getVectorNumElements();
5484 int Idx = M / Scale;
5485 if (Idx != 0 && !VT.isFloatingPoint())
5486 KnownUndef.setBit(i);
5487 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5488 KnownZero.setBit(i);
5489 continue;
5492 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
5493 // base vectors.
5494 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
5495 SDValue Vec = V.getOperand(0);
5496 int NumVecElts = Vec.getValueType().getVectorNumElements();
5497 if (Vec.isUndef() && Size == NumVecElts) {
5498 int Idx = V.getConstantOperandVal(2);
5499 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
5500 if (M < Idx || (Idx + NumSubElts) <= M)
5501 KnownUndef.setBit(i);
5503 continue;
5506 // Attempt to extract from the source's constant bits.
5507 if (IsSrcConstant[SrcIdx]) {
5508 if (UndefSrcElts[SrcIdx][M])
5509 KnownUndef.setBit(i);
5510 else if (SrcEltBits[SrcIdx][M] == 0)
5511 KnownZero.setBit(i);
5515 assert(VT.getVectorNumElements() == (unsigned)Size &&
5516 "Different mask size from vector size!");
5517 return true;
5520 // Replace target shuffle mask elements with known undef/zero sentinels.
5521 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
5522 const APInt &KnownUndef,
5523 const APInt &KnownZero,
5524 bool ResolveKnownZeros = true) {
5525 unsigned NumElts = Mask.size();
5526 assert(KnownUndef.getBitWidth() == NumElts &&
5527 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
5529 for (unsigned i = 0; i != NumElts; ++i) {
5530 if (KnownUndef[i])
5531 Mask[i] = SM_SentinelUndef;
5532 else if (ResolveKnownZeros && KnownZero[i])
5533 Mask[i] = SM_SentinelZero;
5537 // Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
5538 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
5539 APInt &KnownUndef,
5540 APInt &KnownZero) {
5541 unsigned NumElts = Mask.size();
5542 KnownUndef = KnownZero = APInt::getZero(NumElts);
5544 for (unsigned i = 0; i != NumElts; ++i) {
5545 int M = Mask[i];
5546 if (SM_SentinelUndef == M)
5547 KnownUndef.setBit(i);
5548 if (SM_SentinelZero == M)
5549 KnownZero.setBit(i);
5553 // Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
5554 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
5555 SDValue Cond, bool IsBLENDV = false) {
5556 EVT CondVT = Cond.getValueType();
5557 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
5558 unsigned NumElts = CondVT.getVectorNumElements();
5560 APInt UndefElts;
5561 SmallVector<APInt, 32> EltBits;
5562 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
5563 true, false))
5564 return false;
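// Start from an identity selection of the first vector operand; lanes where
// the condition picks the other source are redirected by adding NumElts below.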
5566 Mask.resize(NumElts, SM_SentinelUndef);
5568 for (int i = 0; i != (int)NumElts; ++i) {
5569 Mask[i] = i;
5570 // Arbitrarily choose from the 2nd operand if the select condition element
5571 // is undef.
5572 // TODO: Can we do better by matching patterns such as even/odd?
5573 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
5574 (IsBLENDV && EltBits[i].isNonNegative()))
5575 Mask[i] += NumElts;
5578 return true;
5581 // Forward declaration (for getFauxShuffleMask recursive check).
5582 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
5583 SmallVectorImpl<SDValue> &Inputs,
5584 SmallVectorImpl<int> &Mask,
5585 const SelectionDAG &DAG, unsigned Depth,
5586 bool ResolveKnownElts);
5588 // Attempt to decode ops that could be represented as a shuffle mask.
5589 // The decoded shuffle mask may contain a different number of elements than
5590 // the destination value type.
5591 // TODO: Merge into getTargetShuffleInputs()
5592 static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
5593 SmallVectorImpl<int> &Mask,
5594 SmallVectorImpl<SDValue> &Ops,
5595 const SelectionDAG &DAG, unsigned Depth,
5596 bool ResolveKnownElts) {
5597 Mask.clear();
5598 Ops.clear();
5600 MVT VT = N.getSimpleValueType();
5601 unsigned NumElts = VT.getVectorNumElements();
5602 unsigned NumSizeInBits = VT.getSizeInBits();
5603 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5604 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
5605 return false;
5606 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
5607 unsigned NumSizeInBytes = NumSizeInBits / 8;
5608 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5610 unsigned Opcode = N.getOpcode();
5611 switch (Opcode) {
5612 case ISD::VECTOR_SHUFFLE: {
5613 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
5614 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
5615 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
5616 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
5617 Ops.push_back(N.getOperand(0));
5618 Ops.push_back(N.getOperand(1));
5619 return true;
5621 return false;
5623 case ISD::AND:
5624 case X86ISD::ANDNP: {
5625 // Attempt to decode as a per-byte mask.
5626 APInt UndefElts;
5627 SmallVector<APInt, 32> EltBits;
5628 SDValue N0 = N.getOperand(0);
5629 SDValue N1 = N.getOperand(1);
5630 bool IsAndN = (X86ISD::ANDNP == Opcode);
5631 uint64_t ZeroMask = IsAndN ? 255 : 0;
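// ANDNP inverts its first operand, so for ANDNP a constant byte of all-ones
// clears the result byte, while for AND a constant byte of zero does.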
5632 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5633 return false;
5634 // We can't assume an undef src element gives an undef dst - the other src
5635 // might be zero.
5636 if (!UndefElts.isZero())
5637 return false;
5638 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5639 const APInt &ByteBits = EltBits[i];
5640 if (ByteBits != 0 && ByteBits != 255)
5641 return false;
5642 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5644 Ops.push_back(IsAndN ? N1 : N0);
5645 return true;
5647 case ISD::OR: {
5648 // Handle the OR(SHUFFLE,SHUFFLE) case where, for each lane, one source is
5649 // zero and the other provides a valid shuffle index.
5650 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
5651 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
5652 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
5653 return false;
5655 SmallVector<int, 64> SrcMask0, SrcMask1;
5656 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
5657 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
5658 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
5659 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
5660 Depth + 1, true) ||
5661 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
5662 Depth + 1, true))
5663 return false;
5665 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
5666 SmallVector<int, 64> Mask0, Mask1;
5667 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
5668 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
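// Scale both decoded masks to a common width so the two sources can be
// compared lane by lane.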
5669 for (int i = 0; i != (int)MaskSize; ++i) {
5670 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
5671 // loops converting between OR and BLEND shuffles due to
5672 // canWidenShuffleElements merging away undef elements, meaning we
5673 // fail to recognise the OR as the undef element isn't known zero.
5674 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
5675 Mask.push_back(SM_SentinelZero);
5676 else if (Mask1[i] == SM_SentinelZero)
5677 Mask.push_back(i);
5678 else if (Mask0[i] == SM_SentinelZero)
5679 Mask.push_back(i + MaskSize);
5680 else
5681 return false;
5683 Ops.push_back(N0);
5684 Ops.push_back(N1);
5685 return true;
5687 case ISD::INSERT_SUBVECTOR: {
5688 SDValue Src = N.getOperand(0);
5689 SDValue Sub = N.getOperand(1);
5690 EVT SubVT = Sub.getValueType();
5691 unsigned NumSubElts = SubVT.getVectorNumElements();
5692 if (!N->isOnlyUserOf(Sub.getNode()))
5693 return false;
5694 SDValue SubBC = peekThroughBitcasts(Sub);
5695 uint64_t InsertIdx = N.getConstantOperandVal(2);
5696 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
5697 if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5698 SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
5699 uint64_t ExtractIdx = SubBC.getConstantOperandVal(1);
5700 SDValue SubBCSrc = SubBC.getOperand(0);
5701 unsigned NumSubSrcBCElts = SubBCSrc.getValueType().getVectorNumElements();
5702 unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts);
5703 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 &&
5704 "Subvector valuetype mismatch");
5705 InsertIdx *= (MaxElts / NumElts);
5706 ExtractIdx *= (MaxElts / NumSubSrcBCElts);
5707 NumSubElts *= (MaxElts / NumElts);
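// Rescale the insert/extract positions and the subvector length to the widest
// element count shared by both vectors.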
5708 bool SrcIsUndef = Src.isUndef();
5709 for (int i = 0; i != (int)MaxElts; ++i)
5710 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
5711 for (int i = 0; i != (int)NumSubElts; ++i)
5712 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
5713 if (!SrcIsUndef)
5714 Ops.push_back(Src);
5715 Ops.push_back(SubBCSrc);
5716 return true;
5718 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
5719 SmallVector<int, 64> SubMask;
5720 SmallVector<SDValue, 2> SubInputs;
5721 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
5722 EVT SubSrcVT = SubSrc.getValueType();
5723 if (!SubSrcVT.isVector())
5724 return false;
5726 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
5727 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
5728 Depth + 1, ResolveKnownElts))
5729 return false;
5731 // Subvector shuffle inputs must not be larger than the subvector.
5732 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
5733 return SubVT.getFixedSizeInBits() <
5734 SubInput.getValueSizeInBits().getFixedValue();
5736 return false;
5738 if (SubMask.size() != NumSubElts) {
5739 assert(((SubMask.size() % NumSubElts) == 0 ||
5740 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
5741 if ((NumSubElts % SubMask.size()) == 0) {
5742 int Scale = NumSubElts / SubMask.size();
5743 SmallVector<int, 64> ScaledSubMask;
5744 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
5745 SubMask = ScaledSubMask;
5746 } else {
5747 int Scale = SubMask.size() / NumSubElts;
5748 NumSubElts = SubMask.size();
5749 NumElts *= Scale;
5750 InsertIdx *= Scale;
5753 Ops.push_back(Src);
5754 Ops.append(SubInputs.begin(), SubInputs.end());
5755 if (ISD::isBuildVectorAllZeros(Src.getNode()))
5756 Mask.append(NumElts, SM_SentinelZero);
5757 else
5758 for (int i = 0; i != (int)NumElts; ++i)
5759 Mask.push_back(i);
5760 for (int i = 0; i != (int)NumSubElts; ++i) {
5761 int M = SubMask[i];
5762 if (0 <= M) {
5763 int InputIdx = M / NumSubElts;
5764 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
5766 Mask[i + InsertIdx] = M;
5768 return true;
5770 case X86ISD::PINSRB:
5771 case X86ISD::PINSRW:
5772 case ISD::SCALAR_TO_VECTOR:
5773 case ISD::INSERT_VECTOR_ELT: {
5774 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
5775 // vector, for matching src/dst vector types.
5776 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
5778 unsigned DstIdx = 0;
5779 if (Opcode != ISD::SCALAR_TO_VECTOR) {
5780 // Check we have an in-range constant insertion index.
5781 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
5782 N.getConstantOperandAPInt(2).uge(NumElts))
5783 return false;
5784 DstIdx = N.getConstantOperandVal(2);
5786 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
5787 if (X86::isZeroNode(Scl)) {
5788 Ops.push_back(N.getOperand(0));
5789 for (unsigned i = 0; i != NumElts; ++i)
5790 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
5791 return true;
5795 // Peek through trunc/aext/zext.
5796 // TODO: aext shouldn't require SM_SentinelZero padding.
5797 // TODO: handle shift of scalars.
5798 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
5799 while (Scl.getOpcode() == ISD::TRUNCATE ||
5800 Scl.getOpcode() == ISD::ANY_EXTEND ||
5801 Scl.getOpcode() == ISD::ZERO_EXTEND) {
5802 Scl = Scl.getOperand(0);
5803 MinBitsPerElt =
5804 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
5806 if ((MinBitsPerElt % 8) != 0)
5807 return false;
5809 // Attempt to find the source vector the scalar was extracted from.
5810 SDValue SrcExtract;
5811 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
5812 Scl.getOpcode() == X86ISD::PEXTRW ||
5813 Scl.getOpcode() == X86ISD::PEXTRB) &&
5814 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
5815 SrcExtract = Scl;
5817 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
5818 return false;
5820 SDValue SrcVec = SrcExtract.getOperand(0);
5821 EVT SrcVT = SrcVec.getValueType();
5822 if (!SrcVT.getScalarType().isByteSized())
5823 return false;
5824 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
5825 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
5826 unsigned DstByte = DstIdx * NumBytesPerElt;
5827 MinBitsPerElt =
5828 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
5830 // Create 'identity' byte level shuffle mask and then add inserted bytes.
5831 if (Opcode == ISD::SCALAR_TO_VECTOR) {
5832 Ops.push_back(SrcVec);
5833 Mask.append(NumSizeInBytes, SM_SentinelUndef);
5834 } else {
5835 Ops.push_back(SrcVec);
5836 Ops.push_back(N.getOperand(0));
5837 for (int i = 0; i != (int)NumSizeInBytes; ++i)
5838 Mask.push_back(NumSizeInBytes + i);
5841 unsigned MinBytesPerElts = MinBitsPerElt / 8;
5842 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
5843 for (unsigned i = 0; i != MinBytesPerElts; ++i)
5844 Mask[DstByte + i] = SrcByte + i;
5845 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
5846 Mask[DstByte + i] = SM_SentinelZero;
5847 return true;
5849 case X86ISD::PACKSS:
5850 case X86ISD::PACKUS: {
5851 SDValue N0 = N.getOperand(0);
5852 SDValue N1 = N.getOperand(1);
5853 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
5854 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
5855 "Unexpected input value type");
5857 APInt EltsLHS, EltsRHS;
5858 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
5860 // If we know input saturation won't happen (or we don't care about
5861 // particular lanes), we can treat this as a truncation shuffle.
5862 bool Offset0 = false, Offset1 = false;
5863 if (Opcode == X86ISD::PACKSS) {
5864 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
5865 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
5866 (!(N1.isUndef() || EltsRHS.isZero()) &&
5867 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
5868 return false;
5869 // We can't easily fold ASHR into a shuffle, but if it was feeding a
5870 // PACKSS then it was likely being used for sign-extension for a
5871 // truncation, so just peek through and adjust the mask accordingly.
5872 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
5873 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
5874 Offset0 = true;
5875 N0 = N0.getOperand(0);
5877 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
5878 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
5879 Offset1 = true;
5880 N1 = N1.getOperand(0);
5882 } else {
5883 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
5884 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
5885 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
5886 (!(N1.isUndef() || EltsRHS.isZero()) &&
5887 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
5888 return false;
5891 bool IsUnary = (N0 == N1);
5893 Ops.push_back(N0);
5894 if (!IsUnary)
5895 Ops.push_back(N1);
5897 createPackShuffleMask(VT, Mask, IsUnary);
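// If we peeked through a VSRAI above, the pack now reads the upper half of
// each wide source element, so adjust those lanes of the decoded truncation
// mask to read the odd sub-elements instead.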
5899 if (Offset0 || Offset1) {
5900 for (int &M : Mask)
5901 if ((Offset0 && isInRange(M, 0, NumElts)) ||
5902 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
5903 ++M;
5905 return true;
5907 case ISD::VSELECT:
5908 case X86ISD::BLENDV: {
5909 SDValue Cond = N.getOperand(0);
5910 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
5911 Ops.push_back(N.getOperand(1));
5912 Ops.push_back(N.getOperand(2));
5913 return true;
5915 return false;
5917 case X86ISD::VTRUNC: {
5918 SDValue Src = N.getOperand(0);
5919 EVT SrcVT = Src.getValueType();
5920 // Truncated source must be a simple vector.
5921 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
5922 (SrcVT.getScalarSizeInBits() % 8) != 0)
5923 return false;
5924 unsigned NumSrcElts = SrcVT.getVectorNumElements();
5925 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
5926 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
5927 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
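// The truncation keeps the low bits of each source element (every Scale'th
// sub-element); the remaining destination elements are treated as zero.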
5928 for (unsigned i = 0; i != NumSrcElts; ++i)
5929 Mask.push_back(i * Scale);
5930 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
5931 Ops.push_back(Src);
5932 return true;
5934 case X86ISD::VSHLI:
5935 case X86ISD::VSRLI: {
5936 uint64_t ShiftVal = N.getConstantOperandVal(1);
5937 // Out of range bit shifts are guaranteed to be zero.
5938 if (NumBitsPerElt <= ShiftVal) {
5939 Mask.append(NumElts, SM_SentinelZero);
5940 return true;
5943 // We can only decode 'whole byte' bit shifts as shuffles.
5944 if ((ShiftVal % 8) != 0)
5945 break;
5947 uint64_t ByteShift = ShiftVal / 8;
5948 Ops.push_back(N.getOperand(0));
5950 // Clear mask to all zeros and insert the shifted byte indices.
5951 Mask.append(NumSizeInBytes, SM_SentinelZero);
5953 if (X86ISD::VSHLI == Opcode) {
5954 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
5955 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5956 Mask[i + j] = i + j - ByteShift;
5957 } else {
5958 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
5959 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5960 Mask[i + j - ByteShift] = i + j;
5962 return true;
5964 case X86ISD::VROTLI:
5965 case X86ISD::VROTRI: {
5966 // We can only decode 'whole byte' bit rotates as shuffles.
5967 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
5968 if ((RotateVal % 8) != 0)
5969 return false;
5970 Ops.push_back(N.getOperand(0));
5971 int Offset = RotateVal / 8;
5972 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
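// Express the rotate as a per-element byte rotation to the right: a left
// rotate by N bytes equals a right rotate by (EltBytes - N) bytes.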
5973 for (int i = 0; i != (int)NumElts; ++i) {
5974 int BaseIdx = i * NumBytesPerElt;
5975 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
5976 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
5979 return true;
5981 case X86ISD::VBROADCAST: {
5982 SDValue Src = N.getOperand(0);
5983 if (!Src.getSimpleValueType().isVector()) {
5984 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5985 !isNullConstant(Src.getOperand(1)) ||
5986 Src.getOperand(0).getValueType().getScalarType() !=
5987 VT.getScalarType())
5988 return false;
5989 Src = Src.getOperand(0);
5991 Ops.push_back(Src);
5992 Mask.append(NumElts, 0);
5993 return true;
5995 case ISD::SIGN_EXTEND_VECTOR_INREG: {
5996 SDValue Src = N.getOperand(0);
5997 EVT SrcVT = Src.getValueType();
5998 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6000 // Extended source must be a simple vector.
6001 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6002 (NumBitsPerSrcElt % 8) != 0)
6003 return false;
6005 // We can only handle all-signbits extensions.
6006 APInt DemandedSrcElts =
6007 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6008 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6009 return false;
6011 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6012 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
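// Every source element is known to be all sign bits, so repeating each source
// index Scale times reproduces the sign-extension.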
6013 for (unsigned I = 0; I != NumElts; ++I)
6014 Mask.append(Scale, I);
6015 Ops.push_back(Src);
6016 return true;
6018 case ISD::ZERO_EXTEND:
6019 case ISD::ANY_EXTEND:
6020 case ISD::ZERO_EXTEND_VECTOR_INREG:
6021 case ISD::ANY_EXTEND_VECTOR_INREG: {
6022 SDValue Src = N.getOperand(0);
6023 EVT SrcVT = Src.getValueType();
6025 // Extended source must be a simple vector.
6026 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6027 (SrcVT.getScalarSizeInBits() % 8) != 0)
6028 return false;
6030 bool IsAnyExtend =
6031 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6032 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6033 IsAnyExtend, Mask);
6034 Ops.push_back(Src);
6035 return true;
6039 return false;
6042 /// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6043 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6044 SmallVectorImpl<int> &Mask) {
6045 int MaskWidth = Mask.size();
6046 SmallVector<SDValue, 16> UsedInputs;
6047 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6048 int lo = UsedInputs.size() * MaskWidth;
6049 int hi = lo + MaskWidth;
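// Input i currently occupies mask indices [lo, hi); later inputs occupy
// higher ranges.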
6051 // Strip UNDEF input usage.
6052 if (Inputs[i].isUndef())
6053 for (int &M : Mask)
6054 if ((lo <= M) && (M < hi))
6055 M = SM_SentinelUndef;
6057 // Check for unused inputs.
6058 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6059 for (int &M : Mask)
6060 if (lo <= M)
6061 M -= MaskWidth;
6062 continue;
6065 // Check for repeated inputs.
6066 bool IsRepeat = false;
6067 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6068 if (UsedInputs[j] != Inputs[i])
6069 continue;
6070 for (int &M : Mask)
6071 if (lo <= M)
6072 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6073 IsRepeat = true;
6074 break;
6076 if (IsRepeat)
6077 continue;
6079 UsedInputs.push_back(Inputs[i]);
6081 Inputs = UsedInputs;
6084 /// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6085 /// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6086 /// Returns true if the target shuffle mask was decoded.
6087 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6088 SmallVectorImpl<SDValue> &Inputs,
6089 SmallVectorImpl<int> &Mask,
6090 APInt &KnownUndef, APInt &KnownZero,
6091 const SelectionDAG &DAG, unsigned Depth,
6092 bool ResolveKnownElts) {
6093 if (Depth >= SelectionDAG::MaxRecursionDepth)
6094 return false; // Limit search depth.
6096 EVT VT = Op.getValueType();
6097 if (!VT.isSimple() || !VT.isVector())
6098 return false;
6100 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6101 if (ResolveKnownElts)
6102 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6103 return true;
6105 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6106 ResolveKnownElts)) {
6107 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6108 return true;
6110 return false;
6113 static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6114 SmallVectorImpl<SDValue> &Inputs,
6115 SmallVectorImpl<int> &Mask,
6116 const SelectionDAG &DAG, unsigned Depth,
6117 bool ResolveKnownElts) {
6118 APInt KnownUndef, KnownZero;
6119 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6120 KnownZero, DAG, Depth, ResolveKnownElts);
6123 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6124 SmallVectorImpl<int> &Mask,
6125 const SelectionDAG &DAG, unsigned Depth = 0,
6126 bool ResolveKnownElts = true) {
6127 EVT VT = Op.getValueType();
6128 if (!VT.isSimple() || !VT.isVector())
6129 return false;
6131 unsigned NumElts = Op.getValueType().getVectorNumElements();
6132 APInt DemandedElts = APInt::getAllOnes(NumElts);
6133 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6134 ResolveKnownElts);
6137 // Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6138 static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6139 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6140 SelectionDAG &DAG) {
6141 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6142 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6143 "Unknown broadcast load type");
6145 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6146 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6147 return SDValue();
6149 SDValue Ptr =
6150 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
6151 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6152 SDValue Ops[] = {Mem->getChain(), Ptr};
6153 SDValue BcstLd = DAG.getMemIntrinsicNode(
6154 Opcode, DL, Tys, Ops, MemVT,
6155 DAG.getMachineFunction().getMachineMemOperand(
6156 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6157 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6158 return BcstLd;
6161 /// Returns the scalar element that will make up the i'th
6162 /// element of the result of the vector shuffle.
6163 static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6164 SelectionDAG &DAG, unsigned Depth) {
6165 if (Depth >= SelectionDAG::MaxRecursionDepth)
6166 return SDValue(); // Limit search depth.
6168 EVT VT = Op.getValueType();
6169 unsigned Opcode = Op.getOpcode();
6170 unsigned NumElems = VT.getVectorNumElements();
6172 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6173 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6174 int Elt = SV->getMaskElt(Index);
6176 if (Elt < 0)
6177 return DAG.getUNDEF(VT.getVectorElementType());
6179 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6180 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6183 // Recurse into target specific vector shuffles to find scalars.
6184 if (isTargetShuffle(Opcode)) {
6185 MVT ShufVT = VT.getSimpleVT();
6186 MVT ShufSVT = ShufVT.getVectorElementType();
6187 int NumElems = (int)ShufVT.getVectorNumElements();
6188 SmallVector<int, 16> ShuffleMask;
6189 SmallVector<SDValue, 16> ShuffleOps;
6190 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
6191 ShuffleMask))
6192 return SDValue();
6194 int Elt = ShuffleMask[Index];
6195 if (Elt == SM_SentinelZero)
6196 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6197 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6198 if (Elt == SM_SentinelUndef)
6199 return DAG.getUNDEF(ShufSVT);
6201 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6202 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6203 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6206 // Recurse into insert_subvector base/sub vector to find scalars.
6207 if (Opcode == ISD::INSERT_SUBVECTOR) {
6208 SDValue Vec = Op.getOperand(0);
6209 SDValue Sub = Op.getOperand(1);
6210 uint64_t SubIdx = Op.getConstantOperandVal(2);
6211 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6213 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6214 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6215 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6218 // Recurse into concat_vectors sub vector to find scalars.
6219 if (Opcode == ISD::CONCAT_VECTORS) {
6220 EVT SubVT = Op.getOperand(0).getValueType();
6221 unsigned NumSubElts = SubVT.getVectorNumElements();
6222 uint64_t SubIdx = Index / NumSubElts;
6223 uint64_t SubElt = Index % NumSubElts;
6224 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6227 // Recurse into extract_subvector src vector to find scalars.
6228 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6229 SDValue Src = Op.getOperand(0);
6230 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6231 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6234 // We only peek through bitcasts of the same vector width.
6235 if (Opcode == ISD::BITCAST) {
6236 SDValue Src = Op.getOperand(0);
6237 EVT SrcVT = Src.getValueType();
6238 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6239 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6240 return SDValue();
6243 // Actual nodes that may contain scalar elements
6245 // For insert_vector_elt - either return the index matching scalar or recurse
6246 // into the base vector.
6247 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6248 isa<ConstantSDNode>(Op.getOperand(2))) {
6249 if (Op.getConstantOperandAPInt(2) == Index)
6250 return Op.getOperand(1);
6251 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6254 if (Opcode == ISD::SCALAR_TO_VECTOR)
6255 return (Index == 0) ? Op.getOperand(0)
6256 : DAG.getUNDEF(VT.getVectorElementType());
6258 if (Opcode == ISD::BUILD_VECTOR)
6259 return Op.getOperand(Index);
6261 return SDValue();
6264 // Use PINSRB/PINSRW/PINSRD to create a build vector.
6265 static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
6266 unsigned NumNonZero, unsigned NumZero,
6267 SelectionDAG &DAG,
6268 const X86Subtarget &Subtarget) {
6269 MVT VT = Op.getSimpleValueType();
6270 unsigned NumElts = VT.getVectorNumElements();
6271 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6272 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6273 "Illegal vector insertion");
6275 SDLoc dl(Op);
6276 SDValue V;
6277 bool First = true;
6279 for (unsigned i = 0; i < NumElts; ++i) {
6280 bool IsNonZero = NonZeroMask[i];
6281 if (!IsNonZero)
6282 continue;
6284 // If the build vector contains zeros or our first insertion is not the
6285 // first index, then insert into a zero vector to break any register
6286 // dependency; otherwise use SCALAR_TO_VECTOR.
6287 if (First) {
6288 First = false;
6289 if (NumZero || 0 != i)
6290 V = getZeroVector(VT, Subtarget, DAG, dl);
6291 else {
6292 assert(0 == i && "Expected insertion into zero-index");
6293 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6294 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6295 V = DAG.getBitcast(VT, V);
6296 continue;
6299 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
6300 DAG.getIntPtrConstant(i, dl));
6303 return V;
6306 /// Custom lower build_vector of v16i8.
6307 static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
6308 unsigned NumNonZero, unsigned NumZero,
6309 SelectionDAG &DAG,
6310 const X86Subtarget &Subtarget) {
6311 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6312 return SDValue();
6314 // SSE4.1 - use PINSRB to insert each byte directly.
6315 if (Subtarget.hasSSE41())
6316 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
6317 Subtarget);
6319 SDLoc dl(Op);
6320 SDValue V;
6322 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6323 // If both 16-bit halves of the low 32 bits are non-zero, then convert to MOVD.
6324 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6325 !NonZeroMask.extractBits(2, 2).isZero()) {
6326 for (unsigned I = 0; I != 4; ++I) {
6327 if (!NonZeroMask[I])
6328 continue;
6329 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), dl, MVT::i32);
6330 if (I != 0)
6331 Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt,
6332 DAG.getConstant(I * 8, dl, MVT::i8));
6333 V = V ? DAG.getNode(ISD::OR, dl, MVT::i32, V, Elt) : Elt;
6335 assert(V && "Failed to fold v16i8 vector to zero");
6336 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6337 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6338 V = DAG.getBitcast(MVT::v8i16, V);
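// The merged low 32 bits are moved into lane 0 with the remaining lanes
// zeroed (VZEXT_MOVL), then reinterpreted as v8i16 so the remaining byte
// pairs can be inserted with PINSRW.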
6340 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6341 bool ThisIsNonZero = NonZeroMask[i];
6342 bool NextIsNonZero = NonZeroMask[i + 1];
6343 if (!ThisIsNonZero && !NextIsNonZero)
6344 continue;
6346 SDValue Elt;
6347 if (ThisIsNonZero) {
6348 if (NumZero || NextIsNonZero)
6349 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6350 else
6351 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6354 if (NextIsNonZero) {
6355 SDValue NextElt = Op.getOperand(i + 1);
6356 if (i == 0 && NumZero)
6357 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
6358 else
6359 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
6360 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
6361 DAG.getConstant(8, dl, MVT::i8));
6362 if (ThisIsNonZero)
6363 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
6364 else
6365 Elt = NextElt;
6368 // If our first insertion is not the first index or zeros are needed, then
6369 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6370 // elements undefined).
6371 if (!V) {
6372 if (i != 0 || NumZero)
6373 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6374 else {
6375 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
6376 V = DAG.getBitcast(MVT::v8i16, V);
6377 continue;
6380 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
6381 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
6382 DAG.getIntPtrConstant(i / 2, dl));
6385 return DAG.getBitcast(MVT::v16i8, V);
6388 /// Custom lower build_vector of v8i16.
6389 static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
6390 unsigned NumNonZero, unsigned NumZero,
6391 SelectionDAG &DAG,
6392 const X86Subtarget &Subtarget) {
6393 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6394 return SDValue();
6396 // Use PINSRW to insert each word directly.
6397 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
6398 Subtarget);
6401 /// Custom lower build_vector of v4i32 or v4f32.
6402 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6403 const X86Subtarget &Subtarget) {
6404 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6405 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6406 // Because we're creating a less complicated build vector here, we may enable
6407 // further folding of the MOVDDUP via shuffle transforms.
6408 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6409 Op.getOperand(0) == Op.getOperand(2) &&
6410 Op.getOperand(1) == Op.getOperand(3) &&
6411 Op.getOperand(0) != Op.getOperand(1)) {
6412 SDLoc DL(Op);
6413 MVT VT = Op.getSimpleValueType();
6414 MVT EltVT = VT.getVectorElementType();
6415 // Create a new build vector with the first 2 elements followed by undef
6416 // padding, bitcast to v2f64, duplicate, and bitcast back.
6417 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
6418 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
6419 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
6420 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
6421 return DAG.getBitcast(VT, Dup);
6424 // Find all zeroable elements.
6425 std::bitset<4> Zeroable, Undefs;
6426 for (int i = 0; i < 4; ++i) {
6427 SDValue Elt = Op.getOperand(i);
6428 Undefs[i] = Elt.isUndef();
6429 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6431 assert(Zeroable.size() - Zeroable.count() > 1 &&
6432 "We expect at least two non-zero elements!");
6434 // We only know how to deal with build_vector nodes where elements are either
6435 // zeroable or extract_vector_elt with constant index.
6436 SDValue FirstNonZero;
6437 unsigned FirstNonZeroIdx;
6438 for (unsigned i = 0; i < 4; ++i) {
6439 if (Zeroable[i])
6440 continue;
6441 SDValue Elt = Op.getOperand(i);
6442 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6443 !isa<ConstantSDNode>(Elt.getOperand(1)))
6444 return SDValue();
6445 // Make sure that this node is extracting from a 128-bit vector.
6446 MVT VT = Elt.getOperand(0).getSimpleValueType();
6447 if (!VT.is128BitVector())
6448 return SDValue();
6449 if (!FirstNonZero.getNode()) {
6450 FirstNonZero = Elt;
6451 FirstNonZeroIdx = i;
6455 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6456 SDValue V1 = FirstNonZero.getOperand(0);
6457 MVT VT = V1.getSimpleValueType();
6459 // See if this build_vector can be lowered as a blend with zero.
6460 SDValue Elt;
6461 unsigned EltMaskIdx, EltIdx;
6462 int Mask[4];
6463 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6464 if (Zeroable[EltIdx]) {
6465 // The zero vector will be on the right hand side.
6466 Mask[EltIdx] = EltIdx + 4;
6467 continue;
6470 Elt = Op->getOperand(EltIdx);
6471 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6472 EltMaskIdx = Elt.getConstantOperandVal(1);
6473 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6474 break;
6475 Mask[EltIdx] = EltIdx;
6478 if (EltIdx == 4) {
6479 // Let the shuffle legalizer deal with blend operations.
6480 SDValue VZeroOrUndef = (Zeroable == Undefs)
6481 ? DAG.getUNDEF(VT)
6482 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6483 if (V1.getSimpleValueType() != VT)
6484 V1 = DAG.getBitcast(VT, V1);
6485 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
6488 // See if we can lower this build_vector to an INSERTPS.
6489 if (!Subtarget.hasSSE41())
6490 return SDValue();
6492 SDValue V2 = Elt.getOperand(0);
6493 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6494 V1 = SDValue();
6496 bool CanFold = true;
6497 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6498 if (Zeroable[i])
6499 continue;
6501 SDValue Current = Op->getOperand(i);
6502 SDValue SrcVector = Current->getOperand(0);
6503 if (!V1.getNode())
6504 V1 = SrcVector;
6505 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
6508 if (!CanFold)
6509 return SDValue();
6511 assert(V1.getNode() && "Expected at least two non-zero elements!");
6512 if (V1.getSimpleValueType() != MVT::v4f32)
6513 V1 = DAG.getBitcast(MVT::v4f32, V1);
6514 if (V2.getSimpleValueType() != MVT::v4f32)
6515 V2 = DAG.getBitcast(MVT::v4f32, V2);
6517 // Ok, we can emit an INSERTPS instruction.
6518 unsigned ZMask = Zeroable.to_ulong();
6520 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
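// INSERTPS immediate: bits [7:6] select the source element, bits [5:4] the
// destination element, and bits [3:0] zero the corresponding result lanes.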
6521 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6522 SDLoc DL(Op);
6523 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6524 DAG.getIntPtrConstant(InsertPSMask, DL, true));
6525 return DAG.getBitcast(VT, Result);
6528 /// Return a vector logical shift node.
6529 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6530 SelectionDAG &DAG, const TargetLowering &TLI,
6531 const SDLoc &dl) {
6532 assert(VT.is128BitVector() && "Unknown type for VShift");
6533 MVT ShVT = MVT::v16i8;
6534 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6535 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6536 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6537 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
6538 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6541 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6542 SelectionDAG &DAG) {
6544 // Check if the scalar load can be widened into a vector load. And if
6545 // the address is "base + cst", see if the cst can be "absorbed" into
6546 // the shuffle mask.
6547 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6548 SDValue Ptr = LD->getBasePtr();
6549 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
6550 return SDValue();
6551 EVT PVT = LD->getValueType(0);
6552 if (PVT != MVT::i32 && PVT != MVT::f32)
6553 return SDValue();
6555 int FI = -1;
6556 int64_t Offset = 0;
6557 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6558 FI = FINode->getIndex();
6559 Offset = 0;
6560 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6561 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6562 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6563 Offset = Ptr.getConstantOperandVal(1);
6564 Ptr = Ptr.getOperand(0);
6565 } else {
6566 return SDValue();
6569 // FIXME: 256-bit vector instructions don't require strict alignment;
6570 // improve this code to support it better.
6571 Align RequiredAlign(VT.getSizeInBits() / 8);
6572 SDValue Chain = LD->getChain();
6573 // Make sure the stack object alignment is at least 16 or 32.
6574 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6575 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
6576 if (!InferredAlign || *InferredAlign < RequiredAlign) {
6577 if (MFI.isFixedObjectIndex(FI)) {
6578 // Can't change the alignment. FIXME: It's possible to compute
6579 // the exact stack offset and reference FI + adjust offset instead.
6580 // If someone *really* cares about this, that's the way to implement it.
6581 return SDValue();
6582 } else {
6583 MFI.setObjectAlignment(FI, RequiredAlign);
6587 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6588 // Ptr + (Offset & ~15).
6589 if (Offset < 0)
6590 return SDValue();
6591 if ((Offset % RequiredAlign.value()) & 3)
6592 return SDValue();
6593 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
6594 if (StartOffset) {
6595 SDLoc DL(Ptr);
6596 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6597 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6600 int EltNo = (Offset - StartOffset) >> 2;
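// The original scalar lives (Offset - StartOffset) bytes into the widened
// load; with 4-byte elements that is the element index we splat below.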
6601 unsigned NumElems = VT.getVectorNumElements();
6603 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6604 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6605 LD->getPointerInfo().getWithOffset(StartOffset));
6607 SmallVector<int, 8> Mask(NumElems, EltNo);
6609 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6612 return SDValue();
6615 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
6616 static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
6617 if (ISD::isNON_EXTLoad(Elt.getNode())) {
6618 auto *BaseLd = cast<LoadSDNode>(Elt);
6619 if (!BaseLd->isSimple())
6620 return false;
6621 Ld = BaseLd;
6622 ByteOffset = 0;
6623 return true;
6626 switch (Elt.getOpcode()) {
6627 case ISD::BITCAST:
6628 case ISD::TRUNCATE:
6629 case ISD::SCALAR_TO_VECTOR:
6630 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
6631 case ISD::SRL:
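// A logical right shift by a whole number of bytes just advances the load
// source offset.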
6632 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6633 uint64_t Amt = AmtC->getZExtValue();
6634 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
6635 ByteOffset += Amt / 8;
6636 return true;
6639 break;
6640 case ISD::EXTRACT_VECTOR_ELT:
6641 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6642 SDValue Src = Elt.getOperand(0);
6643 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
6644 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
6645 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
6646 findEltLoadSrc(Src, Ld, ByteOffset)) {
6647 uint64_t Idx = IdxC->getZExtValue();
6648 ByteOffset += Idx * (SrcSizeInBits / 8);
6649 return true;
6652 break;
6655 return false;
6658 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6659 /// elements can be replaced by a single large load which has the same value as
6660 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6662 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6663 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6664 const SDLoc &DL, SelectionDAG &DAG,
6665 const X86Subtarget &Subtarget,
6666 bool IsAfterLegalize) {
6667 if ((VT.getScalarSizeInBits() % 8) != 0)
6668 return SDValue();
6670 unsigned NumElems = Elts.size();
6672 int LastLoadedElt = -1;
6673 APInt LoadMask = APInt::getZero(NumElems);
6674 APInt ZeroMask = APInt::getZero(NumElems);
6675 APInt UndefMask = APInt::getZero(NumElems);
6677 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
6678 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
6680 // For each element in the initializer, see if we've found a load, zero or an
6681 // undef.
6682 for (unsigned i = 0; i < NumElems; ++i) {
6683 SDValue Elt = peekThroughBitcasts(Elts[i]);
6684 if (!Elt.getNode())
6685 return SDValue();
6686 if (Elt.isUndef()) {
6687 UndefMask.setBit(i);
6688 continue;
6690 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
6691 ZeroMask.setBit(i);
6692 continue;
6695 // Each loaded element must be the correct fractional portion of the
6696 // requested vector load.
6697 unsigned EltSizeInBits = Elt.getValueSizeInBits();
6698 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
6699 return SDValue();
6701 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
6702 return SDValue();
6703 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
6704 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
6705 return SDValue();
6707 LoadMask.setBit(i);
6708 LastLoadedElt = i;
6710 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
6711 NumElems &&
6712 "Incomplete element masks");
6714 // Handle Special Cases - all undef or undef/zero.
6715 if (UndefMask.popcount() == NumElems)
6716 return DAG.getUNDEF(VT);
6717 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
6718 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6719 : DAG.getConstantFP(0.0, DL, VT);
6721 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6722 int FirstLoadedElt = LoadMask.countr_zero();
6723 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6724 EVT EltBaseVT = EltBase.getValueType();
6725 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
6726 "Register/Memory size mismatch");
6727 LoadSDNode *LDBase = Loads[FirstLoadedElt];
6728 assert(LDBase && "Did not find base load for merging consecutive loads");
6729 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
6730 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
6731 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
6732 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
6733 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
6735 // TODO: Support offsetting the base load.
6736 if (ByteOffsets[FirstLoadedElt] != 0)
6737 return SDValue();
6739 // Check to see if the element's load is consecutive to the base load
6740 // or offset from a previous (already checked) load.
6741 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
6742 LoadSDNode *Ld = Loads[EltIdx];
6743 int64_t ByteOffset = ByteOffsets[EltIdx];
6744 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
6745 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
6746 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
6747 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
6749 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
6750 EltIdx - FirstLoadedElt);
6753 // Consecutive loads can contain UNDEFs but not ZERO elements.
6754 // Consecutive loads with UNDEF and ZERO elements require an
6755 // additional shuffle stage to clear the ZERO elements.
6756 bool IsConsecutiveLoad = true;
6757 bool IsConsecutiveLoadWithZeros = true;
6758 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6759 if (LoadMask[i]) {
6760 if (!CheckConsecutiveLoad(LDBase, i)) {
6761 IsConsecutiveLoad = false;
6762 IsConsecutiveLoadWithZeros = false;
6763 break;
6765 } else if (ZeroMask[i]) {
6766 IsConsecutiveLoad = false;
6770 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6771 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6772 assert(LDBase->isSimple() &&
6773 "Cannot merge volatile or atomic loads.");
6774 SDValue NewLd =
6775 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6776 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
6777 MMOFlags);
6778 for (auto *LD : Loads)
6779 if (LD)
6780 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6781 return NewLd;
6784 // Check if the base load is entirely dereferenceable.
6785 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
6786 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
6788 // LOAD - all consecutive load/undefs (must start/end with a load or be
6789 // entirely dereferenceable). If we have found an entire vector of loads and
6790 // undefs, then return a large load of the entire vector width starting at the
6791 // base pointer. If the vector contains zeros, then attempt to shuffle those
6792 // elements.
6793 if (FirstLoadedElt == 0 &&
6794 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
6795 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6796 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6797 return SDValue();
6799 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6800 // will lower to regular temporal loads and use the cache.
6801 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
6802 VT.is256BitVector() && !Subtarget.hasInt256())
6803 return SDValue();
6805 if (NumElems == 1)
6806 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
6808 if (!ZeroMask)
6809 return CreateLoad(VT, LDBase);
6811 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6812 // vector and a zero vector to clear out the zero elements.
6813 if (!IsAfterLegalize && VT.isVector()) {
6814 unsigned NumMaskElts = VT.getVectorNumElements();
6815 if ((NumMaskElts % NumElems) == 0) {
6816 unsigned Scale = NumMaskElts / NumElems;
6817 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
6818 for (unsigned i = 0; i < NumElems; ++i) {
6819 if (UndefMask[i])
6820 continue;
6821 int Offset = ZeroMask[i] ? NumMaskElts : 0;
6822 for (unsigned j = 0; j != Scale; ++j)
6823 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
6825 SDValue V = CreateLoad(VT, LDBase);
6826 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6827 : DAG.getConstantFP(0.0, DL, VT);
6828 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6833 // If the upper half of a ymm/zmm load is undef then just load the lower half.
6834 if (VT.is256BitVector() || VT.is512BitVector()) {
6835 unsigned HalfNumElems = NumElems / 2;
6836 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
6837 EVT HalfVT =
6838 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
6839 SDValue HalfLD =
6840 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
6841 DAG, Subtarget, IsAfterLegalize);
6842 if (HalfLD)
6843 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
6844 HalfLD, DAG.getIntPtrConstant(0, DL));
6848 // VZEXT_LOAD - consecutive 16/32/64-bit load/undefs followed by zeros/undefs.
6849 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6850 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
6851 LoadSizeInBits == 64) &&
6852 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6853 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
6854 : MVT::getIntegerVT(LoadSizeInBits);
6855 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
6856 // Allow v4f32 on SSE1 only targets.
6857 // FIXME: Add more isel patterns so we can just use VT directly.
6858 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
6859 VecVT = MVT::v4f32;
6860 if (TLI.isTypeLegal(VecVT)) {
6861 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6862 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6863 SDValue ResNode = DAG.getMemIntrinsicNode(
6864 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
6865 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
6866 for (auto *LD : Loads)
6867 if (LD)
6868 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
6869 return DAG.getBitcast(VT, ResNode);
6873 // BROADCAST - match the smallest possible repetition pattern, load that
6874 // scalar/subvector element and then broadcast to the entire vector.
6875 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
6876 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
6877 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
6878 unsigned RepeatSize = SubElems * BaseSizeInBits;
6879 unsigned ScalarSize = std::min(RepeatSize, 64u);
6880 if (!Subtarget.hasAVX2() && ScalarSize < 32)
6881 continue;
6883 // Don't attempt a 1:N subvector broadcast - it should be caught by
6884 // combineConcatVectorOps, else it will cause infinite loops.
6885 if (RepeatSize > ScalarSize && SubElems == 1)
6886 continue;
6888 bool Match = true;
6889 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
6890 for (unsigned i = 0; i != NumElems && Match; ++i) {
6891 if (!LoadMask[i])
6892 continue;
6893 SDValue Elt = peekThroughBitcasts(Elts[i]);
6894 if (RepeatedLoads[i % SubElems].isUndef())
6895 RepeatedLoads[i % SubElems] = Elt;
6896 else
6897 Match &= (RepeatedLoads[i % SubElems] == Elt);
6900 // We must have loads at both ends of the repetition.
6901 Match &= !RepeatedLoads.front().isUndef();
6902 Match &= !RepeatedLoads.back().isUndef();
6903 if (!Match)
6904 continue;
6906 EVT RepeatVT =
6907 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
6908 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
6909 : EVT::getFloatingPointVT(ScalarSize);
6910 if (RepeatSize > ScalarSize)
6911 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
6912 RepeatSize / ScalarSize);
6913 EVT BroadcastVT =
6914 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
6915 VT.getSizeInBits() / ScalarSize);
6916 if (TLI.isTypeLegal(BroadcastVT)) {
6917 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
6918 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
6919 SDValue Broadcast = RepeatLoad;
6920 if (RepeatSize > ScalarSize) {
6921 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
6922 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
6923 } else {
6924 if (!Subtarget.hasAVX2() &&
6925 !X86::mayFoldLoadIntoBroadcastFromMem(
6926 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
6927 Subtarget,
6928 /*AssumeSingleUse=*/true))
6929 return SDValue();
6930 Broadcast =
6931 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
6933 return DAG.getBitcast(VT, Broadcast);
6939 return SDValue();
6942 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
6943 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
6944 // are consecutive, non-overlapping, and in the right order.
6945 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
6946 SelectionDAG &DAG,
6947 const X86Subtarget &Subtarget,
6948 bool IsAfterLegalize) {
6949 SmallVector<SDValue, 64> Elts;
6950 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
6951 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
6952 Elts.push_back(Elt);
6953 continue;
6955 return SDValue();
6957 assert(Elts.size() == VT.getVectorNumElements());
6958 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
6959 IsAfterLegalize);
6962 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6963 unsigned SplatBitSize, LLVMContext &C) {
6964 unsigned ScalarSize = VT.getScalarSizeInBits();
6966 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
6967 if (VT.isFloatingPoint()) {
6968 if (ScalarSize == 16)
6969 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
6970 if (ScalarSize == 32)
6971 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6972 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6973 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6975 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6978 if (ScalarSize == SplatBitSize)
6979 return getConstantScalar(SplatValue);
6981 unsigned NumElm = SplatBitSize / ScalarSize;
6982 SmallVector<Constant *, 32> ConstantVec;
6983 for (unsigned I = 0; I != NumElm; ++I) {
6984 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
6985 ConstantVec.push_back(getConstantScalar(Val));
6987 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6990 static bool isFoldableUseOfShuffle(SDNode *N) {
6991 for (auto *U : N->uses()) {
6992 unsigned Opc = U->getOpcode();
6993 // VPERMV/VPERMV3 shuffles can never fold their index operands.
6994 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
6995 return false;
6996 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
6997 return false;
6998 if (isTargetShuffle(Opc))
6999 return true;
7000 if (Opc == ISD::BITCAST) // Ignore bitcasts
7001 return isFoldableUseOfShuffle(U);
7002 if (N->hasOneUse()) {
7003 // TODO: there may be some general way to know if an SDNode can
7004 // be folded. We currently only know whether an MI is foldable.
7005 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7006 return false;
7007 return true;
7010 return false;
7013 /// Attempt to use the vbroadcast instruction to generate a splat value
7014 /// from a splat BUILD_VECTOR which uses:
7015 /// a. A single scalar load, or a constant.
7016 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7018 /// The VBROADCAST node is returned when a pattern is found,
7019 /// or SDValue() otherwise.
7020 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7021 const X86Subtarget &Subtarget,
7022 SelectionDAG &DAG) {
7023 // VBROADCAST requires AVX.
7024 // TODO: Splats could be generated for non-AVX CPUs using SSE
7025 // instructions, but there's less potential gain for only 128-bit vectors.
7026 if (!Subtarget.hasAVX())
7027 return SDValue();
7029 MVT VT = BVOp->getSimpleValueType(0);
7030 unsigned NumElts = VT.getVectorNumElements();
7031 SDLoc dl(BVOp);
7033 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7034 "Unsupported vector type for broadcast.");
7036 // See if the build vector is a repeating sequence of scalars (inc. splat).
7037 SDValue Ld;
7038 BitVector UndefElements;
7039 SmallVector<SDValue, 16> Sequence;
7040 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7041 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7042 if (Sequence.size() == 1)
7043 Ld = Sequence[0];
7046 // Attempt to use VBROADCASTM
7047 // From this pattern:
7048 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7049 // b. t1 = (build_vector t0 t0)
7051 // Create (VBROADCASTM v2i1 X)
7052 if (!Sequence.empty() && Subtarget.hasCDI()) {
7053 // If not a splat, are the upper sequence values zeroable?
7054 unsigned SeqLen = Sequence.size();
7055 bool UpperZeroOrUndef =
7056 SeqLen == 1 ||
7057 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
7058 return !V || V.isUndef() || isNullConstant(V);
7060 SDValue Op0 = Sequence[0];
7061 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7062 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7063 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7064 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7065 ? Op0.getOperand(0)
7066 : Op0.getOperand(0).getOperand(0);
7067 MVT MaskVT = BOperand.getSimpleValueType();
7068 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7069 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7070 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7071 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7072 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7073 unsigned Scale = 512 / VT.getSizeInBits();
7074 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7076 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7077 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7078 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7079 return DAG.getBitcast(VT, Bcst);
7084 unsigned NumUndefElts = UndefElements.count();
7085 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7086 APInt SplatValue, Undef;
7087 unsigned SplatBitSize;
7088 bool HasUndef;
7089 // Check if this is a repeated constant pattern suitable for broadcasting.
7090 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7091 SplatBitSize > VT.getScalarSizeInBits() &&
7092 SplatBitSize < VT.getSizeInBits()) {
7093 // Avoid replacing with a broadcast when the value is used by a shuffle
7094 // instruction, to preserve the existing custom lowering of shuffles.
7095 if (isFoldableUseOfShuffle(BVOp))
7096 return SDValue();
7097 // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
7098 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7099 LLVMContext *Ctx = DAG.getContext();
7100 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7101 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7102 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7103 // Load the constant scalar/subvector and broadcast it.
7104 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7105 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7106 SDValue CP = DAG.getConstantPool(C, PVT);
7107 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7109 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7110 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7111 SDValue Ops[] = {DAG.getEntryNode(), CP};
7112 MachinePointerInfo MPI =
7113 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7114 SDValue Brdcst =
7115 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7116 MPI, Alignment, MachineMemOperand::MOLoad);
7117 return DAG.getBitcast(VT, Brdcst);
7119 if (SplatBitSize > 64) {
7120 // Load the vector of constants and broadcast it.
7121 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7122 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7123 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7124 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7125 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7126 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7127 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7128 MachinePointerInfo MPI =
7129 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7130 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7131 Ops, VVT, MPI, Alignment,
7132 MachineMemOperand::MOLoad);
7136 // If we are moving a scalar into a vector (Ld must be set and all elements
7137 // but 1 are undef) and that operation is not obviously supported by
7138 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7139 // That's better than general shuffling and may eliminate a load to a GPR
7140 // and a move from a scalar to a vector register.
7141 if (!Ld || NumElts - NumUndefElts != 1)
7142 return SDValue();
7143 unsigned ScalarSize = Ld.getValueSizeInBits();
7144 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7145 return SDValue();
7148 bool ConstSplatVal =
7149 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7150 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7152 // TODO: Handle broadcasts of non-constant sequences.
7154 // Make sure that all of the users of a non-constant load are from the
7155 // BUILD_VECTOR node.
7156 // FIXME: Is the use count needed for non-constant, non-load case?
7157 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7158 return SDValue();
7160 unsigned ScalarSize = Ld.getValueSizeInBits();
7161 bool IsGE256 = (VT.getSizeInBits() >= 256);
7163 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7164 // instruction to save 8 or more bytes of constant pool data.
7165 // TODO: If multiple splats are generated to load the same constant,
7166 // it may be detrimental to overall size. There needs to be a way to detect
7167 // that condition to know if this is truly a size win.
7168 bool OptForSize = DAG.shouldOptForSize();
7170 // Handle broadcasting a single constant scalar from the constant pool
7171 // into a vector.
7172 // On Sandybridge (no AVX2), it is still better to load a constant vector
7173 // from the constant pool and not to broadcast it from a scalar.
7174 // But override that restriction when optimizing for size.
7175 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
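// For example, splatting a single f32 constant into a v8f32 keeps only 4
// bytes in the constant pool instead of a full 32-byte vector, at the cost
// of a slightly longer broadcast-load encoding.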
7176 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7177 EVT CVT = Ld.getValueType();
7178 assert(!CVT.isVector() && "Must not broadcast a vector type");
7180 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7181 // For size optimization, also splat v2f64 and v2i64, and for size opt
7182 // with AVX2, also splat i8 and i16.
7183 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7184 if (ScalarSize == 32 ||
7185 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7186 CVT == MVT::f16 ||
7187 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7188 const Constant *C = nullptr;
7189 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7190 C = CI->getConstantIntValue();
7191 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7192 C = CF->getConstantFPValue();
7194 assert(C && "Invalid constant type");
7196 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7197 SDValue CP =
7198 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7199 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7201 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7202 SDValue Ops[] = {DAG.getEntryNode(), CP};
7203 MachinePointerInfo MPI =
7204 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7205 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7206 MPI, Alignment, MachineMemOperand::MOLoad);
7210 // Handle AVX2 in-register broadcasts.
7211 if (!IsLoad && Subtarget.hasInt256() &&
7212 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7213 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7215 // The scalar source must be a normal load.
7216 if (!IsLoad)
7217 return SDValue();
7219 // Make sure the non-chain result is only used by this build vector.
7220 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7221 return SDValue();
7223 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7224 (Subtarget.hasVLX() && ScalarSize == 64)) {
7225 auto *LN = cast<LoadSDNode>(Ld);
7226 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7227 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7228 SDValue BCast =
7229 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7230 LN->getMemoryVT(), LN->getMemOperand());
7231 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7232 return BCast;
7235 // The integer check is needed for the 64-bit element into a 128-bit vector
7236 // case so it doesn't match double, since there is no 'vbroadcastsd xmm'.
7237 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7238 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7239 auto *LN = cast<LoadSDNode>(Ld);
7240 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7241 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7242 SDValue BCast =
7243 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7244 LN->getMemoryVT(), LN->getMemOperand());
7245 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7246 return BCast;
7249 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7250 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7252 // Unsupported broadcast.
7253 return SDValue();
7256 /// For an EXTRACT_VECTOR_ELT with a constant index return the real
7257 /// underlying vector and index.
7259 /// Modifies \p ExtractedFromVec to the real vector and returns the real
7260 /// index.
7261 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7262 SDValue ExtIdx) {
7263 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
7264 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7265 return Idx;
7267 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7268 // lowered this:
7269 // (extract_vector_elt (v8f32 %1), Constant<6>)
7270 // to:
7271 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7272 // (extract_subvector (v8f32 %0), Constant<4>),
7273 // undef)
7274 // Constant<0>)
7275 // In this case the vector is the extract_subvector expression and the index
7276 // is 2, as specified by the shuffle.
7277 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7278 SDValue ShuffleVec = SVOp->getOperand(0);
7279 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7280 assert(ShuffleVecVT.getVectorElementType() ==
7281 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7283 int ShuffleIdx = SVOp->getMaskElt(Idx);
7284 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7285 ExtractedFromVec = ShuffleVec;
7286 return ShuffleIdx;
7288 return Idx;
7291 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
7292 MVT VT = Op.getSimpleValueType();
7294 // Skip if insert_vec_elt is not supported.
7295 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7296 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7297 return SDValue();
7299 SDLoc DL(Op);
7300 unsigned NumElems = Op.getNumOperands();
7302 SDValue VecIn1;
7303 SDValue VecIn2;
7304 SmallVector<unsigned, 4> InsertIndices;
7305 SmallVector<int, 8> Mask(NumElems, -1);
7307 for (unsigned i = 0; i != NumElems; ++i) {
7308 unsigned Opc = Op.getOperand(i).getOpcode();
7310 if (Opc == ISD::UNDEF)
7311 continue;
7313 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7314 // Quit if more than 1 element needs inserting.
7315 if (InsertIndices.size() > 1)
7316 return SDValue();
7318 InsertIndices.push_back(i);
7319 continue;
7322 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7323 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7325 // Quit if non-constant index.
7326 if (!isa<ConstantSDNode>(ExtIdx))
7327 return SDValue();
7328 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7330 // Quit if extracted from a vector of a different type.
7331 if (ExtractedFromVec.getValueType() != VT)
7332 return SDValue();
7334 if (!VecIn1.getNode())
7335 VecIn1 = ExtractedFromVec;
7336 else if (VecIn1 != ExtractedFromVec) {
7337 if (!VecIn2.getNode())
7338 VecIn2 = ExtractedFromVec;
7339 else if (VecIn2 != ExtractedFromVec)
7340 // Quit if more than 2 vectors to shuffle
7341 return SDValue();
7344 if (ExtractedFromVec == VecIn1)
7345 Mask[i] = Idx;
7346 else if (ExtractedFromVec == VecIn2)
7347 Mask[i] = Idx + NumElems;
7350 if (!VecIn1.getNode())
7351 return SDValue();
7353 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7354 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7356 for (unsigned Idx : InsertIndices)
7357 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7358 DAG.getIntPtrConstant(Idx, DL));
7360 return NV;
7363 // Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7364 static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7365 const X86Subtarget &Subtarget) {
7366 MVT VT = Op.getSimpleValueType();
7367 MVT IVT = VT.changeVectorElementTypeToInteger();
7368 SmallVector<SDValue, 16> NewOps;
7369 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7370 NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
7371 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7372 return DAG.getBitcast(VT, Res);
7375 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7376 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
7377 const X86Subtarget &Subtarget) {
7379 MVT VT = Op.getSimpleValueType();
7380 assert((VT.getVectorElementType() == MVT::i1) &&
7381 "Unexpected type in LowerBUILD_VECTORvXi1!");
7383 SDLoc dl(Op);
7384 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
7385 ISD::isBuildVectorAllOnes(Op.getNode()))
7386 return Op;
7388 uint64_t Immediate = 0;
7389 SmallVector<unsigned, 16> NonConstIdx;
7390 bool IsSplat = true;
7391 bool HasConstElts = false;
7392 int SplatIdx = -1;
7393 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7394 SDValue In = Op.getOperand(idx);
7395 if (In.isUndef())
7396 continue;
7397 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
7398 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7399 HasConstElts = true;
7400 } else {
7401 NonConstIdx.push_back(idx);
7403 if (SplatIdx < 0)
7404 SplatIdx = idx;
7405 else if (In != Op.getOperand(SplatIdx))
7406 IsSplat = false;
7409 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
7410 if (IsSplat) {
7411 // The build_vector allows the scalar element to be larger than the vector
7412 // element type. We need to mask it to use it as a condition unless we know
7413 // the upper bits are zero.
7414 // FIXME: Use computeKnownBits instead of checking specific opcode?
7415 SDValue Cond = Op.getOperand(SplatIdx);
7416 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
7417 if (Cond.getOpcode() != ISD::SETCC)
7418 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
7419 DAG.getConstant(1, dl, MVT::i8));
7421 // Perform the select in the scalar domain so we can use cmov.
7422 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7423 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
7424 DAG.getAllOnesConstant(dl, MVT::i32),
7425 DAG.getConstant(0, dl, MVT::i32));
7426 Select = DAG.getBitcast(MVT::v32i1, Select);
7427 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
7428 } else {
7429 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7430 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
7431 DAG.getAllOnesConstant(dl, ImmVT),
7432 DAG.getConstant(0, dl, ImmVT));
7433 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7434 Select = DAG.getBitcast(VecVT, Select);
7435 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
7436 DAG.getIntPtrConstant(0, dl));
7440 // Insert the non-constant elements one by one.
7441 SDValue DstVec;
7442 if (HasConstElts) {
7443 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7444 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
7445 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
7446 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
7447 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
7448 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
7449 } else {
7450 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7451 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
7452 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7453 DstVec = DAG.getBitcast(VecVT, Imm);
7454 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
7455 DAG.getIntPtrConstant(0, dl));
7457 } else
7458 DstVec = DAG.getUNDEF(VT);
7460 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7461 unsigned InsertIdx = NonConstIdx[i];
7462 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7463 Op.getOperand(InsertIdx),
7464 DAG.getIntPtrConstant(InsertIdx, dl));
7466 return DstVec;
7469 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
7470 switch (Opcode) {
7471 case X86ISD::PACKSS:
7472 case X86ISD::PACKUS:
7473 case X86ISD::FHADD:
7474 case X86ISD::FHSUB:
7475 case X86ISD::HADD:
7476 case X86ISD::HSUB:
7477 return true;
7479 return false;
7482 /// This is a helper function of LowerToHorizontalOp().
7483 /// This function checks that the input build_vector \p N implements a
7484 /// 128-bit partial horizontal operation on a 256-bit vector, but that operation
7485 /// may not match the layout of an x86 256-bit horizontal instruction.
7486 /// In other words, if this returns true, then some extraction/insertion will
7487 /// be required to produce a valid horizontal instruction.
7489 /// Parameter \p Opcode defines the kind of horizontal operation to match.
7490 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7491 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7492 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7493 /// arithmetic sub.
7495 /// This function only analyzes elements of \p N whose indices are
7496 /// in range [BaseIdx, LastIdx).
7498 /// TODO: This function was originally used to match both real and fake partial
7499 /// horizontal operations, but the index-matching logic is incorrect for that.
7500 /// See the corrected implementation in isHopBuildVector(). Can we reduce this
7501 /// code because it is only used for partial h-op matching now?
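///
/// A sketch of one accepted pattern, assuming \p Opcode == ISD::ADD, a v8i32
/// build_vector and [BaseIdx, LastIdx) == [0, 4):
///   elt0 = (add (extract_vector_elt A, 0), (extract_vector_elt A, 1))
///   elt1 = (add (extract_vector_elt A, 2), (extract_vector_elt A, 3))
///   elt2 = (add (extract_vector_elt B, 0), (extract_vector_elt B, 1))
///   elt3 = (add (extract_vector_elt B, 2), (extract_vector_elt B, 3))
/// is reported with \p V0 = A and \p V1 = B; the upper half of the
/// build_vector is matched by a second call with [BaseIdx, LastIdx) == [4, 8).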
7502 static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
7503 SelectionDAG &DAG,
7504 unsigned BaseIdx, unsigned LastIdx,
7505 SDValue &V0, SDValue &V1) {
7506 EVT VT = N->getValueType(0);
7507 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7508 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7509 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7510 "Invalid Vector in input!");
7512 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7513 bool CanFold = true;
7514 unsigned ExpectedVExtractIdx = BaseIdx;
7515 unsigned NumElts = LastIdx - BaseIdx;
7516 V0 = DAG.getUNDEF(VT);
7517 V1 = DAG.getUNDEF(VT);
7519 // Check if N implements a horizontal binop.
7520 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7521 SDValue Op = N->getOperand(i + BaseIdx);
7523 // Skip UNDEFs.
7524 if (Op->isUndef()) {
7525 // Update the expected vector extract index.
7526 if (i * 2 == NumElts)
7527 ExpectedVExtractIdx = BaseIdx;
7528 ExpectedVExtractIdx += 2;
7529 continue;
7532 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7534 if (!CanFold)
7535 break;
7537 SDValue Op0 = Op.getOperand(0);
7538 SDValue Op1 = Op.getOperand(1);
7540 // Try to match the following pattern:
7541 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7542 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7543 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7544 Op0.getOperand(0) == Op1.getOperand(0) &&
7545 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7546 isa<ConstantSDNode>(Op1.getOperand(1)));
7547 if (!CanFold)
7548 break;
7550 unsigned I0 = Op0.getConstantOperandVal(1);
7551 unsigned I1 = Op1.getConstantOperandVal(1);
7553 if (i * 2 < NumElts) {
7554 if (V0.isUndef()) {
7555 V0 = Op0.getOperand(0);
7556 if (V0.getValueType() != VT)
7557 return false;
7559 } else {
7560 if (V1.isUndef()) {
7561 V1 = Op0.getOperand(0);
7562 if (V1.getValueType() != VT)
7563 return false;
7565 if (i * 2 == NumElts)
7566 ExpectedVExtractIdx = BaseIdx;
7569 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7570 if (I0 == ExpectedVExtractIdx)
7571 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7572 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7573 // Try to match the following dag sequence:
7574 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7575 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7576 } else
7577 CanFold = false;
7579 ExpectedVExtractIdx += 2;
7582 return CanFold;
7585 /// Emit a sequence of two 128-bit horizontal add/sub followed by
7586 /// a concat_vector.
7588 /// This is a helper function of LowerToHorizontalOp().
7589 /// This function expects two 256-bit vectors called V0 and V1.
7590 /// At first, each vector is split into two separate 128-bit vectors.
7591 /// Then, the resulting 128-bit vectors are used to implement two
7592 /// horizontal binary operations.
7594 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7596 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input
7597 /// to the two new horizontal binops.
7598 /// When Mode is set, the first horizontal binop dag node takes as input the
7599 /// lower 128 bits of V0 and the upper 128 bits of V0. The second horizontal
7600 /// binop dag node takes as input the lower 128 bits of V1 and the upper
7601 /// 128 bits of V1.
7602 /// Example:
7603 /// HADD V0_LO, V0_HI
7604 /// HADD V1_LO, V1_HI
7606 /// Otherwise, the first horizontal binop dag node takes as input the lower
7607 /// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
7608 /// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
7609 /// Example:
7610 /// HADD V0_LO, V1_LO
7611 /// HADD V0_HI, V1_HI
7613 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7614 /// 128 bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7615 /// the upper 128 bits of the result.
7616 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7617 const SDLoc &DL, SelectionDAG &DAG,
7618 unsigned X86Opcode, bool Mode,
7619 bool isUndefLO, bool isUndefHI) {
7620 MVT VT = V0.getSimpleValueType();
7621 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7622 "Invalid nodes in input!");
7624 unsigned NumElts = VT.getVectorNumElements();
7625 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7626 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7627 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7628 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7629 MVT NewVT = V0_LO.getSimpleValueType();
7631 SDValue LO = DAG.getUNDEF(NewVT);
7632 SDValue HI = DAG.getUNDEF(NewVT);
7634 if (Mode) {
7635 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7636 if (!isUndefLO && !V0->isUndef())
7637 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7638 if (!isUndefHI && !V1->isUndef())
7639 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7640 } else {
7641 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7642 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7643 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7645 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7646 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7649 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7652 /// Returns true iff \p BV builds a vector with the result equivalent to
7653 /// the result of an ADDSUB/SUBADD operation.
7654 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
7655 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
7656 /// \p Opnd0 and \p Opnd1.
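///
/// A sketch of the ADDSUB form (even lanes subtract, odd lanes add) for a
/// v4f32 build_vector:
///   elt0 = (fsub (extract_vector_elt A, 0), (extract_vector_elt B, 0))
///   elt1 = (fadd (extract_vector_elt A, 1), (extract_vector_elt B, 1))
///   elt2 = (fsub (extract_vector_elt A, 2), (extract_vector_elt B, 2))
///   elt3 = (fadd (extract_vector_elt A, 3), (extract_vector_elt B, 3))
/// which yields \p Opnd0 = A, \p Opnd1 = B and \p IsSubAdd = false.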
7657 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
7658 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7659 SDValue &Opnd0, SDValue &Opnd1,
7660 unsigned &NumExtracts,
7661 bool &IsSubAdd) {
7663 MVT VT = BV->getSimpleValueType(0);
7664 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
7665 return false;
7667 unsigned NumElts = VT.getVectorNumElements();
7668 SDValue InVec0 = DAG.getUNDEF(VT);
7669 SDValue InVec1 = DAG.getUNDEF(VT);
7671 NumExtracts = 0;
7673 // Odd-numbered elements in the input build vector are obtained from
7674 // adding/subtracting two integer/float elements.
7675 // Even-numbered elements in the input build vector are obtained from
7676 // subtracting/adding two integer/float elements.
7677 unsigned Opc[2] = {0, 0};
7678 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7679 SDValue Op = BV->getOperand(i);
7681 // Skip 'undef' values.
7682 unsigned Opcode = Op.getOpcode();
7683 if (Opcode == ISD::UNDEF)
7684 continue;
7686 // Early exit if we found an unexpected opcode.
7687 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
7688 return false;
7690 SDValue Op0 = Op.getOperand(0);
7691 SDValue Op1 = Op.getOperand(1);
7693 // Try to match the following pattern:
7694 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7695 // Early exit if we cannot match that sequence.
7696 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7697 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7698 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7699 Op0.getOperand(1) != Op1.getOperand(1))
7700 return false;
7702 unsigned I0 = Op0.getConstantOperandVal(1);
7703 if (I0 != i)
7704 return false;
7706 // We found a valid add/sub node; make sure it's the same opcode as previous
7707 // elements for this parity.
7708 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
7709 return false;
7710 Opc[i % 2] = Opcode;
7712 // Update InVec0 and InVec1.
7713 if (InVec0.isUndef()) {
7714 InVec0 = Op0.getOperand(0);
7715 if (InVec0.getSimpleValueType() != VT)
7716 return false;
7718 if (InVec1.isUndef()) {
7719 InVec1 = Op1.getOperand(0);
7720 if (InVec1.getSimpleValueType() != VT)
7721 return false;
7724 // Make sure that the input operands to each add/sub node always
7725 // come from the same pair of vectors.
7726 if (InVec0 != Op0.getOperand(0)) {
7727 if (Opcode == ISD::FSUB)
7728 return false;
7730 // FADD is commutable. Try to commute the operands
7731 // and then test again.
7732 std::swap(Op0, Op1);
7733 if (InVec0 != Op0.getOperand(0))
7734 return false;
7737 if (InVec1 != Op1.getOperand(0))
7738 return false;
7740 // Increment the number of extractions done.
7741 ++NumExtracts;
7744 // Ensure we have found an opcode for both parities and that they are
7745 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
7746 // inputs are undef.
7747 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
7748 InVec0.isUndef() || InVec1.isUndef())
7749 return false;
7751 IsSubAdd = Opc[0] == ISD::FADD;
7753 Opnd0 = InVec0;
7754 Opnd1 = InVec1;
7755 return true;
7758 /// Returns true if it is possible to fold a MUL and an idiom that has already
7759 /// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
7760 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
7761 /// operands of FMADDSUB/FMSUBADD are written to the parameters \p Opnd0, \p Opnd1 and \p Opnd2.
7763 /// Prior to calling this function it should be known that there is some
7764 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7765 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7766 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7767 /// of \p Opnd0 uses is expected to be equal to 2.
7768 /// For example, this function may be called for the following IR:
7769 /// %AB = fmul fast <2 x double> %A, %B
7770 /// %Sub = fsub fast <2 x double> %AB, %C
7771 /// %Add = fadd fast <2 x double> %AB, %C
7772 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7773 /// <2 x i32> <i32 0, i32 3>
7774 /// There is a def for %Addsub here, which potentially can be replaced by
7775 /// X86ISD::ADDSUB operation:
7776 /// %Addsub = X86ISD::ADDSUB %AB, %C
7777 /// and such ADDSUB can further be replaced with FMADDSUB:
7778 /// %Addsub = FMADDSUB %A, %B, %C.
7780 /// The main reason why this method is called before the replacement of the
7781 /// recognized ADDSUB idiom with the ADDSUB operation is that such a
7782 /// replacement is sometimes illegal. E.g. 512-bit ADDSUB is not available,
7783 /// while 512-bit FMADDSUB is.
7784 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
7785 SelectionDAG &DAG,
7786 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
7787 unsigned ExpectedUses) {
7788 if (Opnd0.getOpcode() != ISD::FMUL ||
7789 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
7790 return false;
7792 // FIXME: These checks must match the similar ones in
7793 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7794 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7795 // or MUL + ADDSUB to FMADDSUB.
7796 const TargetOptions &Options = DAG.getTarget().Options;
7797 bool AllowFusion =
7798 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7799 if (!AllowFusion)
7800 return false;
7802 Opnd2 = Opnd1;
7803 Opnd1 = Opnd0.getOperand(1);
7804 Opnd0 = Opnd0.getOperand(0);
7806 return true;
7809 /// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
7810 /// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
7811 /// X86ISD::FMSUBADD node accordingly.
7812 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7813 const X86Subtarget &Subtarget,
7814 SelectionDAG &DAG) {
7815 SDValue Opnd0, Opnd1;
7816 unsigned NumExtracts;
7817 bool IsSubAdd;
7818 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
7819 IsSubAdd))
7820 return SDValue();
7822 MVT VT = BV->getSimpleValueType(0);
7823 SDLoc DL(BV);
7825 // Try to generate X86ISD::FMADDSUB node here.
7826 SDValue Opnd2;
7827 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
7828 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
7829 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
7832 // We only support ADDSUB.
7833 if (IsSubAdd)
7834 return SDValue();
7836 // There are no known X86 targets with 512-bit ADDSUB instructions!
7837 // Convert to blend(fsub,fadd).
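// For example (a sketch), with VT == v8f64 this builds the shuffle mask
// <0, 9, 2, 11, 4, 13, 6, 15>: even result lanes come from the FSUB node
// and odd result lanes from the FADD node.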
7838 if (VT.is512BitVector()) {
7839 SmallVector<int> Mask;
7840 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
7841 Mask.push_back(I);
7842 Mask.push_back(I + E + 1);
7844 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
7845 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
7846 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
7849 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7852 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
7853 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
7854 // Initialize outputs to known values.
7855 MVT VT = BV->getSimpleValueType(0);
7856 HOpcode = ISD::DELETED_NODE;
7857 V0 = DAG.getUNDEF(VT);
7858 V1 = DAG.getUNDEF(VT);
7860 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
7861 // half of the result is calculated independently from the 128-bit halves of
7862 // the inputs, so that makes the index-checking logic below more complicated.
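// E.g. (a sketch) a v8i32 HADD on 256-bit sources A and B produces:
//   { A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3],
//     A[4]+A[5], A[6]+A[7], B[4]+B[5], B[6]+B[7] }
// so the expected source and extract indices depend on which 64-bit half of
// each 128-bit chunk an element falls into.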
7863 unsigned NumElts = VT.getVectorNumElements();
7864 unsigned GenericOpcode = ISD::DELETED_NODE;
7865 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
7866 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
7867 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
7868 for (unsigned i = 0; i != Num128BitChunks; ++i) {
7869 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
7870 // Ignore undef elements.
7871 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
7872 if (Op.isUndef())
7873 continue;
7875 // If there's an opcode mismatch, we're done.
7876 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
7877 return false;
7879 // Initialize horizontal opcode.
7880 if (HOpcode == ISD::DELETED_NODE) {
7881 GenericOpcode = Op.getOpcode();
7882 switch (GenericOpcode) {
7883 case ISD::ADD: HOpcode = X86ISD::HADD; break;
7884 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
7885 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
7886 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
7887 default: return false;
7891 SDValue Op0 = Op.getOperand(0);
7892 SDValue Op1 = Op.getOperand(1);
7893 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7894 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7895 Op0.getOperand(0) != Op1.getOperand(0) ||
7896 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7897 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
7898 return false;
7900 // The source vector is chosen based on which 64-bit half of the
7901 // destination vector is being calculated.
7902 if (j < NumEltsIn64Bits) {
7903 if (V0.isUndef())
7904 V0 = Op0.getOperand(0);
7905 } else {
7906 if (V1.isUndef())
7907 V1 = Op0.getOperand(0);
7910 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
7911 if (SourceVec != Op0.getOperand(0))
7912 return false;
7914 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
7915 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
7916 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
7917 unsigned ExpectedIndex = i * NumEltsIn128Bits +
7918 (j % NumEltsIn64Bits) * 2;
7919 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
7920 continue;
7922 // If this is not a commutative op, this does not match.
7923 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
7924 return false;
7926 // Addition is commutative, so try swapping the extract indexes.
7927 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
7928 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
7929 continue;
7931 // Extract indexes do not match horizontal requirement.
7932 return false;
7935 // We matched. Opcode and operands are returned by reference as arguments.
7936 return true;
7939 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
7940 SelectionDAG &DAG, unsigned HOpcode,
7941 SDValue V0, SDValue V1) {
7942 // If either input vector is not the same size as the build vector,
7943 // extract/insert the low bits to the correct size.
7944 // This is free (examples: zmm --> xmm, xmm --> ymm).
7945 MVT VT = BV->getSimpleValueType(0);
7946 unsigned Width = VT.getSizeInBits();
7947 if (V0.getValueSizeInBits() > Width)
7948 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
7949 else if (V0.getValueSizeInBits() < Width)
7950 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
7952 if (V1.getValueSizeInBits() > Width)
7953 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
7954 else if (V1.getValueSizeInBits() < Width)
7955 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
7957 unsigned NumElts = VT.getVectorNumElements();
7958 APInt DemandedElts = APInt::getAllOnes(NumElts);
7959 for (unsigned i = 0; i != NumElts; ++i)
7960 if (BV->getOperand(i).isUndef())
7961 DemandedElts.clearBit(i);
7963 // If we don't need the upper xmm, then perform as an xmm hop.
7964 unsigned HalfNumElts = NumElts / 2;
7965 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
7966 MVT HalfVT = VT.getHalfNumVectorElementsVT();
7967 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
7968 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
7969 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
7970 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
7973 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
7976 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7977 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7978 const X86Subtarget &Subtarget,
7979 SelectionDAG &DAG) {
7980 // We need at least 2 non-undef elements to make this worthwhile by default.
7981 unsigned NumNonUndefs =
7982 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
7983 if (NumNonUndefs < 2)
7984 return SDValue();
7986 // There are 4 sets of horizontal math operations distinguished by type:
7987 // int/FP at 128-bit/256-bit. Each type was introduced with a different
7988 // subtarget feature. Try to match those "native" patterns first.
7989 MVT VT = BV->getSimpleValueType(0);
7990 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
7991 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
7992 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
7993 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
7994 unsigned HOpcode;
7995 SDValue V0, V1;
7996 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
7997 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
8000 // Try harder to match 256-bit ops by using extract/concat.
8001 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8002 return SDValue();
8004 // Count the number of UNDEF operands in the input build_vector.
8005 unsigned NumElts = VT.getVectorNumElements();
8006 unsigned Half = NumElts / 2;
8007 unsigned NumUndefsLO = 0;
8008 unsigned NumUndefsHI = 0;
8009 for (unsigned i = 0, e = Half; i != e; ++i)
8010 if (BV->getOperand(i)->isUndef())
8011 NumUndefsLO++;
8013 for (unsigned i = Half, e = NumElts; i != e; ++i)
8014 if (BV->getOperand(i)->isUndef())
8015 NumUndefsHI++;
8017 SDLoc DL(BV);
8018 SDValue InVec0, InVec1;
8019 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8020 SDValue InVec2, InVec3;
8021 unsigned X86Opcode;
8022 bool CanFold = true;
8024 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
8025 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
8026 InVec3) &&
8027 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8028 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8029 X86Opcode = X86ISD::HADD;
8030 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
8031 InVec1) &&
8032 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
8033 InVec3) &&
8034 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8035 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8036 X86Opcode = X86ISD::HSUB;
8037 else
8038 CanFold = false;
8040 if (CanFold) {
8041 // Do not try to expand this build_vector into a pair of horizontal
8042 // add/sub if we can emit a pair of scalar add/sub.
8043 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8044 return SDValue();
8046 // Convert this build_vector into a pair of horizontal binops followed by
8047 // a concat vector. We must adjust the outputs from the partial horizontal
8048 // matching calls above to account for undefined vector halves.
8049 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8050 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8051 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8052 bool isUndefLO = NumUndefsLO == Half;
8053 bool isUndefHI = NumUndefsHI == Half;
8054 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8055 isUndefHI);
8059 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8060 VT == MVT::v16i16) {
8061 unsigned X86Opcode;
8062 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
8063 X86Opcode = X86ISD::HADD;
8064 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
8065 InVec1))
8066 X86Opcode = X86ISD::HSUB;
8067 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
8068 InVec1))
8069 X86Opcode = X86ISD::FHADD;
8070 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
8071 InVec1))
8072 X86Opcode = X86ISD::FHSUB;
8073 else
8074 return SDValue();
8076 // Don't try to expand this build_vector into a pair of horizontal add/sub
8077 // if we can simply emit a pair of scalar add/sub.
8078 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8079 return SDValue();
8081 // Convert this build_vector into two horizontal add/subs followed by
8082 // a concat vector.
8083 bool isUndefLO = NumUndefsLO == Half;
8084 bool isUndefHI = NumUndefsHI == Half;
8085 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8086 isUndefLO, isUndefHI);
8089 return SDValue();
8092 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8093 SelectionDAG &DAG);
8095 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
8096 /// one of their operands is constant, lower to a pair of BUILD_VECTORs and
8097 /// just apply the bit operation to the vectors.
8098 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
8099 /// from this, but enough scalar bit operations are created by the later
8100 /// legalization + scalarization stages to need basic support.
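///
/// A sketch of the rewrite for a v4i32 build_vector of shifts by a uniform
/// immediate:
///   (build_vector (shl a, 5), (shl b, 5), (shl c, 5), (shl d, 5))
///     --> (shl (build_vector a, b, c, d), (build_vector 5, 5, 5, 5))
/// with the resulting vector shift lowered immediately via LowerShift().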
8101 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
8102 const X86Subtarget &Subtarget,
8103 SelectionDAG &DAG) {
8104 SDLoc DL(Op);
8105 MVT VT = Op->getSimpleValueType(0);
8106 unsigned NumElems = VT.getVectorNumElements();
8107 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8109 // Check that all elements have the same opcode.
8110 // TODO: Should we allow UNDEFS and if so how many?
8111 unsigned Opcode = Op->getOperand(0).getOpcode();
8112 for (unsigned i = 1; i < NumElems; ++i)
8113 if (Opcode != Op->getOperand(i).getOpcode())
8114 return SDValue();
8116 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8117 bool IsShift = false;
8118 switch (Opcode) {
8119 default:
8120 return SDValue();
8121 case ISD::SHL:
8122 case ISD::SRL:
8123 case ISD::SRA:
8124 IsShift = true;
8125 break;
8126 case ISD::AND:
8127 case ISD::XOR:
8128 case ISD::OR:
8129 // Don't do this if the buildvector is a splat - we'd replace one
8130 // constant with an entire vector.
8131 if (Op->getSplatValue())
8132 return SDValue();
8133 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8134 return SDValue();
8135 break;
8138 SmallVector<SDValue, 4> LHSElts, RHSElts;
8139 for (SDValue Elt : Op->ops()) {
8140 SDValue LHS = Elt.getOperand(0);
8141 SDValue RHS = Elt.getOperand(1);
8143 // We expect the canonicalized RHS operand to be the constant.
8144 if (!isa<ConstantSDNode>(RHS))
8145 return SDValue();
8147 // Extend shift amounts.
8148 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8149 if (!IsShift)
8150 return SDValue();
8151 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8154 LHSElts.push_back(LHS);
8155 RHSElts.push_back(RHS);
8158 // Limit to shifts by uniform immediates.
8159 // TODO: Only accept vXi8/vXi64 special cases?
8160 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8161 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8162 return SDValue();
8164 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8165 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8166 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8168 if (!IsShift)
8169 return Res;
8171 // Immediately lower the shift to ensure the constant build vector doesn't
8172 // get converted to a constant pool before the shift is lowered.
8173 return LowerShift(Res, Subtarget, DAG);
8176 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
8177 /// functionality to do this, so it's all zeros, all ones, or some derivation
8178 /// that is cheap to calculate.
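///
/// E.g. an all-zeros build_vector is returned unchanged (it is matched later
/// by pxor/xorps), while an all-ones v8i16 is materialized via getOnesVector().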
8179 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
8180 const X86Subtarget &Subtarget) {
8181 SDLoc DL(Op);
8182 MVT VT = Op.getSimpleValueType();
8184 // Vectors containing all zeros can be matched by pxor and xorps.
8185 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8186 return Op;
8188 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8189 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8190 // vpcmpeqd on 256-bit vectors.
8191 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8192 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8193 return Op;
8195 return getOnesVector(VT, DAG, DL);
8198 return SDValue();
8201 /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8202 /// from a vector of source values and a vector of extraction indices.
8203 /// The vectors might be manipulated to match the type of the permute op.
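///
/// For instance (a sketch), a v4i32 variable permute on an SSSE3-only target
/// is performed as a v16i8 PSHUFB: each i32 index is scaled by 4 and expanded
/// to the byte indices <4*i+0, 4*i+1, 4*i+2, 4*i+3> by ScaleIndices below.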
8204 static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8205 SDLoc &DL, SelectionDAG &DAG,
8206 const X86Subtarget &Subtarget) {
8207 MVT ShuffleVT = VT;
8208 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8209 unsigned NumElts = VT.getVectorNumElements();
8210 unsigned SizeInBits = VT.getSizeInBits();
8212 // Adjust IndicesVec to match VT size.
8213 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8214 "Illegal variable permute mask size");
8215 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8216 // Narrow/widen the indices vector to the correct size.
8217 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8218 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8219 NumElts * VT.getScalarSizeInBits());
8220 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8221 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8222 SDLoc(IndicesVec), SizeInBits);
8223 // Zero-extend the index elements within the vector.
8224 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8225 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8226 IndicesVT, IndicesVec);
8228 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8230 // Handle a SrcVec whose size doesn't match the VT size.
8231 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8232 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8233 // Handle larger SrcVec by treating it as a larger permute.
8234 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8235 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8236 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8237 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8238 Subtarget, DAG, SDLoc(IndicesVec));
8239 SDValue NewSrcVec =
8240 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8241 if (NewSrcVec)
8242 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8243 return SDValue();
8244 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8245 // Widen smaller SrcVec to match VT.
8246 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8247 } else
8248 return SDValue();
8251 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8252 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8253 EVT SrcVT = Idx.getValueType();
8254 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8255 uint64_t IndexScale = 0;
8256 uint64_t IndexOffset = 0;
8258 // If we're scaling a smaller permute op, then we need to repeat the
8259 // indices, scaling and offsetting them as well.
8260 // e.g. v4i32 -> v16i8 (Scale = 4)
8261 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8262 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8263 for (uint64_t i = 0; i != Scale; ++i) {
8264 IndexScale |= Scale << (i * NumDstBits);
8265 IndexOffset |= i << (i * NumDstBits);
8268 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8269 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8270 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8271 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8272 return Idx;
8275 unsigned Opcode = 0;
8276 switch (VT.SimpleTy) {
8277 default:
8278 break;
8279 case MVT::v16i8:
8280 if (Subtarget.hasSSSE3())
8281 Opcode = X86ISD::PSHUFB;
8282 break;
8283 case MVT::v8i16:
8284 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8285 Opcode = X86ISD::VPERMV;
8286 else if (Subtarget.hasSSSE3()) {
8287 Opcode = X86ISD::PSHUFB;
8288 ShuffleVT = MVT::v16i8;
8290 break;
8291 case MVT::v4f32:
8292 case MVT::v4i32:
8293 if (Subtarget.hasAVX()) {
8294 Opcode = X86ISD::VPERMILPV;
8295 ShuffleVT = MVT::v4f32;
8296 } else if (Subtarget.hasSSSE3()) {
8297 Opcode = X86ISD::PSHUFB;
8298 ShuffleVT = MVT::v16i8;
8300 break;
8301 case MVT::v2f64:
8302 case MVT::v2i64:
8303 if (Subtarget.hasAVX()) {
8304 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8305 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8306 Opcode = X86ISD::VPERMILPV;
8307 ShuffleVT = MVT::v2f64;
8308 } else if (Subtarget.hasSSE41()) {
8309 // SSE41 can compare v2i64 - select between indices 0 and 1.
8310 return DAG.getSelectCC(
8311 DL, IndicesVec,
8312 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8313 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8314 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8315 ISD::CondCode::SETEQ);
8317 break;
8318 case MVT::v32i8:
8319 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8320 Opcode = X86ISD::VPERMV;
8321 else if (Subtarget.hasXOP()) {
8322 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8323 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8324 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8325 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8326 return DAG.getNode(
8327 ISD::CONCAT_VECTORS, DL, VT,
8328 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8329 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8330 } else if (Subtarget.hasAVX()) {
8331 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8332 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8333 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8334 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8335 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8336 ArrayRef<SDValue> Ops) {
8337 // Permute Lo and Hi and then select based on index range.
8338 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
8339 // care about bit[7] as it's just an index vector.
8340 SDValue Idx = Ops[2];
8341 EVT VT = Idx.getValueType();
8342 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8343 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8344 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8345 ISD::CondCode::SETGT);
8347 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8348 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8349 PSHUFBBuilder);
8351 break;
8352 case MVT::v16i16:
8353 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8354 Opcode = X86ISD::VPERMV;
8355 else if (Subtarget.hasAVX()) {
8356 // Scale to v32i8 and perform as v32i8.
8357 IndicesVec = ScaleIndices(IndicesVec, 2);
8358 return DAG.getBitcast(
8359 VT, createVariablePermute(
8360 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8361 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8363 break;
8364 case MVT::v8f32:
8365 case MVT::v8i32:
8366 if (Subtarget.hasAVX2())
8367 Opcode = X86ISD::VPERMV;
8368 else if (Subtarget.hasAVX()) {
8369 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8370 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8371 {0, 1, 2, 3, 0, 1, 2, 3});
8372 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8373 {4, 5, 6, 7, 4, 5, 6, 7});
8374 if (Subtarget.hasXOP())
8375 return DAG.getBitcast(
8376 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
8377 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8378 // Permute Lo and Hi and then select based on index range.
8379 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8380 SDValue Res = DAG.getSelectCC(
8381 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8382 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8383 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8384 ISD::CondCode::SETGT);
8385 return DAG.getBitcast(VT, Res);
8387 break;
8388 case MVT::v4i64:
8389 case MVT::v4f64:
8390 if (Subtarget.hasAVX512()) {
8391 if (!Subtarget.hasVLX()) {
8392 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8393 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8394 SDLoc(SrcVec));
8395 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8396 DAG, SDLoc(IndicesVec));
8397 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8398 DAG, Subtarget);
8399 return extract256BitVector(Res, 0, DAG, DL);
8401 Opcode = X86ISD::VPERMV;
8402 } else if (Subtarget.hasAVX()) {
8403 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8404 SDValue LoLo =
8405 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8406 SDValue HiHi =
8407 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8408 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8409 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8410 if (Subtarget.hasXOP())
8411 return DAG.getBitcast(
8412 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
8413 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8414 // Permute Lo and Hi and then select based on index range.
8415 // This works as VPERMILPD only uses index bit[1] to permute elements.
8416 SDValue Res = DAG.getSelectCC(
8417 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8418 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8419 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8420 ISD::CondCode::SETGT);
8421 return DAG.getBitcast(VT, Res);
8423 break;
8424 case MVT::v64i8:
8425 if (Subtarget.hasVBMI())
8426 Opcode = X86ISD::VPERMV;
8427 break;
8428 case MVT::v32i16:
8429 if (Subtarget.hasBWI())
8430 Opcode = X86ISD::VPERMV;
8431 break;
8432 case MVT::v16f32:
8433 case MVT::v16i32:
8434 case MVT::v8f64:
8435 case MVT::v8i64:
8436 if (Subtarget.hasAVX512())
8437 Opcode = X86ISD::VPERMV;
8438 break;
8440 if (!Opcode)
8441 return SDValue();
8443 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8444 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8445 "Illegal variable permute shuffle type");
8447 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8448 if (Scale > 1)
8449 IndicesVec = ScaleIndices(IndicesVec, Scale);
8451 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8452 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8454 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8455 SDValue Res = Opcode == X86ISD::VPERMV
8456 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8457 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8458 return DAG.getBitcast(VT, Res);
8461 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8462 // reasoned to be a permutation of a vector by indices in a non-constant vector.
8463 // (build_vector (extract_elt V, (extract_elt I, 0)),
8464 // (extract_elt V, (extract_elt I, 1)),
8465 // ...
8466 // ->
8467 // (vpermv I, V)
8469 // TODO: Handle undefs
8470 // TODO: Utilize pshufb and zero mask blending to support more efficient
8471 // construction of vectors with constant-0 elements.
8472 static SDValue
8473 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
8474 const X86Subtarget &Subtarget) {
8475 SDValue SrcVec, IndicesVec;
8476 // Check for a match of the permute source vector and permute index elements.
8477 // This is done by checking that the i-th build_vector operand is of the form:
8478 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8479 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8480 SDValue Op = V.getOperand(Idx);
8481 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8482 return SDValue();
8484 // If this is the first extract encountered in V, set the source vector,
8485 // otherwise verify the extract is from the previously defined source
8486 // vector.
8487 if (!SrcVec)
8488 SrcVec = Op.getOperand(0);
8489 else if (SrcVec != Op.getOperand(0))
8490 return SDValue();
8491 SDValue ExtractedIndex = Op->getOperand(1);
8492 // Peek through extends.
8493 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8494 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8495 ExtractedIndex = ExtractedIndex.getOperand(0);
8496 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8497 return SDValue();
8499 // If this is the first extract from the index vector candidate, set the
8500 // indices vector, otherwise verify the extract is from the previously
8501 // defined indices vector.
8502 if (!IndicesVec)
8503 IndicesVec = ExtractedIndex.getOperand(0);
8504 else if (IndicesVec != ExtractedIndex.getOperand(0))
8505 return SDValue();
8507 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8508 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
8509 return SDValue();
8512 SDLoc DL(V);
8513 MVT VT = V.getSimpleValueType();
8514 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8517 SDValue
8518 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8519 SDLoc dl(Op);
8521 MVT VT = Op.getSimpleValueType();
8522 MVT EltVT = VT.getVectorElementType();
8523 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
8524 unsigned NumElems = Op.getNumOperands();
8526 // Use dedicated lowering for vXi1 predicate vectors.
8527 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
8528 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
8530 if (VT.getVectorElementType() == MVT::bf16 &&
8531 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
8532 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
8534 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
8535 return VectorConstant;
8537 unsigned EVTBits = EltVT.getSizeInBits();
8538 APInt UndefMask = APInt::getZero(NumElems);
8539 APInt FrozenUndefMask = APInt::getZero(NumElems);
8540 APInt ZeroMask = APInt::getZero(NumElems);
8541 APInt NonZeroMask = APInt::getZero(NumElems);
8542 bool IsAllConstants = true;
8543 bool OneUseFrozenUndefs = true;
8544 SmallSet<SDValue, 8> Values;
8545 unsigned NumConstants = NumElems;
8546 for (unsigned i = 0; i < NumElems; ++i) {
8547 SDValue Elt = Op.getOperand(i);
8548 if (Elt.isUndef()) {
8549 UndefMask.setBit(i);
8550 continue;
8552 if (ISD::isFreezeUndef(Elt.getNode())) {
8553 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
8554 FrozenUndefMask.setBit(i);
8555 continue;
8557 Values.insert(Elt);
8558 if (!isIntOrFPConstant(Elt)) {
8559 IsAllConstants = false;
8560 NumConstants--;
8562 if (X86::isZeroNode(Elt)) {
8563 ZeroMask.setBit(i);
8564 } else {
8565 NonZeroMask.setBit(i);
8569 // All undef vector. Return an UNDEF.
8570 if (UndefMask.isAllOnes())
8571 return DAG.getUNDEF(VT);
8573 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
8574 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
8575 return DAG.getFreeze(DAG.getUNDEF(VT));
8577 // All undef/freeze(undef)/zero vector. Return a zero vector.
8578 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
8579 return getZeroVector(VT, Subtarget, DAG, dl);
8581 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
8582 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
8583 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
8584 // and blend the FREEZE-UNDEF operands back in.
8585 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
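// e.g. a v4i32 build_vector <x, freeze(undef), freeze(undef), y> becomes
// EltsBV = <x, undef, undef, y>, a freeze(undef) splat build_vector, and the
// blend mask <0, 5, 6, 3>, which selects the frozen lanes from the splat.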
8586 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
8587 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
8588 SmallVector<int, 16> BlendMask(NumElems, -1);
8589 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
8590 for (unsigned i = 0; i < NumElems; ++i) {
8591 if (UndefMask[i]) {
8592 BlendMask[i] = -1;
8593 continue;
8595 BlendMask[i] = i;
8596 if (!FrozenUndefMask[i])
8597 Elts[i] = Op.getOperand(i);
8598 else
8599 BlendMask[i] += NumElems;
8601 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
8602 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
8603 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
8604 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
8607 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
8609 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
8610 // be better off lowering to a smaller build vector and padding with
8611 // undef/zero.
8612 if ((VT.is256BitVector() || VT.is512BitVector()) &&
8613 !isFoldableUseOfShuffle(BV)) {
8614 unsigned UpperElems = NumElems / 2;
8615 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
8616 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
8617 if (NumUpperUndefsOrZeros >= UpperElems) {
8618 if (VT.is512BitVector() &&
8619 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
8620 UpperElems = NumElems - (NumElems / 4);
8621 // If freeze(undef) is in any upper elements, force to zero.
8622 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
8623 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
8624 SDValue NewBV =
8625 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
8626 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
8630 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
8631 return AddSub;
8632 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
8633 return HorizontalOp;
8634 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
8635 return Broadcast;
8636 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
8637 return BitOp;
8639 unsigned NumZero = ZeroMask.popcount();
8640 unsigned NumNonZero = NonZeroMask.popcount();
8642 // If we are inserting one variable into a vector of non-zero constants, try
8643 // to avoid loading each constant element as a scalar. Load the constants as a
8644 // vector and then insert the variable scalar element. If insertion is not
8645 // supported, fall back to a shuffle to get the scalar blended with the
8646 // constants. Insertion into a zero vector is handled as a special-case
8647 // somewhere below here.
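// e.g. building <1, 2, x, 4> becomes a constant-pool load of <1, 2, undef, 4>
// followed by an insertelement of the variable scalar x at index 2 (or, for
// the high elements of wider vectors, a shuffle as handled below).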
8648 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
8649 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
8650 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
8651 // Create an all-constant vector. The variable element in the old
8652 // build vector is replaced by undef in the constant vector. Save the
8653 // variable scalar element and its index for use in the insertelement.
8654 LLVMContext &Context = *DAG.getContext();
8655 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
8656 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
8657 SDValue VarElt;
8658 SDValue InsIndex;
8659 for (unsigned i = 0; i != NumElems; ++i) {
8660 SDValue Elt = Op.getOperand(i);
8661 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
8662 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
8663 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
8664 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
8665 else if (!Elt.isUndef()) {
8666 assert(!VarElt.getNode() && !InsIndex.getNode() &&
8667 "Expected one variable element in this vector");
8668 VarElt = Elt;
8669 InsIndex = DAG.getVectorIdxConstant(i, dl);
8672 Constant *CV = ConstantVector::get(ConstVecOps);
8673 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
8675 // The constants we just created may not be legal (e.g., floating point). We
8676 // must lower the vector right here because we cannot guarantee that we'll
8677 // legalize it before loading it. This is also why we could not just create
8678 // a new build vector here. If the build vector contains illegal constants,
8679 // it could get split back up into a series of insert elements.
8680 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8681 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8682 MachineFunction &MF = DAG.getMachineFunction();
8683 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
8684 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8685 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
8686 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
8687 if (InsertC < NumEltsInLow128Bits)
8688 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
8690 // There's no good way to insert into the high elements of a >128-bit
8691 // vector, so use shuffles to avoid an extract/insert sequence.
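// e.g. for v8i32 with InsertC == 5 the mask built below is
// <0, 1, 2, 3, 4, 8, 6, 7>, where element 8 selects the variable scalar from
// the SCALAR_TO_VECTOR node.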
8692 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
8693 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
8694 SmallVector<int, 8> ShuffleMask;
8695 unsigned NumElts = VT.getVectorNumElements();
8696 for (unsigned i = 0; i != NumElts; ++i)
8697 ShuffleMask.push_back(i == InsertC ? NumElts : i);
8698 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
8699 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
8702 // Special case for a single non-zero, non-undef element.
8703 if (NumNonZero == 1) {
8704 unsigned Idx = NonZeroMask.countr_zero();
8705 SDValue Item = Op.getOperand(Idx);
8707 // If we have a constant or non-constant insertion into the low element of
8708 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8709 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8710 // depending on what the source datatype is.
8711 if (Idx == 0) {
8712 if (NumZero == 0)
8713 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8715 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
8716 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
8717 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
8718 assert((VT.is128BitVector() || VT.is256BitVector() ||
8719 VT.is512BitVector()) &&
8720 "Expected an SSE value type!");
8721 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8722 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
8723 // zero vector.
8724 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8727 // We can't directly insert an i8 or i16 into a vector, so zero extend
8728 // it to i32 first.
8729 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
8730 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8731 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
8732 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8733 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8734 return DAG.getBitcast(VT, Item);
8738 // Is it a vector logical left shift?
8739 if (NumElems == 2 && Idx == 1 &&
8740 X86::isZeroNode(Op.getOperand(0)) &&
8741 !X86::isZeroNode(Op.getOperand(1))) {
8742 unsigned NumBits = VT.getSizeInBits();
8743 return getVShift(true, VT,
8744 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8745 VT, Op.getOperand(1)),
8746 NumBits/2, DAG, *this, dl);
8749 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
8750 return SDValue();
8752 // Otherwise, if this is a vector with i32 or f32 elements, and the element
8753 // is a non-constant being inserted into an element other than the low one,
8754 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
8755 // movd/movss) to move this into the low element, then shuffle it into
8756 // place.
8757 if (EVTBits == 32) {
8758 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8759 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
8763 // Splat is obviously ok. Let legalizer expand it to a shuffle.
8764 if (Values.size() == 1) {
8765 if (EVTBits == 32) {
8766 // Instead of a shuffle like this:
8767 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
8768 // Check if it's possible to issue this instead.
8769 // shuffle (vload ptr), undef, <1, 1, 1, 1>
8770 unsigned Idx = NonZeroMask.countr_zero();
8771 SDValue Item = Op.getOperand(Idx);
8772 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
8773 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
8775 return SDValue();
8778 // A vector full of immediates; various special cases are already
8779 // handled, so this is best done with a single constant-pool load.
8780 if (IsAllConstants)
8781 return SDValue();
8783 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
8784 return V;
8786 // See if we can use a vector load to get all of the elements.
8788 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
8789 if (SDValue LD =
8790 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
8791 return LD;
8794 // If this is a splat of pairs of 32-bit elements, we can use a narrower
8795 // build_vector and broadcast it.
8796 // TODO: We could probably generalize this more.
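// e.g. a v8i32 <a, b, a, b, a, b, a, b> becomes a v4i32 build_vector
// <a, b, u, u>, bitcast to v2i64, broadcast to v4i64, and bitcast back to
// v8i32.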
8797 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
8798 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8799 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8800 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
8801 // Make sure all the even/odd operands match.
8802 for (unsigned i = 2; i != NumElems; ++i)
8803 if (Ops[i % 2] != Op.getOperand(i))
8804 return false;
8805 return true;
8807 if (CanSplat(Op, NumElems, Ops)) {
8808 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
8809 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
8810 // Create a new build vector and cast to v2i64/v2f64.
8811 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
8812 DAG.getBuildVector(NarrowVT, dl, Ops));
8813 // Broadcast from v2i64/v2f64 and cast to final VT.
8814 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
8815 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
8816 NewBV));
8820 // For AVX-length vectors, build the individual 128-bit pieces and use
8821 // shuffles to put them in place.
8822 if (VT.getSizeInBits() > 128) {
8823 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
8825 // Build both the lower and upper subvector.
8826 SDValue Lower =
8827 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
8828 SDValue Upper = DAG.getBuildVector(
8829 HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
8831 // Recreate the wider vector with the lower and upper part.
8832 return concatSubVectors(Lower, Upper, DAG, dl);
8835 // Let legalizer expand 2-wide build_vectors.
8836 if (EVTBits == 64) {
8837 if (NumNonZero == 1) {
8838 // One half is zero or undef.
8839 unsigned Idx = NonZeroMask.countr_zero();
8840 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
8841 Op.getOperand(Idx));
8842 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
8844 return SDValue();
8847 // If element VT is < 32 bits, convert it to inserts into a zero vector.
8848 if (EVTBits == 8 && NumElems == 16)
8849 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
8850 DAG, Subtarget))
8851 return V;
8853 if (EltVT == MVT::i16 && NumElems == 8)
8854 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
8855 DAG, Subtarget))
8856 return V;
8858 // If element VT is 32 bits and the vector has 4 elements, try an INSERTPS.
8859 if (EVTBits == 32 && NumElems == 4)
8860 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
8861 return V;
8863 // If element VT is == 32 bits, turn it into a number of shuffles.
8864 if (NumElems == 4 && NumZero > 0) {
8865 SmallVector<SDValue, 8> Ops(NumElems);
8866 for (unsigned i = 0; i < 4; ++i) {
8867 bool isZero = !NonZeroMask[i];
8868 if (isZero)
8869 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
8870 else
8871 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8874 for (unsigned i = 0; i < 2; ++i) {
8875 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
8876 default: llvm_unreachable("Unexpected NonZero count");
8877 case 0:
8878 Ops[i] = Ops[i*2]; // Must be a zero vector.
8879 break;
8880 case 1:
8881 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
8882 break;
8883 case 2:
8884 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8885 break;
8886 case 3:
8887 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8888 break;
8892 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
8893 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
8894 int MaskVec[] = {
8895 Reverse1 ? 1 : 0,
8896 Reverse1 ? 0 : 1,
8897 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
8898 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
8900 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
8903 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
8905 // Check for a build vector from mostly shuffle plus few inserting.
8906 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
8907 return Sh;
8909 // For SSE 4.1, use insertps to put the high elements into the low element.
8910 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
8911 SDValue Result;
8912 if (!Op.getOperand(0).isUndef())
8913 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
8914 else
8915 Result = DAG.getUNDEF(VT);
8917 for (unsigned i = 1; i < NumElems; ++i) {
8918 if (Op.getOperand(i).isUndef()) continue;
8919 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
8920 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
8922 return Result;
8925 // Otherwise, expand into a number of unpckl*, start by extending each of
8926 // our (non-undef) elements to the full vector width with the element in the
8927 // bottom slot of the vector (which generates no code for SSE).
8928 SmallVector<SDValue, 8> Ops(NumElems);
8929 for (unsigned i = 0; i < NumElems; ++i) {
8930 if (!Op.getOperand(i).isUndef())
8931 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8932 else
8933 Ops[i] = DAG.getUNDEF(VT);
8936 // Next, we iteratively mix elements, e.g. for v4f32:
8937 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
8938 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
8939 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
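// e.g. for v4f32 this builds Mask = <0, 4, u, u> on the first iteration
// (Scale == 1) and Mask = <0, 1, 4, 5> on the second (Scale == 2).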
8940 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
8941 // Generate scaled UNPCKL shuffle mask.
8942 SmallVector<int, 16> Mask;
8943 for (unsigned i = 0; i != Scale; ++i)
8944 Mask.push_back(i);
8945 for (unsigned i = 0; i != Scale; ++i)
8946 Mask.push_back(NumElems+i);
8947 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
8949 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
8950 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
8952 return Ops[0];
8955 // 256-bit AVX can use the vinsertf128 instruction
8956 // to create 256-bit vectors from two other 128-bit ones.
8957 // TODO: Detect subvector broadcast here instead of DAG combine?
8958 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
8959 const X86Subtarget &Subtarget) {
8960 SDLoc dl(Op);
8961 MVT ResVT = Op.getSimpleValueType();
8963 assert((ResVT.is256BitVector() ||
8964 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
8966 unsigned NumOperands = Op.getNumOperands();
8967 unsigned NumFreezeUndef = 0;
8968 unsigned NumZero = 0;
8969 unsigned NumNonZero = 0;
8970 unsigned NonZeros = 0;
8971 for (unsigned i = 0; i != NumOperands; ++i) {
8972 SDValue SubVec = Op.getOperand(i);
8973 if (SubVec.isUndef())
8974 continue;
8975 if (ISD::isFreezeUndef(SubVec.getNode())) {
8976 // If the freeze(undef) has multiple uses then we must fold to zero.
8977 if (SubVec.hasOneUse())
8978 ++NumFreezeUndef;
8979 else
8980 ++NumZero;
8982 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
8983 ++NumZero;
8984 else {
8985 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
8986 NonZeros |= 1 << i;
8987 ++NumNonZero;
8991 // If we have more than 2 non-zeros, build each half separately.
8992 if (NumNonZero > 2) {
8993 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
8994 ArrayRef<SDUse> Ops = Op->ops();
8995 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8996 Ops.slice(0, NumOperands/2));
8997 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8998 Ops.slice(NumOperands/2));
8999 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9002 // Otherwise, build it up through insert_subvectors.
9003 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9004 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9005 : DAG.getUNDEF(ResVT));
9007 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9008 unsigned NumSubElems = SubVT.getVectorNumElements();
9009 for (unsigned i = 0; i != NumOperands; ++i) {
9010 if ((NonZeros & (1 << i)) == 0)
9011 continue;
9013 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
9014 Op.getOperand(i),
9015 DAG.getIntPtrConstant(i * NumSubElems, dl));
9018 return Vec;
9021 // Returns true if the given node is a type promotion (by concatenating i1
9022 // zeros) of the result of a node that already zeros all upper bits of
9023 // k-register.
9024 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
9025 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
9026 const X86Subtarget &Subtarget,
9027 SelectionDAG &DAG) {
9028 SDLoc dl(Op);
9029 MVT ResVT = Op.getSimpleValueType();
9030 unsigned NumOperands = Op.getNumOperands();
9032 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9033 "Unexpected number of operands in CONCAT_VECTORS");
9035 uint64_t Zeros = 0;
9036 uint64_t NonZeros = 0;
9037 for (unsigned i = 0; i != NumOperands; ++i) {
9038 SDValue SubVec = Op.getOperand(i);
9039 if (SubVec.isUndef())
9040 continue;
9041 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9042 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9043 Zeros |= (uint64_t)1 << i;
9044 else
9045 NonZeros |= (uint64_t)1 << i;
9048 unsigned NumElems = ResVT.getVectorNumElements();
9050 // If we are inserting a non-zero vector and there are zeros in the LSBs and
9051 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
9052 // insert_subvector would give us two kshifts.
9053 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9054 Log2_64(NonZeros) != NumOperands - 1) {
9055 unsigned Idx = Log2_64(NonZeros);
9056 SDValue SubVec = Op.getOperand(Idx);
9057 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9058 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9059 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9060 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9061 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9062 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9063 DAG.getIntPtrConstant(0, dl));
9066 // If there are zero or one non-zeros we can handle this very simply.
9067 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9068 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9069 if (!NonZeros)
9070 return Vec;
9071 unsigned Idx = Log2_64(NonZeros);
9072 SDValue SubVec = Op.getOperand(Idx);
9073 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9074 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9075 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
9078 if (NumOperands > 2) {
9079 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9080 ArrayRef<SDUse> Ops = Op->ops();
9081 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9082 Ops.slice(0, NumOperands/2));
9083 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9084 Ops.slice(NumOperands/2));
9085 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9088 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9090 if (ResVT.getVectorNumElements() >= 16)
9091 return Op; // The operation is legal with KUNPCK
9093 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
9094 DAG.getUNDEF(ResVT), Op.getOperand(0),
9095 DAG.getIntPtrConstant(0, dl));
9096 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9097 DAG.getIntPtrConstant(NumElems/2, dl));
9100 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9101 const X86Subtarget &Subtarget,
9102 SelectionDAG &DAG) {
9103 MVT VT = Op.getSimpleValueType();
9104 if (VT.getVectorElementType() == MVT::i1)
9105 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
9107 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9108 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
9109 Op.getNumOperands() == 4)));
9111 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9112 // from two other 128-bit ones.
9114 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9115 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
9118 //===----------------------------------------------------------------------===//
9119 // Vector shuffle lowering
9121 // This is an experimental code path for lowering vector shuffles on x86. It is
9122 // designed to handle arbitrary vector shuffles and blends, gracefully
9123 // degrading performance as necessary. It works hard to recognize idiomatic
9124 // shuffles and lower them to optimal instruction patterns without leaving
9125 // a framework that allows reasonably efficient handling of all vector shuffle
9126 // patterns.
9127 //===----------------------------------------------------------------------===//
9129 /// Tiny helper function to identify a no-op mask.
9131 /// This is a somewhat boring predicate function. It checks whether the mask
9132 /// array input, which is assumed to be a single-input shuffle mask of the kind
9133 /// used by the X86 shuffle instructions (not a fully general
9134 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
9135 /// in-place shuffle are 'no-op's.
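/// e.g. <-1, 1, -1, 3> is a no-op mask, while <1, 0, 2, 3> is not.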
9136 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9137 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9138 assert(Mask[i] >= -1 && "Out of bound mask element!");
9139 if (Mask[i] >= 0 && Mask[i] != i)
9140 return false;
9142 return true;
9145 /// Test whether there are elements crossing LaneSizeInBits lanes in this
9146 /// shuffle mask.
9148 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9149 /// and we routinely test for these.
9150 static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9151 unsigned ScalarSizeInBits,
9152 ArrayRef<int> Mask) {
9153 assert(LaneSizeInBits && ScalarSizeInBits &&
9154 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9155 "Illegal shuffle lane size");
9156 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9157 int Size = Mask.size();
9158 for (int i = 0; i < Size; ++i)
9159 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9160 return true;
9161 return false;
9164 /// Test whether there are elements crossing 128-bit lanes in this
9165 /// shuffle mask.
9166 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9167 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9170 /// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9171 /// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9172 /// better support 'repeated mask + lane permute' style shuffles.
9173 static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9174 unsigned ScalarSizeInBits,
9175 ArrayRef<int> Mask) {
9176 assert(LaneSizeInBits && ScalarSizeInBits &&
9177 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9178 "Illegal shuffle lane size");
9179 int NumElts = Mask.size();
9180 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9181 int NumLanes = NumElts / NumEltsPerLane;
9182 if (NumLanes > 1) {
9183 for (int i = 0; i != NumLanes; ++i) {
9184 int SrcLane = -1;
9185 for (int j = 0; j != NumEltsPerLane; ++j) {
9186 int M = Mask[(i * NumEltsPerLane) + j];
9187 if (M < 0)
9188 continue;
9189 int Lane = (M % NumElts) / NumEltsPerLane;
9190 if (SrcLane >= 0 && SrcLane != Lane)
9191 return true;
9192 SrcLane = Lane;
9196 return false;
9199 /// Test whether a shuffle mask is equivalent within each sub-lane.
9201 /// This checks a shuffle mask to see if it is performing the same
9202 /// lane-relative shuffle in each sub-lane. This trivially implies
9203 /// that it is also not lane-crossing. It may however involve a blend from the
9204 /// same lane of a second vector.
9206 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9207 /// non-trivial to compute in the face of undef lanes. The representation is
9208 /// suitable for use with existing 128-bit shuffles as entries from the second
9209 /// vector have been remapped to [LaneSize, 2*LaneSize).
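/// e.g. for v8f32 the mask <0, 9, 2, 11, 4, 13, 6, 15> repeats in both 128-bit
/// lanes and yields RepeatedMask = <0, 5, 2, 7>.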
9210 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9211 ArrayRef<int> Mask,
9212 SmallVectorImpl<int> &RepeatedMask) {
9213 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9214 RepeatedMask.assign(LaneSize, -1);
9215 int Size = Mask.size();
9216 for (int i = 0; i < Size; ++i) {
9217 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9218 if (Mask[i] < 0)
9219 continue;
9220 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9221 // This entry crosses lanes, so there is no way to model this shuffle.
9222 return false;
9224 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9225 // Adjust second vector indices to start at LaneSize instead of Size.
9226 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9227 : Mask[i] % LaneSize + LaneSize;
9228 if (RepeatedMask[i % LaneSize] < 0)
9229 // This is the first non-undef entry in this slot of a 128-bit lane.
9230 RepeatedMask[i % LaneSize] = LocalM;
9231 else if (RepeatedMask[i % LaneSize] != LocalM)
9232 // Found a mismatch with the repeated mask.
9233 return false;
9235 return true;
9238 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
9239 static bool
9240 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9241 SmallVectorImpl<int> &RepeatedMask) {
9242 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9245 static bool
9246 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
9247 SmallVector<int, 32> RepeatedMask;
9248 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9251 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
9252 static bool
9253 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9254 SmallVectorImpl<int> &RepeatedMask) {
9255 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
9258 /// Test whether a target shuffle mask is equivalent within each sub-lane.
9259 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9260 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
9261 unsigned EltSizeInBits,
9262 ArrayRef<int> Mask,
9263 SmallVectorImpl<int> &RepeatedMask) {
9264 int LaneSize = LaneSizeInBits / EltSizeInBits;
9265 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
9266 int Size = Mask.size();
9267 for (int i = 0; i < Size; ++i) {
9268 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
9269 if (Mask[i] == SM_SentinelUndef)
9270 continue;
9271 if (Mask[i] == SM_SentinelZero) {
9272 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
9273 return false;
9274 RepeatedMask[i % LaneSize] = SM_SentinelZero;
9275 continue;
9277 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9278 // This entry crosses lanes, so there is no way to model this shuffle.
9279 return false;
9281 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9282 // later vector indices to start at multiples of LaneSize instead of Size.
9283 int LaneM = Mask[i] / Size;
9284 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
9285 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
9286 // This is the first non-undef entry in this slot of a 128-bit lane.
9287 RepeatedMask[i % LaneSize] = LocalM;
9288 else if (RepeatedMask[i % LaneSize] != LocalM)
9289 // Found a mismatch with the repeated mask.
9290 return false;
9292 return true;
9295 /// Test whether a target shuffle mask is equivalent within each sub-lane.
9296 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9297 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
9298 ArrayRef<int> Mask,
9299 SmallVectorImpl<int> &RepeatedMask) {
9300 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
9301 Mask, RepeatedMask);
9304 /// Checks whether the vector elements referenced by two shuffle masks are
9305 /// equivalent.
9306 static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9307 int Idx, int ExpectedIdx) {
9308 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9309 ExpectedIdx < MaskSize && "Out of range element index");
9310 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9311 return false;
9313 switch (Op.getOpcode()) {
9314 case ISD::BUILD_VECTOR:
9315 // If the values are build vectors, we can look through them to find
9316 // equivalent inputs that make the shuffles equivalent.
9317 // TODO: Handle MaskSize != Op.getNumOperands()?
9318 if (MaskSize == (int)Op.getNumOperands() &&
9319 MaskSize == (int)ExpectedOp.getNumOperands())
9320 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9321 break;
9322 case X86ISD::VBROADCAST:
9323 case X86ISD::VBROADCAST_LOAD:
9324 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
9325 return (Op == ExpectedOp &&
9326 (int)Op.getValueType().getVectorNumElements() == MaskSize);
9327 case X86ISD::HADD:
9328 case X86ISD::HSUB:
9329 case X86ISD::FHADD:
9330 case X86ISD::FHSUB:
9331 case X86ISD::PACKSS:
9332 case X86ISD::PACKUS:
9333 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9334 // TODO: Handle MaskSize != NumElts?
9335 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9336 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9337 MVT VT = Op.getSimpleValueType();
9338 int NumElts = VT.getVectorNumElements();
9339 if (MaskSize == NumElts) {
9340 int NumLanes = VT.getSizeInBits() / 128;
9341 int NumEltsPerLane = NumElts / NumLanes;
9342 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9343 bool SameLane =
9344 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9345 bool SameElt =
9346 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9347 return SameLane && SameElt;
9350 break;
9353 return false;
9356 /// Checks whether a shuffle mask is equivalent to an explicit list of
9357 /// arguments.
9359 /// This is a fast way to test a shuffle mask against a fixed pattern:
9361 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
9363 /// It returns true if the mask is exactly as wide as the argument list, and
9364 /// each element of the mask is either -1 (signifying undef) or the value given
9365 /// in the argument.
9366 static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
9367 SDValue V1 = SDValue(),
9368 SDValue V2 = SDValue()) {
9369 int Size = Mask.size();
9370 if (Size != (int)ExpectedMask.size())
9371 return false;
9373 for (int i = 0; i < Size; ++i) {
9374 assert(Mask[i] >= -1 && "Out of bound mask element!");
9375 int MaskIdx = Mask[i];
9376 int ExpectedIdx = ExpectedMask[i];
9377 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
9378 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9379 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9380 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9381 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9382 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9383 return false;
9386 return true;
9389 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9391 /// The masks must be exactly the same width.
9393 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9394 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
9396 /// SM_SentinelZero is accepted as a valid negative index but must match in
9397 /// both, or via a known bits test.
9398 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
9399 ArrayRef<int> ExpectedMask,
9400 const SelectionDAG &DAG,
9401 SDValue V1 = SDValue(),
9402 SDValue V2 = SDValue()) {
9403 int Size = Mask.size();
9404 if (Size != (int)ExpectedMask.size())
9405 return false;
9406 assert(llvm::all_of(ExpectedMask,
9407 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
9408 "Illegal target shuffle mask");
9410 // Check for out-of-range target shuffle mask indices.
9411 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
9412 return false;
9414 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
9415 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
9416 !V1.getValueType().isVector()))
9417 V1 = SDValue();
9418 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
9419 !V2.getValueType().isVector()))
9420 V2 = SDValue();
9422 APInt ZeroV1 = APInt::getZero(Size);
9423 APInt ZeroV2 = APInt::getZero(Size);
9425 for (int i = 0; i < Size; ++i) {
9426 int MaskIdx = Mask[i];
9427 int ExpectedIdx = ExpectedMask[i];
9428 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
9429 continue;
9430 if (MaskIdx == SM_SentinelZero) {
9431 // If we need this expected index to be a zero element, then update the
9432 // relevant zero mask and perform the known bits at the end to minimize
9433 // repeated computes.
9434 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9435 if (ExpectedV &&
9436 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
9437 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9438 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
9439 ZeroMask.setBit(BitIdx);
9440 continue;
9443 if (MaskIdx >= 0) {
9444 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9445 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9446 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9447 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9448 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9449 continue;
9451 return false;
9453 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
9454 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
9457 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
9458 // instructions.
9459 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
9460 const SelectionDAG &DAG) {
9461 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9462 return false;
9464 SmallVector<int, 8> Unpcklwd;
9465 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9466 /* Unary = */ false);
9467 SmallVector<int, 8> Unpckhwd;
9468 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9469 /* Unary = */ false);
9470 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
9471 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
9472 return IsUnpackwdMask;
9475 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
9476 const SelectionDAG &DAG) {
9477 // Create 128-bit vector type based on mask size.
9478 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
9479 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
9481 // We can't assume a canonical shuffle mask, so try the commuted version too.
9482 SmallVector<int, 4> CommutedMask(Mask);
9483 ShuffleVectorSDNode::commuteMask(CommutedMask);
9485 // Match any of unary/binary or low/high.
9486 for (unsigned i = 0; i != 4; ++i) {
9487 SmallVector<int, 16> UnpackMask;
9488 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
9489 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
9490 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
9491 return true;
9493 return false;
9496 /// Return true if a shuffle mask chooses elements identically in its top and
9497 /// bottom halves. For example, any splat mask has the same top and bottom
9498 /// halves. If an element is undefined in only one half of the mask, the halves
9499 /// are not considered identical.
9500 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
9501 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
9502 unsigned HalfSize = Mask.size() / 2;
9503 for (unsigned i = 0; i != HalfSize; ++i) {
9504 if (Mask[i] != Mask[i + HalfSize])
9505 return false;
9507 return true;
9510 /// Get a 4-lane 8-bit shuffle immediate for a mask.
9512 /// This helper function produces an 8-bit shuffle immediate corresponding to
9513 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
9514 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
9515 /// example.
9517 /// NB: We rely heavily on "undef" masks preserving the input lane.
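/// e.g. Mask = <0, 1, 2, 3> encodes to (3 << 6) | (2 << 4) | (1 << 2) | 0 =
/// 0xE4 (the identity immediate), and <2, 3, 0, 1> encodes to 0x4E.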
9518 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9519 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9520 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
9521 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
9522 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
9523 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
9525 // If the mask only uses one non-undef element, then fully 'splat' it to
9526 // improve later broadcast matching.
9527 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
9528 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
9530 int FirstElt = Mask[FirstIndex];
9531 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
9532 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
9534 unsigned Imm = 0;
9535 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
9536 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
9537 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
9538 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
9539 return Imm;
9542 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
9543 SelectionDAG &DAG) {
9544 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
9547 // The shuffle result has the form:
9548 // 0* a[0] 0* a[1] ... 0* a[n], n >= 0, where 0* denotes zero or more zero
9549 // elements and the a[] elements appear in ascending order. Each Zeroable
9550 // element corresponds to a Mask element (see computeZeroableShuffleElements).
9552 // The function looks for a sub-mask whose non-zero elements are in
9553 // increasing order; if such a sub-mask exists, it returns true.
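// e.g. for v8i32 with Mask = <0, 0, 8, 9, 0, 10, 11, 12> and Zeroable set for
// elements 0, 1 and 4, the non-zero elements 8..12 appear in increasing order,
// so this returns true with IsZeroSideLeft == true.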
9554 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
9555 ArrayRef<int> Mask, const EVT &VectorType,
9556 bool &IsZeroSideLeft) {
9557 int NextElement = -1;
9558 // Check if the Mask's nonzero elements are in increasing order.
9559 for (int i = 0, e = Mask.size(); i < e; i++) {
9560 // Checks if the mask's zeros elements are built from only zeros.
9561 assert(Mask[i] >= -1 && "Out of bound mask element!");
9562 if (Mask[i] < 0)
9563 return false;
9564 if (Zeroable[i])
9565 continue;
9566 // Find the lowest non zero element
9567 if (NextElement < 0) {
9568 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
9569 IsZeroSideLeft = NextElement != 0;
9571 // Exit if the mask's non zero elements are not in increasing order.
9572 if (NextElement != Mask[i])
9573 return false;
9574 NextElement++;
9576 return true;
9579 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
9580 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
9581 ArrayRef<int> Mask, SDValue V1,
9582 SDValue V2, const APInt &Zeroable,
9583 const X86Subtarget &Subtarget,
9584 SelectionDAG &DAG) {
9585 int Size = Mask.size();
9586 int LaneSize = 128 / VT.getScalarSizeInBits();
9587 const int NumBytes = VT.getSizeInBits() / 8;
9588 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
9590 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
9591 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
9592 (Subtarget.hasBWI() && VT.is512BitVector()));
9594 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
9595 // Sign bit set in i8 mask means zero element.
9596 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
9598 SDValue V;
9599 for (int i = 0; i < NumBytes; ++i) {
9600 int M = Mask[i / NumEltBytes];
9601 if (M < 0) {
9602 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
9603 continue;
9605 if (Zeroable[i / NumEltBytes]) {
9606 PSHUFBMask[i] = ZeroMask;
9607 continue;
9610 // We can only use a single input of V1 or V2.
9611 SDValue SrcV = (M >= Size ? V2 : V1);
9612 if (V && V != SrcV)
9613 return SDValue();
9614 V = SrcV;
9615 M %= Size;
9617 // PSHUFB can't cross lanes, ensure this doesn't happen.
9618 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
9619 return SDValue();
9621 M = M % LaneSize;
9622 M = M * NumEltBytes + (i % NumEltBytes);
9623 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
9625 assert(V && "Failed to find a source input");
9627 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
9628 return DAG.getBitcast(
9629 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
9630 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
9633 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
9634 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9635 const SDLoc &dl);
9637 // X86 has dedicated shuffle that can be lowered to VEXPAND
9638 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
9639 const APInt &Zeroable,
9640 ArrayRef<int> Mask, SDValue &V1,
9641 SDValue &V2, SelectionDAG &DAG,
9642 const X86Subtarget &Subtarget) {
9643 bool IsLeftZeroSide = true;
9644 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
9645 IsLeftZeroSide))
9646 return SDValue();
9647 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
9648 MVT IntegerType =
9649 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9650 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
9651 unsigned NumElts = VT.getVectorNumElements();
9652 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
9653 "Unexpected number of vector elements");
9654 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
9655 Subtarget, DAG, DL);
9656 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
9657 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
9658 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
9661 static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
9662 unsigned &UnpackOpcode, bool IsUnary,
9663 ArrayRef<int> TargetMask, const SDLoc &DL,
9664 SelectionDAG &DAG,
9665 const X86Subtarget &Subtarget) {
9666 int NumElts = VT.getVectorNumElements();
9668 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
9669 for (int i = 0; i != NumElts; i += 2) {
9670 int M1 = TargetMask[i + 0];
9671 int M2 = TargetMask[i + 1];
9672 Undef1 &= (SM_SentinelUndef == M1);
9673 Undef2 &= (SM_SentinelUndef == M2);
9674 Zero1 &= isUndefOrZero(M1);
9675 Zero2 &= isUndefOrZero(M2);
9677 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
9678 "Zeroable shuffle detected");
9680 // Attempt to match the target mask against the unpack lo/hi mask patterns.
9681 SmallVector<int, 64> Unpckl, Unpckh;
9682 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
9683 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
9684 (IsUnary ? V1 : V2))) {
9685 UnpackOpcode = X86ISD::UNPCKL;
9686 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9687 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9688 return true;
9691 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
9692 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
9693 (IsUnary ? V1 : V2))) {
9694 UnpackOpcode = X86ISD::UNPCKH;
9695 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9696 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9697 return true;
9700 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
9701 if (IsUnary && (Zero1 || Zero2)) {
9702 // Don't bother if we can blend instead.
9703 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
9704 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
9705 return false;
9707 bool MatchLo = true, MatchHi = true;
9708 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
9709 int M = TargetMask[i];
9711 // Ignore if the input is known to be zero or the index is undef.
9712 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
9713 (M == SM_SentinelUndef))
9714 continue;
9716 MatchLo &= (M == Unpckl[i]);
9717 MatchHi &= (M == Unpckh[i]);
9720 if (MatchLo || MatchHi) {
9721 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
9722 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9723 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9724 return true;
9728 // If a binary shuffle, commute and try again.
9729 if (!IsUnary) {
9730 ShuffleVectorSDNode::commuteMask(Unpckl);
9731 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
9732 UnpackOpcode = X86ISD::UNPCKL;
9733 std::swap(V1, V2);
9734 return true;
9737 ShuffleVectorSDNode::commuteMask(Unpckh);
9738 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
9739 UnpackOpcode = X86ISD::UNPCKH;
9740 std::swap(V1, V2);
9741 return true;
9745 return false;
9748 // X86 has dedicated unpack instructions that can handle specific blend
9749 // operations: UNPCKH and UNPCKL.
9750 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
9751 ArrayRef<int> Mask, SDValue V1, SDValue V2,
9752 SelectionDAG &DAG) {
9753 SmallVector<int, 8> Unpckl;
9754 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
9755 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
9756 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
9758 SmallVector<int, 8> Unpckh;
9759 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
9760 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
9761 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
9763 // Commute and try again.
9764 ShuffleVectorSDNode::commuteMask(Unpckl);
9765 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
9766 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
9768 ShuffleVectorSDNode::commuteMask(Unpckh);
9769 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
9770 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
9772 return SDValue();
9775 /// Check if the mask can be lowered as a preliminary permute of 64-bit
9776 /// elements (VPERMQ/VPERMPD) followed by a 256-bit unpack.
9777 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
9778 ArrayRef<int> Mask, SDValue V1,
9779 SDValue V2, SelectionDAG &DAG) {
9780 SmallVector<int, 32> Unpckl, Unpckh;
9781 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
9782 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
9784 unsigned UnpackOpcode;
9785 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
9786 UnpackOpcode = X86ISD::UNPCKL;
9787 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
9788 UnpackOpcode = X86ISD::UNPCKH;
9789 else
9790 return SDValue();
9792 // This is a "natural" unpack operation (rather than the 128-bit sectored
9793 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
9794 // input in order to use the x86 instruction.
9795 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
9796 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
9797 V1 = DAG.getBitcast(VT, V1);
9798 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
9801 // Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
9802 // source into the lower elements and zeroing the upper elements.
9803 static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
9804 ArrayRef<int> Mask, const APInt &Zeroable,
9805 const X86Subtarget &Subtarget) {
9806 if (!VT.is512BitVector() && !Subtarget.hasVLX())
9807 return false;
9809 unsigned NumElts = Mask.size();
9810 unsigned EltSizeInBits = VT.getScalarSizeInBits();
9811 unsigned MaxScale = 64 / EltSizeInBits;
9813 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
9814 unsigned SrcEltBits = EltSizeInBits * Scale;
9815 if (SrcEltBits < 32 && !Subtarget.hasBWI())
9816 continue;
9817 unsigned NumSrcElts = NumElts / Scale;
9818 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
9819 continue;
9820 unsigned UpperElts = NumElts - NumSrcElts;
9821 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
9822 continue;
9823 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
9824 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
9825 DstVT = MVT::getIntegerVT(EltSizeInBits);
9826 if ((NumSrcElts * EltSizeInBits) >= 128) {
9827 // ISD::TRUNCATE
9828 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
9829 } else {
9830 // X86ISD::VTRUNC
9831 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
9833 return true;
9836 return false;
9839 // Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
9840 // element padding to the final DstVT.
9841 static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
9842 const X86Subtarget &Subtarget,
9843 SelectionDAG &DAG, bool ZeroUppers) {
9844 MVT SrcVT = Src.getSimpleValueType();
9845 MVT DstSVT = DstVT.getScalarType();
9846 unsigned NumDstElts = DstVT.getVectorNumElements();
9847 unsigned NumSrcElts = SrcVT.getVectorNumElements();
9848 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
9850 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
9851 return SDValue();
9853 // Perform a direct ISD::TRUNCATE if possible.
9854 if (NumSrcElts == NumDstElts)
9855 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
9857 if (NumSrcElts > NumDstElts) {
9858 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
9859 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
9860 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
9863 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
9864 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
9865 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
9866 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
9867 DstVT.getSizeInBits());
9870 // Non-VLX targets must truncate from a 512-bit type, so we need to
9871 // widen, truncate and then possibly extract the original subvector.
9872 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
9873 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
9874 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
9877 // Fallback to a X86ISD::VTRUNC, padding if necessary.
9878 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
9879 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
9880 if (DstVT != TruncVT)
9881 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
9882 DstVT.getSizeInBits());
9883 return Trunc;
9886 // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
9888 // An example is the following:
9890 // t0: ch = EntryToken
9891 // t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
9892 // t25: v4i32 = truncate t2
9893 // t41: v8i16 = bitcast t25
9894 // t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
9895 // Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
9896 // t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
9897 // t18: v2i64 = bitcast t51
9899 // A single vpmovdw instruction suffices; without AVX512VL we need to use the
9900 // zmm variant and extract the lower subvector, padding with zeroes.
9901 // TODO: Merge with lowerShuffleAsVTRUNC.
9902 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
9903 SDValue V2, ArrayRef<int> Mask,
9904 const APInt &Zeroable,
9905 const X86Subtarget &Subtarget,
9906 SelectionDAG &DAG) {
9907 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
9908 if (!Subtarget.hasAVX512())
9909 return SDValue();
9911 unsigned NumElts = VT.getVectorNumElements();
9912 unsigned EltSizeInBits = VT.getScalarSizeInBits();
9913 unsigned MaxScale = 64 / EltSizeInBits;
9914 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
9915 unsigned SrcEltBits = EltSizeInBits * Scale;
9916 unsigned NumSrcElts = NumElts / Scale;
9917 unsigned UpperElts = NumElts - NumSrcElts;
9918 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
9919 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
9920 continue;
9922 // Attempt to find a matching source truncation, but as a fall back VLX
9923 // cases can use the VPMOV directly.
9924 SDValue Src = peekThroughBitcasts(V1);
9925 if (Src.getOpcode() == ISD::TRUNCATE &&
9926 Src.getScalarValueSizeInBits() == SrcEltBits) {
9927 Src = Src.getOperand(0);
9928 } else if (Subtarget.hasVLX()) {
9929 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
9930 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
9931 Src = DAG.getBitcast(SrcVT, Src);
9932 // Don't do this if PACKSS/PACKUS could perform it cheaper.
9933 if (Scale == 2 &&
9934 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
9935 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
9936 return SDValue();
9937 } else
9938 return SDValue();
9940 // VPMOVWB is only available with avx512bw.
9941 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
9942 return SDValue();
9944 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
9945 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
9948 return SDValue();
9951 // Attempt to match binary shuffle patterns as a truncate.
9952 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
9953 SDValue V2, ArrayRef<int> Mask,
9954 const APInt &Zeroable,
9955 const X86Subtarget &Subtarget,
9956 SelectionDAG &DAG) {
9957 assert((VT.is128BitVector() || VT.is256BitVector()) &&
9958 "Unexpected VTRUNC type");
9959 if (!Subtarget.hasAVX512())
9960 return SDValue();
9962 unsigned NumElts = VT.getVectorNumElements();
9963 unsigned EltSizeInBits = VT.getScalarSizeInBits();
9964 unsigned MaxScale = 64 / EltSizeInBits;
9965 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
9966 // TODO: Support non-BWI VPMOVWB truncations?
9967 unsigned SrcEltBits = EltSizeInBits * Scale;
9968 if (SrcEltBits < 32 && !Subtarget.hasBWI())
9969 continue;
9971 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
9972 // Bail if the V2 elements are undef.
9973 unsigned NumHalfSrcElts = NumElts / Scale;
9974 unsigned NumSrcElts = 2 * NumHalfSrcElts;
9975 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
9976 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
9977 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
9978 continue;
9980 // The elements beyond the truncation must be undef/zero.
9981 unsigned UpperElts = NumElts - NumSrcElts;
9982 if (UpperElts > 0 &&
9983 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
9984 continue;
9985 bool UndefUppers =
9986 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
9988 // For offset truncations, ensure that the concat is cheap.
9989 if (Offset) {
9990 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
9991 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
9992 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
9993 return Lo.getOperand(0) == Hi.getOperand(0);
9994 if (ISD::isNormalLoad(Lo.getNode()) &&
9995 ISD::isNormalLoad(Hi.getNode())) {
9996 auto *LDLo = cast<LoadSDNode>(Lo);
9997 auto *LDHi = cast<LoadSDNode>(Hi);
9998 return DAG.areNonVolatileConsecutiveLoads(
9999 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
10001 return false;
10003 if (!IsCheapConcat(V1, V2))
10004 continue;
10007       // As we're using both sources, we need to concat them together
10008       // and truncate from the double-sized source.
10009 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
10010 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10012 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10013 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10014 Src = DAG.getBitcast(SrcVT, Src);
10016 // Shift the offset'd elements into place for the truncation.
10017 // TODO: Use getTargetVShiftByConstNode.
10018 if (Offset)
10019 Src = DAG.getNode(
10020 X86ISD::VSRLI, DL, SrcVT, Src,
10021 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10023 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10027 return SDValue();
10030 /// Check whether a compaction lowering can be done by dropping even/odd
10031 /// elements and compute how many times even/odd elements must be dropped.
10033 /// This handles shuffles which take every Nth element where N is a power of
10034 /// two. Example shuffle masks:
10036 /// (even)
10037 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10038 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10039 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10040 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10041 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10042 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10044 /// (odd)
10045 /// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10046 /// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10048 /// Any of these lanes can of course be undef.
10050 /// This routine only supports N <= 3.
10051 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10052 /// for larger N.
10054 /// \returns N above, or the number of times even/odd elements must be dropped
10055 /// if there is such a number. Otherwise returns zero.
10056 static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10057 bool IsSingleInput) {
10058 // The modulus for the shuffle vector entries is based on whether this is
10059 // a single input or not.
10060 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10061 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10062 "We should only be called with masks with a power-of-2 size!");
10064 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10065 int Offset = MatchEven ? 0 : 1;
10067 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10068 // and 2^3 simultaneously. This is because we may have ambiguity with
10069 // partially undef inputs.
10070 bool ViableForN[3] = {true, true, true};
10072 for (int i = 0, e = Mask.size(); i < e; ++i) {
10073     // Ignore undef lanes; we'll optimistically collapse them to the pattern we
10074 // want.
10075 if (Mask[i] < 0)
10076 continue;
10078 bool IsAnyViable = false;
10079 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10080 if (ViableForN[j]) {
10081 uint64_t N = j + 1;
10083 // The shuffle mask must be equal to (i * 2^N) % M.
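// e.g. for the even (Offset == 0), single-input, 16-element case the N == 2
// pattern above is exactly (i * 4) % 16: 0, 4, 8, 12, 0, 4, 8, 12, ...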
10084 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10085 IsAnyViable = true;
10086 else
10087 ViableForN[j] = false;
10089 // Early exit if we exhaust the possible powers of two.
10090 if (!IsAnyViable)
10091 break;
10094 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10095 if (ViableForN[j])
10096 return j + 1;
10098 // Return 0 as there is no viable power of two.
10099 return 0;
10102 // X86 has dedicated pack instructions that can handle specific truncation
10103 // operations: PACKSS and PACKUS.
10104 // Checks for compaction shuffle masks if MaxStages > 1.
10105 // TODO: Add support for matching multiple PACKSS/PACKUS stages.
10106 static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10107 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10108 const SelectionDAG &DAG,
10109 const X86Subtarget &Subtarget,
10110 unsigned MaxStages = 1) {
10111 unsigned NumElts = VT.getVectorNumElements();
10112 unsigned BitSize = VT.getScalarSizeInBits();
10113 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10114 "Illegal maximum compaction");
10116 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10117 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10118 unsigned NumPackedBits = NumSrcBits - BitSize;
10119 N1 = peekThroughBitcasts(N1);
10120 N2 = peekThroughBitcasts(N2);
10121 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10122 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10123 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10124 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10125 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10126 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10127 return false;
10128 if (Subtarget.hasSSE41() || BitSize == 8) {
10129 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10130 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10131 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10132 V1 = N1;
10133 V2 = N2;
10134 SrcVT = PackVT;
10135 PackOpcode = X86ISD::PACKUS;
10136 return true;
10139 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10140 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10141 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10142 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10143 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10144 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10145 V1 = N1;
10146 V2 = N2;
10147 SrcVT = PackVT;
10148 PackOpcode = X86ISD::PACKSS;
10149 return true;
10151 return false;
10154 // Attempt to match against wider and wider compaction patterns.
10155 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10156 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10157 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10159 // Try binary shuffle.
10160 SmallVector<int, 32> BinaryMask;
10161 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10162 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10163 if (MatchPACK(V1, V2, PackVT))
10164 return true;
10166 // Try unary shuffle.
10167 SmallVector<int, 32> UnaryMask;
10168 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10169 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10170 if (MatchPACK(V1, V1, PackVT))
10171 return true;
10174 return false;
10177 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
10178 SDValue V1, SDValue V2, SelectionDAG &DAG,
10179 const X86Subtarget &Subtarget) {
10180 MVT PackVT;
10181 unsigned PackOpcode;
10182 unsigned SizeBits = VT.getSizeInBits();
10183 unsigned EltBits = VT.getScalarSizeInBits();
10184 unsigned MaxStages = Log2_32(64 / EltBits);
10185 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10186 Subtarget, MaxStages))
10187 return SDValue();
10189 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10190 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10192 // Don't lower multi-stage packs on AVX512, truncation is better.
10193 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10194 return SDValue();
10196 // Pack to the largest type possible:
10197 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10198 unsigned MaxPackBits = 16;
10199 if (CurrentEltBits > 16 &&
10200 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10201 MaxPackBits = 32;
10203 // Repeatedly pack down to the target size.
10204 SDValue Res;
10205 for (unsigned i = 0; i != NumStages; ++i) {
10206 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10207 unsigned NumSrcElts = SizeBits / SrcEltBits;
10208 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10209 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10210 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10211 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10212 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10213 DAG.getBitcast(SrcVT, V2));
10214 V1 = V2 = Res;
10215 CurrentEltBits /= 2;
10217 assert(Res && Res.getValueType() == VT &&
10218 "Failed to lower compaction shuffle");
10219 return Res;
10222 /// Try to emit a bitmask instruction for a shuffle.
10224 /// This handles cases where we can model a blend exactly as a bitmask due to
10225 /// one of the inputs being zeroable.
10226 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10227 SDValue V2, ArrayRef<int> Mask,
10228 const APInt &Zeroable,
10229 const X86Subtarget &Subtarget,
10230 SelectionDAG &DAG) {
10231 MVT MaskVT = VT;
10232 MVT EltVT = VT.getVectorElementType();
10233 SDValue Zero, AllOnes;
10234 // Use f64 if i64 isn't legal.
10235 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10236 EltVT = MVT::f64;
10237 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10240 MVT LogicVT = VT;
10241 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10242 Zero = DAG.getConstantFP(0.0, DL, EltVT);
10243 APFloat AllOnesValue =
10244 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
10245 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
10246 LogicVT =
10247 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
10248 } else {
10249 Zero = DAG.getConstant(0, DL, EltVT);
10250 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10253 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
10254 SDValue V;
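// e.g. for a v4i32 shuffle <0,1,z,3> (element 2 zeroable) this selects V1 and
// builds the mask <-1,-1,0,-1>, so the blend becomes a single AND.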
10255 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10256 if (Zeroable[i])
10257 continue;
10258 if (Mask[i] % Size != i)
10259 return SDValue(); // Not a blend.
10260 if (!V)
10261 V = Mask[i] < Size ? V1 : V2;
10262 else if (V != (Mask[i] < Size ? V1 : V2))
10263 return SDValue(); // Can only let one input through the mask.
10265 VMaskOps[i] = AllOnes;
10267 if (!V)
10268 return SDValue(); // No non-zeroable elements!
10270 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
10271 VMask = DAG.getBitcast(LogicVT, VMask);
10272 V = DAG.getBitcast(LogicVT, V);
10273 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
10274 return DAG.getBitcast(VT, And);
10277 /// Try to emit a blend instruction for a shuffle using bit math.
10279 /// This is used as a fallback approach when first class blend instructions are
10280 /// unavailable. Currently it is only suitable for integer vectors, but could
10281 /// be generalized for floating point vectors if desirable.
10282 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
10283 SDValue V2, ArrayRef<int> Mask,
10284 SelectionDAG &DAG) {
10285 assert(VT.isInteger() && "Only supports integer vector types!");
10286 MVT EltVT = VT.getVectorElementType();
10287 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10288 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
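// e.g. a v8i16 shuffle <0,9,2,11,4,13,6,15> builds the mask
// <-1,0,-1,0,-1,0,-1,0>, selecting V1 where the mask is all-ones and V2 where
// it is zero.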
10289 SmallVector<SDValue, 16> MaskOps;
10290 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10291 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
10292 return SDValue(); // Shuffled input!
10293 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
10296 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
10297 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
10300 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
10301 SDValue PreservedSrc,
10302 const X86Subtarget &Subtarget,
10303 SelectionDAG &DAG);
10305 static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
10306 MutableArrayRef<int> Mask,
10307 const APInt &Zeroable, bool &ForceV1Zero,
10308 bool &ForceV2Zero, uint64_t &BlendMask) {
10309 bool V1IsZeroOrUndef =
10310 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
10311 bool V2IsZeroOrUndef =
10312 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
10314 BlendMask = 0;
10315 ForceV1Zero = false, ForceV2Zero = false;
10316 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
10318 int NumElts = Mask.size();
10319 int NumLanes = VT.getSizeInBits() / 128;
10320 int NumEltsPerLane = NumElts / NumLanes;
10321 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
10323 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10324 // then ensure the blend mask part for that lane just references that input.
10325 bool ForceWholeLaneMasks =
10326 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
10328 // Attempt to generate the binary blend mask. If an input is zero then
10329 // we can use any lane.
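// e.g. for a v8i32 shuffle <0,9,2,11,4,13,6,15> every odd element comes from
// V2, so the loop below builds BlendMask == 0b10101010.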
10330 for (int Lane = 0; Lane != NumLanes; ++Lane) {
10331 // Keep track of the inputs used per lane.
10332 bool LaneV1InUse = false;
10333 bool LaneV2InUse = false;
10334 uint64_t LaneBlendMask = 0;
10335 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
10336 int Elt = (Lane * NumEltsPerLane) + LaneElt;
10337 int M = Mask[Elt];
10338 if (M == SM_SentinelUndef)
10339 continue;
10340 if (M == Elt || (0 <= M && M < NumElts &&
10341 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
10342 Mask[Elt] = Elt;
10343 LaneV1InUse = true;
10344 continue;
10346 if (M == (Elt + NumElts) ||
10347 (NumElts <= M &&
10348 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10349 LaneBlendMask |= 1ull << LaneElt;
10350 Mask[Elt] = Elt + NumElts;
10351 LaneV2InUse = true;
10352 continue;
10354 if (Zeroable[Elt]) {
10355 if (V1IsZeroOrUndef) {
10356 ForceV1Zero = true;
10357 Mask[Elt] = Elt;
10358 LaneV1InUse = true;
10359 continue;
10361 if (V2IsZeroOrUndef) {
10362 ForceV2Zero = true;
10363 LaneBlendMask |= 1ull << LaneElt;
10364 Mask[Elt] = Elt + NumElts;
10365 LaneV2InUse = true;
10366 continue;
10369 return false;
10372 // If we only used V2 then splat the lane blend mask to avoid any demanded
10373 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
10374 // blend mask bit).
10375 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
10376 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10378 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
10380 return true;
10383 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
10384 int Scale) {
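// Expands each blend-mask bit into Scale consecutive bits, e.g. scaling the
// v8i16 blend mask 0b00001111 by 2 for a v16i8 blend gives 0b0000000011111111.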
10385 uint64_t ScaledMask = 0;
10386 for (int i = 0; i != Size; ++i)
10387 if (BlendMask & (1ull << i))
10388 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
10389 return ScaledMask;
10392 /// Try to emit a blend instruction for a shuffle.
10394 /// This doesn't do any checks for the availability of instructions for blending
10395 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
10396 /// be matched in the backend with the type given. What it does check for is
10397 /// that the shuffle mask is a blend, or convertible into a blend with zero.
10398 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
10399 SDValue V2, ArrayRef<int> Original,
10400 const APInt &Zeroable,
10401 const X86Subtarget &Subtarget,
10402 SelectionDAG &DAG) {
10403 uint64_t BlendMask = 0;
10404 bool ForceV1Zero = false, ForceV2Zero = false;
10405 SmallVector<int, 64> Mask(Original);
10406 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
10407 BlendMask))
10408 return SDValue();
10410 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10411 if (ForceV1Zero)
10412 V1 = getZeroVector(VT, Subtarget, DAG, DL);
10413 if (ForceV2Zero)
10414 V2 = getZeroVector(VT, Subtarget, DAG, DL);
10416 unsigned NumElts = VT.getVectorNumElements();
10418 switch (VT.SimpleTy) {
10419 case MVT::v4i64:
10420 case MVT::v8i32:
10421 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10422 [[fallthrough]];
10423 case MVT::v4f64:
10424 case MVT::v8f32:
10425 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10426 [[fallthrough]];
10427 case MVT::v2f64:
10428 case MVT::v2i64:
10429 case MVT::v4f32:
10430 case MVT::v4i32:
10431 case MVT::v8i16:
10432 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10433 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
10434 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10435 case MVT::v16i16: {
10436 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
10437 SmallVector<int, 8> RepeatedMask;
10438 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
10439 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10440 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
10441 BlendMask = 0;
10442 for (int i = 0; i < 8; ++i)
10443 if (RepeatedMask[i] >= 8)
10444 BlendMask |= 1ull << i;
10445 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10446 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10448 // Use PBLENDW for lower/upper lanes and then blend lanes.
10449 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10450 // merge to VSELECT where useful.
10451 uint64_t LoMask = BlendMask & 0xFF;
10452 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
10453 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
10454 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10455 DAG.getTargetConstant(LoMask, DL, MVT::i8));
10456 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10457 DAG.getTargetConstant(HiMask, DL, MVT::i8));
10458 return DAG.getVectorShuffle(
10459 MVT::v16i16, DL, Lo, Hi,
10460 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
10462 [[fallthrough]];
10464 case MVT::v32i8:
10465 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10466 [[fallthrough]];
10467 case MVT::v16i8: {
10468 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
10470 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
10471 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10472 Subtarget, DAG))
10473 return Masked;
10475 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
10476 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10477 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10478 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10481 // If we have VPTERNLOG, we can use that as a bit blend.
10482 if (Subtarget.hasVLX())
10483 if (SDValue BitBlend =
10484 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
10485 return BitBlend;
10487 // Scale the blend by the number of bytes per element.
10488 int Scale = VT.getScalarSizeInBits() / 8;
10490 // This form of blend is always done on bytes. Compute the byte vector
10491 // type.
10492 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10494 // x86 allows load folding with blendvb from the 2nd source operand. But
10495 // we are still using LLVM select here (see comment below), so that's V1.
10496 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
10497 // allow that load-folding possibility.
10498 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
10499 ShuffleVectorSDNode::commuteMask(Mask);
10500 std::swap(V1, V2);
10503 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
10504 // mix of LLVM's code generator and the x86 backend. We tell the code
10505 // generator that boolean values in the elements of an x86 vector register
10506 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
10507 // mapping a select to operand #1, and 'false' mapping to operand #2. The
10508 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
10509 // of the element (the remaining are ignored) and 0 in that high bit would
10510 // mean operand #1 while 1 in the high bit would mean operand #2. So while
10511 // the LLVM model for boolean values in vector elements gets the relevant
10512     // bit set, it is set backwards and over-constrained relative to x86's
10513 // actual model.
10514 SmallVector<SDValue, 32> VSELECTMask;
10515 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10516 for (int j = 0; j < Scale; ++j)
10517 VSELECTMask.push_back(
10518 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
10519 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
10520 MVT::i8));
10522 V1 = DAG.getBitcast(BlendVT, V1);
10523 V2 = DAG.getBitcast(BlendVT, V2);
10524 return DAG.getBitcast(
10526 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
10527 V1, V2));
10529 case MVT::v16f32:
10530 case MVT::v8f64:
10531 case MVT::v8i64:
10532 case MVT::v16i32:
10533 case MVT::v32i16:
10534 case MVT::v64i8: {
10535 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
10536 bool OptForSize = DAG.shouldOptForSize();
10537 if (!OptForSize) {
10538 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10539 Subtarget, DAG))
10540 return Masked;
10543 // Otherwise load an immediate into a GPR, cast to k-register, and use a
10544 // masked move.
10545 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10546 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10547 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10549 default:
10550 llvm_unreachable("Not a supported integer vector type!");
10554 /// Try to lower as a blend of elements from two inputs followed by
10555 /// a single-input permutation.
10557 /// This matches the pattern where we can blend elements from two inputs and
10558 /// then reduce the shuffle to a single-input permutation.
10559 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
10560 SDValue V1, SDValue V2,
10561 ArrayRef<int> Mask,
10562 SelectionDAG &DAG,
10563 bool ImmBlends = false) {
10564 // We build up the blend mask while checking whether a blend is a viable way
10565 // to reduce the shuffle.
10566 SmallVector<int, 32> BlendMask(Mask.size(), -1);
10567 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
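// e.g. the v4i32 mask <5,0,7,2> is matched as the blend <0,5,2,7> followed by
// the single-input permute <1,0,3,2>.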
10569 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10570 if (Mask[i] < 0)
10571 continue;
10573 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
10575 if (BlendMask[Mask[i] % Size] < 0)
10576 BlendMask[Mask[i] % Size] = Mask[i];
10577 else if (BlendMask[Mask[i] % Size] != Mask[i])
10578 return SDValue(); // Can't blend in the needed input!
10580 PermuteMask[i] = Mask[i] % Size;
10583   // If we're limited to immediate blends, bail if the blend mask can't be
10584   // widened to i16.
10585 unsigned EltSize = VT.getScalarSizeInBits();
10586 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
10587 return SDValue();
10589 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
10590 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
10593 /// Try to lower as an unpack of elements from two inputs followed by
10594 /// a single-input permutation.
10596 /// This matches the pattern where we can unpack elements from two inputs and
10597 /// then reduce the shuffle to a single-input (wider) permutation.
10598 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
10599 SDValue V1, SDValue V2,
10600 ArrayRef<int> Mask,
10601 SelectionDAG &DAG) {
10602 int NumElts = Mask.size();
10603 int NumLanes = VT.getSizeInBits() / 128;
10604 int NumLaneElts = NumElts / NumLanes;
10605 int NumHalfLaneElts = NumLaneElts / 2;
10607 bool MatchLo = true, MatchHi = true;
10608 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
10610 // Determine UNPCKL/UNPCKH type and operand order.
10611 for (int Elt = 0; Elt != NumElts; ++Elt) {
10612 int M = Mask[Elt];
10613 if (M < 0)
10614 continue;
10616 // Normalize the mask value depending on whether it's V1 or V2.
10617 int NormM = M;
10618 SDValue &Op = Ops[Elt & 1];
10619 if (M < NumElts && (Op.isUndef() || Op == V1))
10620 Op = V1;
10621 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
10622 Op = V2;
10623 NormM -= NumElts;
10624 } else
10625 return SDValue();
10627 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
10628 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
10629 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
10630 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
10631 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
10632 if (MatchLoAnyLane || MatchHiAnyLane) {
10633 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
10634 "Failed to match UNPCKLO/UNPCKHI");
10635 break;
10638 MatchLo &= MatchLoAnyLane;
10639 MatchHi &= MatchHiAnyLane;
10640 if (!MatchLo && !MatchHi)
10641 return SDValue();
10643 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
10645   // Element indices have changed after unpacking. Calculate a permute mask
10646   // so that they are put back into the positions dictated by the
10647   // original shuffle mask indices.
10648 SmallVector<int, 32> PermuteMask(NumElts, -1);
10649 for (int Elt = 0; Elt != NumElts; ++Elt) {
10650 int M = Mask[Elt];
10651 if (M < 0)
10652 continue;
10653 int NormM = M;
10654 if (NumElts <= M)
10655 NormM -= NumElts;
10656 bool IsFirstOp = M < NumElts;
10657 int BaseMaskElt =
10658 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
10659 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
10660 PermuteMask[Elt] = BaseMaskElt;
10661 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
10662 PermuteMask[Elt] = BaseMaskElt + 1;
10663 assert(PermuteMask[Elt] != -1 &&
10664 "Input mask element is defined but failed to assign permute mask");
10667 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10668 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
10669 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
10672 /// Try to lower a shuffle as a permute of the inputs followed by an
10673 /// UNPCK instruction.
10675 /// This specifically targets cases where we end up alternating between
10676 /// the two inputs, and so can permute them into something that feeds a single
10677 /// UNPCK instruction. Note that this routine only targets integer vectors
10678 /// because for floating point vectors we have a generalized SHUFPS lowering
10679 /// strategy that handles everything that doesn't *exactly* match an unpack,
10680 /// making this clever lowering unnecessary.
10681 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10682 SDValue V1, SDValue V2,
10683 ArrayRef<int> Mask,
10684 const X86Subtarget &Subtarget,
10685 SelectionDAG &DAG) {
10686 int Size = Mask.size();
10687 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10689 // This routine only supports 128-bit integer dual input vectors.
10690 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
10691 return SDValue();
10693 int NumLoInputs =
10694 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10695 int NumHiInputs =
10696 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10698 bool UnpackLo = NumLoInputs >= NumHiInputs;
10700 auto TryUnpack = [&](int ScalarSize, int Scale) {
10701 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10702 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10704 for (int i = 0; i < Size; ++i) {
10705 if (Mask[i] < 0)
10706 continue;
10708 // Each element of the unpack contains Scale elements from this mask.
10709 int UnpackIdx = i / Scale;
10711 // We only handle the case where V1 feeds the first slots of the unpack.
10712 // We rely on canonicalization to ensure this is the case.
10713 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10714 return SDValue();
10716       // Set up the mask for this input. The indexing is tricky as we have to
10717 // handle the unpack stride.
10718 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10719 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10720 Mask[i] % Size;
10723 // If we will have to shuffle both inputs to use the unpack, check whether
10724 // we can just unpack first and shuffle the result. If so, skip this unpack.
10725 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10726 !isNoopShuffleMask(V2Mask))
10727 return SDValue();
10729 // Shuffle the inputs into place.
10730 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10731 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10733 // Cast the inputs to the type we will use to unpack them.
10734 MVT UnpackVT =
10735 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10736 V1 = DAG.getBitcast(UnpackVT, V1);
10737 V2 = DAG.getBitcast(UnpackVT, V2);
10739 // Unpack the inputs and cast the result back to the desired type.
10740 return DAG.getBitcast(
10741 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10742 UnpackVT, V1, V2));
10745 // We try each unpack from the largest to the smallest to try and find one
10746 // that fits this mask.
10747 int OrigScalarSize = VT.getScalarSizeInBits();
10748 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10749 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10750 return Unpack;
10752 // If we're shuffling with a zero vector then we're better off not doing
10753 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
10754 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
10755 ISD::isBuildVectorAllZeros(V2.getNode()))
10756 return SDValue();
10758 // If none of the unpack-rooted lowerings worked (or were profitable) try an
10759 // initial unpack.
10760 if (NumLoInputs == 0 || NumHiInputs == 0) {
10761 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10762 "We have to have *some* inputs!");
10763 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10765 // FIXME: We could consider the total complexity of the permute of each
10766 // possible unpacking. Or at the least we should consider how many
10767 // half-crossings are created.
10768 // FIXME: We could consider commuting the unpacks.
10770 SmallVector<int, 32> PermMask((unsigned)Size, -1);
10771 for (int i = 0; i < Size; ++i) {
10772 if (Mask[i] < 0)
10773 continue;
10775 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10777 PermMask[i] =
10778 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
10780 return DAG.getVectorShuffle(
10781 VT, DL,
10782 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
10783 V1, V2),
10784 DAG.getUNDEF(VT), PermMask);
10787 return SDValue();
10790 /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
10791 /// permuting the elements of the result in place.
10792 static SDValue lowerShuffleAsByteRotateAndPermute(
10793 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10794 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10795 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
10796 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
10797 (VT.is512BitVector() && !Subtarget.hasBWI()))
10798 return SDValue();
10800 // We don't currently support lane crossing permutes.
10801 if (is128BitLaneCrossingShuffleMask(VT, Mask))
10802 return SDValue();
10804 int Scale = VT.getScalarSizeInBits() / 8;
10805 int NumLanes = VT.getSizeInBits() / 128;
10806 int NumElts = VT.getVectorNumElements();
10807 int NumEltsPerLane = NumElts / NumLanes;
10809 // Determine range of mask elts.
10810 bool Blend1 = true;
10811 bool Blend2 = true;
10812 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
10813 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
10814 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
10815 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
10816 int M = Mask[Lane + Elt];
10817 if (M < 0)
10818 continue;
10819 if (M < NumElts) {
10820 Blend1 &= (M == (Lane + Elt));
10821 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
10822 M = M % NumEltsPerLane;
10823 Range1.first = std::min(Range1.first, M);
10824 Range1.second = std::max(Range1.second, M);
10825 } else {
10826 M -= NumElts;
10827 Blend2 &= (M == (Lane + Elt));
10828 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
10829 M = M % NumEltsPerLane;
10830 Range2.first = std::min(Range2.first, M);
10831 Range2.second = std::max(Range2.second, M);
10836 // Bail if we don't need both elements.
10837 // TODO - it might be worth doing this for unary shuffles if the permute
10838 // can be widened.
10839 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
10840 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
10841 return SDValue();
10843 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
10844 return SDValue();
10846 // Rotate the 2 ops so we can access both ranges, then permute the result.
10847 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
10848 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10849 SDValue Rotate = DAG.getBitcast(
10850 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
10851 DAG.getBitcast(ByteVT, Lo),
10852 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
10853 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
10854 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
10855 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
10856 int M = Mask[Lane + Elt];
10857 if (M < 0)
10858 continue;
10859 if (M < NumElts)
10860 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
10861 else
10862 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
10865 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
10868 // Check if the ranges are small enough to rotate from either direction.
10869 if (Range2.second < Range1.first)
10870 return RotateAndPermute(V1, V2, Range1.first, 0);
10871 if (Range1.second < Range2.first)
10872 return RotateAndPermute(V2, V1, Range2.first, NumElts);
10873 return SDValue();
10876 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
10877 return isUndefOrEqual(Mask, 0);
10880 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
10881 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
10884 /// Check if the Mask consists of the same element repeated multiple times.
10885 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
10886 size_t NumUndefs = 0;
10887 std::optional<int> UniqueElt;
10888 for (int Elt : Mask) {
10889 if (Elt == SM_SentinelUndef) {
10890 NumUndefs++;
10891 continue;
10893 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
10894 return false;
10895 UniqueElt = Elt;
10897 // Make sure the element is repeated enough times by checking the number of
10898 // undefs is small.
10899 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
10902 /// Generic routine to decompose a shuffle and blend into independent
10903 /// blends and permutes.
10905 /// This matches the extremely common pattern for handling combined
10906 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
10907 /// operations. It will try to pick the best arrangement of shuffles and
10908 /// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
10909 static SDValue lowerShuffleAsDecomposedShuffleMerge(
10910 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10911 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10912 int NumElts = Mask.size();
10913 int NumLanes = VT.getSizeInBits() / 128;
10914 int NumEltsPerLane = NumElts / NumLanes;
10916 // Shuffle the input elements into the desired positions in V1 and V2 and
10917 // unpack/blend them together.
10918 bool IsAlternating = true;
10919 SmallVector<int, 32> V1Mask(NumElts, -1);
10920 SmallVector<int, 32> V2Mask(NumElts, -1);
10921 SmallVector<int, 32> FinalMask(NumElts, -1);
10922 for (int i = 0; i < NumElts; ++i) {
10923 int M = Mask[i];
10924 if (M >= 0 && M < NumElts) {
10925 V1Mask[i] = M;
10926 FinalMask[i] = i;
10927 IsAlternating &= (i & 1) == 0;
10928 } else if (M >= NumElts) {
10929 V2Mask[i] = M - NumElts;
10930 FinalMask[i] = i + NumElts;
10931 IsAlternating &= (i & 1) == 1;
10935   // If we effectively demand only the 0'th element of \p Input (though not
10936   // necessarily only in the 0'th position), then broadcast said input
10937   // and change \p InputMask to be a no-op (identity) mask.
10938 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
10939 &DAG](SDValue &Input,
10940 MutableArrayRef<int> InputMask) {
10941 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
10942 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
10943 !X86::mayFoldLoad(Input, Subtarget)))
10944 return;
10945 if (isNoopShuffleMask(InputMask))
10946 return;
10947 assert(isBroadcastShuffleMask(InputMask) &&
10948 "Expected to demand only the 0'th element.");
10949 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
10950 for (auto I : enumerate(InputMask)) {
10951 int &InputMaskElt = I.value();
10952 if (InputMaskElt >= 0)
10953 InputMaskElt = I.index();
10957 // Currently, we may need to produce one shuffle per input, and blend results.
10958 // It is possible that the shuffle for one of the inputs is already a no-op.
10959 // See if we can simplify non-no-op shuffles into broadcasts,
10960 // which we consider to be strictly better than an arbitrary shuffle.
10961 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
10962 isNoopOrBroadcastShuffleMask(V2Mask)) {
10963 canonicalizeBroadcastableInput(V1, V1Mask);
10964 canonicalizeBroadcastableInput(V2, V2Mask);
10967 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
10968 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
10969 // the shuffle may be able to fold with a load or other benefit. However, when
10970 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
10971 // pre-shuffle first is a better strategy.
10972 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
10973 // Only prefer immediate blends to unpack/rotate.
10974 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
10975 DAG, true))
10976 return BlendPerm;
10977 // If either input vector provides only a single element which is repeated
10978 // multiple times, unpacking from both input vectors would generate worse
10979 // code. e.g. for
10980 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
10981 // it is better to process t4 first to create a vector of t4[0], then unpack
10982 // that vector with t2.
10983 if (!isSingleElementRepeatedMask(V1Mask) &&
10984 !isSingleElementRepeatedMask(V2Mask))
10985 if (SDValue UnpackPerm =
10986 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
10987 return UnpackPerm;
10988 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
10989 DL, VT, V1, V2, Mask, Subtarget, DAG))
10990 return RotatePerm;
10991 // Unpack/rotate failed - try again with variable blends.
10992 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
10993 DAG))
10994 return BlendPerm;
10995 if (VT.getScalarSizeInBits() >= 32)
10996 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
10997 DL, VT, V1, V2, Mask, Subtarget, DAG))
10998 return PermUnpack;
11001 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11002 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11003 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11004 // than half the elements coming from each source.
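// e.g. the v16i8 mask <0,17,2,19,...,14,31> becomes shuffles compacting the
// even bytes of V1 and the odd bytes of V2 into the low half of each lane,
// with FinalMask rewritten to the interleave <0,16,1,17,...> that will lower
// to UNPCKLBW.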
11005 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11006 V1Mask.assign(NumElts, -1);
11007 V2Mask.assign(NumElts, -1);
11008 FinalMask.assign(NumElts, -1);
11009 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11010 for (int j = 0; j != NumEltsPerLane; ++j) {
11011 int M = Mask[i + j];
11012 if (M >= 0 && M < NumElts) {
11013 V1Mask[i + (j / 2)] = M;
11014 FinalMask[i + j] = i + (j / 2);
11015 } else if (M >= NumElts) {
11016 V2Mask[i + (j / 2)] = M - NumElts;
11017 FinalMask[i + j] = i + (j / 2) + NumElts;
11022 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11023 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11024 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11027 static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11028 const X86Subtarget &Subtarget,
11029 ArrayRef<int> Mask) {
11030 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11031 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11033 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11034 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11035 int MaxSubElts = 64 / EltSizeInBits;
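// e.g. with XOP, the v16i8 mask <1,0,3,2,5,4,...> matches a v8i16 bit-rotate
// by 8 (RotateVT == v8i16, returned RotateAmt == 8); plain AVX512 would need
// at least an i32-sized sub-group.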
11036 unsigned RotateAmt, NumSubElts;
11037 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11038 MaxSubElts, NumSubElts, RotateAmt))
11039 return -1;
11040 unsigned NumElts = Mask.size();
11041 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11042 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11043 return RotateAmt;
11046 /// Lower shuffle using X86ISD::VROTLI rotations.
11047 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11048 ArrayRef<int> Mask,
11049 const X86Subtarget &Subtarget,
11050 SelectionDAG &DAG) {
11051 // Only XOP + AVX512 targets have bit rotation instructions.
11052 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11053 bool IsLegal =
11054 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11055 if (!IsLegal && Subtarget.hasSSE3())
11056 return SDValue();
11058 MVT RotateVT;
11059 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11060 Subtarget, Mask);
11061 if (RotateAmt < 0)
11062 return SDValue();
11064 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11065 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11066   // widen to vXi16 or more then the existing lowering will be better.
11067 if (!IsLegal) {
11068 if ((RotateAmt % 16) == 0)
11069 return SDValue();
11070 // TODO: Use getTargetVShiftByConstNode.
11071 unsigned ShlAmt = RotateAmt;
11072 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11073 V1 = DAG.getBitcast(RotateVT, V1);
11074 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11075 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11076 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11077 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11078 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11079 return DAG.getBitcast(VT, Rot);
11082 SDValue Rot =
11083 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11084 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11085 return DAG.getBitcast(VT, Rot);
11088 /// Try to match a vector shuffle as an element rotation.
11090 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11091 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11092 ArrayRef<int> Mask) {
11093 int NumElts = Mask.size();
11095 // We need to detect various ways of spelling a rotation:
11096 // [11, 12, 13, 14, 15, 0, 1, 2]
11097 // [-1, 12, 13, 14, -1, -1, 1, -1]
11098 // [-1, -1, -1, -1, -1, -1, 1, 2]
11099 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11100 // [-1, 4, 5, 6, -1, -1, 9, -1]
11101 // [-1, 4, 5, 6, -1, -1, -1, -1]
11102 int Rotation = 0;
11103 SDValue Lo, Hi;
11104 for (int i = 0; i < NumElts; ++i) {
11105 int M = Mask[i];
11106 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11107 "Unexpected mask index.");
11108 if (M < 0)
11109 continue;
11111 // Determine where a rotated vector would have started.
11112 int StartIdx = i - (M % NumElts);
11113 if (StartIdx == 0)
11114 // The identity rotation isn't interesting, stop.
11115 return -1;
11117 // If we found the tail of a vector the rotation must be the missing
11118 // front. If we found the head of a vector, it must be how much of the
11119 // head.
11120 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
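// e.g. for the v8i16 mask [11, 12, 13, 14, 15, 0, 1, 2]: element 0 (M == 11)
// gives StartIdx == -3, and element 5 (M == 0) gives StartIdx == 5; both agree
// on a rotation of 3.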
11122 if (Rotation == 0)
11123 Rotation = CandidateRotation;
11124 else if (Rotation != CandidateRotation)
11125 // The rotations don't match, so we can't match this mask.
11126 return -1;
11128 // Compute which value this mask is pointing at.
11129 SDValue MaskV = M < NumElts ? V1 : V2;
11131 // Compute which of the two target values this index should be assigned
11132 // to. This reflects whether the high elements are remaining or the low
11133 // elements are remaining.
11134 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11136 // Either set up this value if we've not encountered it before, or check
11137 // that it remains consistent.
11138 if (!TargetV)
11139 TargetV = MaskV;
11140 else if (TargetV != MaskV)
11141 // This may be a rotation, but it pulls from the inputs in some
11142 // unsupported interleaving.
11143 return -1;
11146 // Check that we successfully analyzed the mask, and normalize the results.
11147 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11148 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11149 if (!Lo)
11150 Lo = Hi;
11151 else if (!Hi)
11152 Hi = Lo;
11154 V1 = Lo;
11155 V2 = Hi;
11157 return Rotation;
11160 /// Try to lower a vector shuffle as a byte rotation.
11162 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11163 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11164 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11165 /// try to generically lower a vector shuffle through such a pattern. It
11166 /// does not check for the profitability of lowering either as PALIGNR or
11167 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11168 /// This matches shuffle vectors that look like:
11170 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11172 /// Essentially it concatenates V1 and V2, shifts right by some number of
11173 /// elements, and takes the low elements as the result. Note that while this is
11174 /// specified as a *right shift* because x86 is little-endian, it is a *left
11175 /// rotate* of the vector lanes.
11176 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11177 ArrayRef<int> Mask) {
11178 // Don't accept any shuffles with zero elements.
11179 if (isAnyZero(Mask))
11180 return -1;
11182 // PALIGNR works on 128-bit lanes.
11183 SmallVector<int, 16> RepeatedMask;
11184 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11185 return -1;
11187 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11188 if (Rotation <= 0)
11189 return -1;
11191 // PALIGNR rotates bytes, so we need to scale the
11192 // rotation based on how many bytes are in the vector lane.
11193 int NumElts = RepeatedMask.size();
11194 int Scale = 16 / NumElts;
11195 return Rotation * Scale;
11198 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11199 SDValue V2, ArrayRef<int> Mask,
11200 const X86Subtarget &Subtarget,
11201 SelectionDAG &DAG) {
11202 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11204 SDValue Lo = V1, Hi = V2;
11205 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11206 if (ByteRotation <= 0)
11207 return SDValue();
11209 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11210 // PSLLDQ/PSRLDQ.
11211 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11212 Lo = DAG.getBitcast(ByteVT, Lo);
11213 Hi = DAG.getBitcast(ByteVT, Hi);
11215 // SSSE3 targets can use the palignr instruction.
11216 if (Subtarget.hasSSSE3()) {
11217 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11218 "512-bit PALIGNR requires BWI instructions");
11219 return DAG.getBitcast(
11220 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11221 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11224 assert(VT.is128BitVector() &&
11225 "Rotate-based lowering only supports 128-bit lowering!");
11226 assert(Mask.size() <= 16 &&
11227 "Can shuffle at most 16 bytes in a 128-bit vector!");
11228 assert(ByteVT == MVT::v16i8 &&
11229 "SSE2 rotate lowering only needed for v16i8!");
11231 // Default SSE2 implementation
11232 int LoByteShift = 16 - ByteRotation;
11233 int HiByteShift = ByteRotation;
11235 SDValue LoShift =
11236 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11237 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11238 SDValue HiShift =
11239 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11240 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11241 return DAG.getBitcast(VT,
11242 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11245 /// Try to lower a vector shuffle as a dword/qword rotation.
11247 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11248 /// rotation of the concatenation of two vectors; This routine will
11249 /// try to generically lower a vector shuffle through such a pattern.
11251 /// Essentially it concatenates V1 and V2, shifts right by some number of
11252 /// elements, and takes the low elements as the result. Note that while this is
11253 /// specified as a *right shift* because x86 is little-endian, it is a *left
11254 /// rotate* of the vector lanes.
11255 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
11256 SDValue V2, ArrayRef<int> Mask,
11257 const APInt &Zeroable,
11258 const X86Subtarget &Subtarget,
11259 SelectionDAG &DAG) {
11260 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
11261 "Only 32-bit and 64-bit elements are supported!");
11263 // 128/256-bit vectors are only supported with VLX.
11264 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
11265 && "VLX required for 128/256-bit vectors");
11267 SDValue Lo = V1, Hi = V2;
11268 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
11269 if (0 < Rotation)
11270 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
11271 DAG.getTargetConstant(Rotation, DL, MVT::i8));
11273 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
11274 // TODO: Pull this out as a matchShuffleAsElementShift helper?
11275 // TODO: We can probably make this more aggressive and use shift-pairs like
11276 // lowerShuffleAsByteShiftMask.
11277 unsigned NumElts = Mask.size();
11278 unsigned ZeroLo = Zeroable.countr_one();
11279 unsigned ZeroHi = Zeroable.countl_one();
11280 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
11281 if (!ZeroLo && !ZeroHi)
11282 return SDValue();
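// e.g. with ZeroLo == 2 and the remaining v8i32 elements forming the
// sequential run <0,1,2,3,4,5> from V1, the code below emits
// VALIGN(V1, zero-vector) with an immediate of NumElts - ZeroLo == 6.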
11284 if (ZeroLo) {
11285 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11286 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
11287 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
11288 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
11289 getZeroVector(VT, Subtarget, DAG, DL),
11290 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
11293 if (ZeroHi) {
11294 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
11295 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
11296 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
11297 return DAG.getNode(X86ISD::VALIGN, DL, VT,
11298 getZeroVector(VT, Subtarget, DAG, DL), Src,
11299 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
11302 return SDValue();
11305 /// Try to lower a vector shuffle as a byte shift sequence.
11306 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
11307 SDValue V2, ArrayRef<int> Mask,
11308 const APInt &Zeroable,
11309 const X86Subtarget &Subtarget,
11310 SelectionDAG &DAG) {
11311 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11312 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11314 // We need a shuffle that has zeros at one/both ends and a sequential
11315 // shuffle from one source within.
11316 unsigned ZeroLo = Zeroable.countr_one();
11317 unsigned ZeroHi = Zeroable.countl_one();
11318 if (!ZeroLo && !ZeroHi)
11319 return SDValue();
11321 unsigned NumElts = Mask.size();
11322 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11323 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
11324 return SDValue();
11326 unsigned Scale = VT.getScalarSizeInBits() / 8;
11327 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
11328 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
11329 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
11330 return SDValue();
11332 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11333 Res = DAG.getBitcast(MVT::v16i8, Res);
11335 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
11336 // inner sequential set of elements, possibly offset:
11337 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11338 // 01234567 --> 4567zzzz --> zzzzz456
11339 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11340 if (ZeroLo == 0) {
11341 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11342 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11343 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11344 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11345 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
11346 } else if (ZeroHi == 0) {
11347 unsigned Shift = Mask[ZeroLo] % NumElts;
11348 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11349 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11350 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11351 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11352 } else if (!Subtarget.hasSSSE3()) {
11353     // If we don't have PSHUFB then it's worth avoiding an AND constant mask
11354 // by performing 3 byte shifts. Shuffle combining can kick in above that.
11355 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
11356 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11357 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11358 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11359 Shift += Mask[ZeroLo] % NumElts;
11360 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11361 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11362 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11363 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11364 } else
11365 return SDValue();
11367 return DAG.getBitcast(VT, Res);
11370 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
11372 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
11373 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11374 /// matches elements from one of the input vectors shuffled to the left or
11375 /// right with zeroable elements 'shifted in'. It handles both the strictly
11376 /// bit-wise element shifts and the byte shift across an entire 128-bit double
11377 /// quad word lane.
11379 /// PSHL : (little-endian) left bit shift.
11380 /// [ zz, 0, zz, 2 ]
11381 /// [ -1, 4, zz, -1 ]
11382 /// PSRL : (little-endian) right bit shift.
11383 /// [ 1, zz, 3, zz]
11384 /// [ -1, -1, 7, zz]
11385 /// PSLLDQ : (little-endian) left byte shift
11386 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
11387 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
11388 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
11389 /// PSRLDQ : (little-endian) right byte shift
11390 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
11391 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
11392 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
11393 static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
11394 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
11395 int MaskOffset, const APInt &Zeroable,
11396 const X86Subtarget &Subtarget) {
11397 int Size = Mask.size();
11398 unsigned SizeInBits = Size * ScalarSizeInBits;
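// For illustration: the v8i16 mask [zz, 0, 1, 2, 3, 4, 5, 6] matches below
// with Scale == 8 and Shift == 1 as a whole-lane byte shift, i.e. a VSHLDQ by
// 2 bytes.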
11400 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
11401 for (int i = 0; i < Size; i += Scale)
11402 for (int j = 0; j < Shift; ++j)
11403 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11404 return false;
11406 return true;
11409 auto MatchShift = [&](int Shift, int Scale, bool Left) {
11410 for (int i = 0; i != Size; i += Scale) {
11411 unsigned Pos = Left ? i + Shift : i;
11412 unsigned Low = Left ? i : i + Shift;
11413 unsigned Len = Scale - Shift;
11414 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
11415 return -1;
11418 int ShiftEltBits = ScalarSizeInBits * Scale;
11419 bool ByteShift = ShiftEltBits > 64;
11420 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
11421 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
11422 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
11424 // Normalize the scale for byte shifts to still produce an i64 element
11425 // type.
11426 Scale = ByteShift ? Scale / 2 : Scale;
11428 // We need to round trip through the appropriate type for the shift.
11429 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
11430 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
11431 : MVT::getVectorVT(ShiftSVT, Size / Scale);
11432 return (int)ShiftAmt;
11435 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11436 // keep doubling the size of the integer elements up to that. We can
11437 // then shift the elements of the integer vector by whole multiples of
11438 // their width within the elements of the larger integer vector. Test each
11439 // multiple to see if we can find a match with the moved element indices
11440 // and that the shifted in elements are all zeroable.
11441 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
11442 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
11443 for (int Shift = 1; Shift != Scale; ++Shift)
11444 for (bool Left : {true, false})
11445 if (CheckZeros(Shift, Scale, Left)) {
11446 int ShiftAmt = MatchShift(Shift, Scale, Left);
11447 if (0 < ShiftAmt)
11448 return ShiftAmt;
11451 // no match
11452 return -1;
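// Illustrative sketch (not part of the original source; helper name is made
// up): once a (Shift, Scale) pair is found, the returned immediate is the
// shift amount in bits for VSHLI/VSRLI, or in bytes once the widened element
// exceeds 64 bits and the match falls back to VSHLDQ/VSRLDQ. For the v4i32
// mask [zz, 0, zz, 2] documented above the match is Scale = 2, Shift = 1,
// giving VSHLI on v2i64 by 32 bits.
static unsigned exampleShiftImmediate(unsigned Shift, unsigned Scale,
                                      unsigned ScalarSizeInBits) {
  bool ByteShift = ScalarSizeInBits * Scale > 64; // Byte shifts work in bytes.
  return Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
}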
11455 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
11456 SDValue V2, ArrayRef<int> Mask,
11457 const APInt &Zeroable,
11458 const X86Subtarget &Subtarget,
11459 SelectionDAG &DAG, bool BitwiseOnly) {
11460 int Size = Mask.size();
11461 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11463 MVT ShiftVT;
11464 SDValue V = V1;
11465 unsigned Opcode;
11467 // Try to match shuffle against V1 shift.
11468 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11469 Mask, 0, Zeroable, Subtarget);
11471 // If V1 failed, try to match shuffle against V2 shift.
11472 if (ShiftAmt < 0) {
11473 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11474 Mask, Size, Zeroable, Subtarget);
11475 V = V2;
11478 if (ShiftAmt < 0)
11479 return SDValue();
11481 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
11482 return SDValue();
11484 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
11485 "Illegal integer vector type");
11486 V = DAG.getBitcast(ShiftVT, V);
11487 V = DAG.getNode(Opcode, DL, ShiftVT, V,
11488 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
11489 return DAG.getBitcast(VT, V);
11492 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
11493 // Remainder of lower half result is zero and upper half is all undef.
11494 static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
11495 ArrayRef<int> Mask, uint64_t &BitLen,
11496 uint64_t &BitIdx, const APInt &Zeroable) {
11497 int Size = Mask.size();
11498 int HalfSize = Size / 2;
11499 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11500 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
11502 // Upper half must be undefined.
11503 if (!isUndefUpperHalf(Mask))
11504 return false;
11506 // Determine the extraction length from the part of the
11507 // lower half that isn't zeroable.
11508 int Len = HalfSize;
11509 for (; Len > 0; --Len)
11510 if (!Zeroable[Len - 1])
11511 break;
11512 assert(Len > 0 && "Zeroable shuffle mask");
11514 // Attempt to match first Len sequential elements from the lower half.
11515 SDValue Src;
11516 int Idx = -1;
11517 for (int i = 0; i != Len; ++i) {
11518 int M = Mask[i];
11519 if (M == SM_SentinelUndef)
11520 continue;
11521 SDValue &V = (M < Size ? V1 : V2);
11522 M = M % Size;
11524 // The extracted elements must start at a valid index and all mask
11525 // elements must be in the lower half.
11526 if (i > M || M >= HalfSize)
11527 return false;
11529 if (Idx < 0 || (Src == V && Idx == (M - i))) {
11530 Src = V;
11531 Idx = M - i;
11532 continue;
11534 return false;
11537 if (!Src || Idx < 0)
11538 return false;
11540 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
11541 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11542 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11543 V1 = Src;
11544 return true;
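// Illustrative sketch (not part of the original source; helper name is made
// up): EXTRQ/INSERTQ take the length and start position as 6-bit bit counts,
// which is all the masking by 0x3f above does. For a v8i16 mask
// [2, 3, zz, zz, -1, -1, -1, -1] the match is Len = 2, Idx = 2, so
// BitLen = 32 and BitIdx = 32.
static void exampleExtrqEncoding(unsigned Len, unsigned Idx,
                                 unsigned ScalarSizeInBits, uint64_t &BitLen,
                                 uint64_t &BitIdx) {
  BitLen = (Len * ScalarSizeInBits) & 0x3f; // 6-bit length field.
  BitIdx = (Idx * ScalarSizeInBits) & 0x3f; // 6-bit index field.
}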
11547 // INSERTQ: Extract lowest Len elements from lower half of second source and
11548 // insert over first source, starting at Idx.
11549 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
11550 static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
11551 ArrayRef<int> Mask, uint64_t &BitLen,
11552 uint64_t &BitIdx) {
11553 int Size = Mask.size();
11554 int HalfSize = Size / 2;
11555 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11557 // Upper half must be undefined.
11558 if (!isUndefUpperHalf(Mask))
11559 return false;
11561 for (int Idx = 0; Idx != HalfSize; ++Idx) {
11562 SDValue Base;
11564 // Attempt to match first source from mask before insertion point.
11565 if (isUndefInRange(Mask, 0, Idx)) {
11566 /* EMPTY */
11567 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
11568 Base = V1;
11569 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
11570 Base = V2;
11571 } else {
11572 continue;
11575 // Extend the extraction length looking to match both the insertion of
11576 // the second source and the remaining elements of the first.
11577 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
11578 SDValue Insert;
11579 int Len = Hi - Idx;
11581 // Match insertion.
11582 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
11583 Insert = V1;
11584 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
11585 Insert = V2;
11586 } else {
11587 continue;
11590 // Match the remaining elements of the lower half.
11591 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
11592 /* EMPTY */
11593 } else if ((!Base || (Base == V1)) &&
11594 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
11595 Base = V1;
11596 } else if ((!Base || (Base == V2)) &&
11597 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
11598 Size + Hi)) {
11599 Base = V2;
11600 } else {
11601 continue;
11604 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11605 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11606 V1 = Base;
11607 V2 = Insert;
11608 return true;
11612 return false;
11615 /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
11616 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
11617 SDValue V2, ArrayRef<int> Mask,
11618 const APInt &Zeroable, SelectionDAG &DAG) {
11619 uint64_t BitLen, BitIdx;
11620 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
11621 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
11622 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11623 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11625 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
11626 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
11627 V2 ? V2 : DAG.getUNDEF(VT),
11628 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11629 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11631 return SDValue();
11634 /// Lower a vector shuffle as a zero or any extension.
11636 /// Given a specific number of elements, element bit width, and extension
11637 /// stride, produce either a zero or any extension based on the available
11638 /// features of the subtarget. The extended elements are consecutive and
11639 /// can begin at an offset element index in the input; to avoid excess
11640 /// shuffling, the offset must either be in the bottom lane or at the
11641 /// start of a higher lane. All extended elements must be from
11642 /// the same lane.
11643 static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
11644 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
11645 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11646 assert(Scale > 1 && "Need a scale to extend.");
11647 int EltBits = VT.getScalarSizeInBits();
11648 int NumElements = VT.getVectorNumElements();
11649 int NumEltsPerLane = 128 / EltBits;
11650 int OffsetLane = Offset / NumEltsPerLane;
11651 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
11652 "Only 8, 16, and 32 bit elements can be extended.");
11653 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
11654 assert(0 <= Offset && "Extension offset must be positive.");
11655 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
11656 "Extension offset must be in the first lane or start an upper lane.");
11658 // Check that an index is in same lane as the base offset.
11659 auto SafeOffset = [&](int Idx) {
11660 return OffsetLane == (Idx / NumEltsPerLane);
11663 // Shift along an input so that the offset base moves to the first element.
11664 auto ShuffleOffset = [&](SDValue V) {
11665 if (!Offset)
11666 return V;
11668 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11669 for (int i = 0; i * Scale < NumElements; ++i) {
11670 int SrcIdx = i + Offset;
11671 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
11673 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
11676 // Found a valid a/zext mask! Try various lowering strategies based on the
11677 // input type and available ISA extensions.
11678 if (Subtarget.hasSSE41()) {
11679 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
11680 // PUNPCK will catch this in a later shuffle match.
11681 if (Offset && Scale == 2 && VT.is128BitVector())
11682 return SDValue();
11683 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
11684 NumElements / Scale);
11685 InputV = DAG.getBitcast(VT, InputV);
11686 InputV = ShuffleOffset(InputV);
11687 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
11688 DL, ExtVT, InputV, DAG);
11689 return DAG.getBitcast(VT, InputV);
11692 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
11693 InputV = DAG.getBitcast(VT, InputV);
11695 // For any-extends we can cheat for larger element sizes and use shuffle
11696 // instructions that can fold with a load and/or copy.
11697 if (AnyExt && EltBits == 32) {
11698 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
11699 -1};
11700 return DAG.getBitcast(
11701 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11702 DAG.getBitcast(MVT::v4i32, InputV),
11703 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11705 if (AnyExt && EltBits == 16 && Scale > 2) {
11706 int PSHUFDMask[4] = {Offset / 2, -1,
11707 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
11708 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11709 DAG.getBitcast(MVT::v4i32, InputV),
11710 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11711 int PSHUFWMask[4] = {1, -1, -1, -1};
11712 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
11713 return DAG.getBitcast(
11714 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
11715 DAG.getBitcast(MVT::v8i16, InputV),
11716 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
11719 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
11720 // to 64-bits.
11721 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
11722 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
11723 assert(VT.is128BitVector() && "Unexpected vector width!");
11725 int LoIdx = Offset * EltBits;
11726 SDValue Lo = DAG.getBitcast(
11727 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11728 DAG.getTargetConstant(EltBits, DL, MVT::i8),
11729 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
11731 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
11732 return DAG.getBitcast(VT, Lo);
11734 int HiIdx = (Offset + 1) * EltBits;
11735 SDValue Hi = DAG.getBitcast(
11736 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11737 DAG.getTargetConstant(EltBits, DL, MVT::i8),
11738 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
11739 return DAG.getBitcast(VT,
11740 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
11743 // If this would require more than 2 unpack instructions to expand, use
11744 // pshufb when available. We can only use more than 2 unpack instructions
11745 // when zero extending i8 elements which also makes it easier to use pshufb.
11746 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
11747 assert(NumElements == 16 && "Unexpected byte vector width!");
11748 SDValue PSHUFBMask[16];
11749 for (int i = 0; i < 16; ++i) {
11750 int Idx = Offset + (i / Scale);
11751 if ((i % Scale == 0 && SafeOffset(Idx))) {
11752 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
11753 continue;
11755 PSHUFBMask[i] =
11756 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
11758 InputV = DAG.getBitcast(MVT::v16i8, InputV);
11759 return DAG.getBitcast(
11760 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
11761 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
11764 // If we are extending from an offset, ensure we start on a boundary that
11765 // we can unpack from.
11766 int AlignToUnpack = Offset % (NumElements / Scale);
11767 if (AlignToUnpack) {
11768 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11769 for (int i = AlignToUnpack; i < NumElements; ++i)
11770 ShMask[i - AlignToUnpack] = i;
11771 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
11772 Offset -= AlignToUnpack;
11775 // Otherwise emit a sequence of unpacks.
11776 do {
11777 unsigned UnpackLoHi = X86ISD::UNPCKL;
11778 if (Offset >= (NumElements / 2)) {
11779 UnpackLoHi = X86ISD::UNPCKH;
11780 Offset -= (NumElements / 2);
11783 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
11784 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
11785 : getZeroVector(InputVT, Subtarget, DAG, DL);
11786 InputV = DAG.getBitcast(InputVT, InputV);
11787 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
11788 Scale /= 2;
11789 EltBits *= 2;
11790 NumElements /= 2;
11791 } while (Scale > 1);
11792 return DAG.getBitcast(VT, InputV);
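// Illustrative sketch (not part of the original source; helper name is made
// up): the do/while loop above issues one UNPCKL/UNPCKH against zero (or
// undef for any-extends) per doubling of the element width, i.e. log2(Scale)
// steps. Zero-extending the low bytes of a v16i8 to v4i32 (Scale = 4) takes
// two unpacks: bytes against zero, then words against zero.
static unsigned exampleNumUnpackSteps(unsigned Scale) {
  unsigned Steps = 0;
  for (; Scale > 1; Scale /= 2) // Mirrors Scale /= 2 in the loop above.
    ++Steps;
  return Steps;
}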
11795 /// Try to lower a vector shuffle as a zero extension on any microarch.
11797 /// This routine will try to do everything in its power to cleverly lower
11798 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
11799 /// check for the profitability of this lowering; it tries to aggressively
11800 /// match this pattern. It will use all of the micro-architectural details it
11801 /// can to emit an efficient lowering. It handles both blends with all-zero
11802 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
11803 /// masking out later).
11805 /// The reason we have dedicated lowering for zext-style shuffles is that they
11806 /// are both incredibly common and often quite performance sensitive.
11807 static SDValue lowerShuffleAsZeroOrAnyExtend(
11808 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11809 const APInt &Zeroable, const X86Subtarget &Subtarget,
11810 SelectionDAG &DAG) {
11811 int Bits = VT.getSizeInBits();
11812 int NumLanes = Bits / 128;
11813 int NumElements = VT.getVectorNumElements();
11814 int NumEltsPerLane = NumElements / NumLanes;
11815 assert(VT.getScalarSizeInBits() <= 32 &&
11816 "Exceeds 32-bit integer zero extension limit");
11817 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
11819 // Define a helper function to check a particular ext-scale and lower to it if
11820 // valid.
11821 auto Lower = [&](int Scale) -> SDValue {
11822 SDValue InputV;
11823 bool AnyExt = true;
11824 int Offset = 0;
11825 int Matches = 0;
11826 for (int i = 0; i < NumElements; ++i) {
11827 int M = Mask[i];
11828 if (M < 0)
11829 continue; // Valid anywhere but doesn't tell us anything.
11830 if (i % Scale != 0) {
11831 // Each of the extended elements needs to be zeroable.
11832 if (!Zeroable[i])
11833 return SDValue();
11835 // We no longer are in the anyext case.
11836 AnyExt = false;
11837 continue;
11840 // The base elements need to be consecutive indices into the
11841 // same input vector.
11842 SDValue V = M < NumElements ? V1 : V2;
11843 M = M % NumElements;
11844 if (!InputV) {
11845 InputV = V;
11846 Offset = M - (i / Scale);
11847 } else if (InputV != V)
11848 return SDValue(); // Flip-flopping inputs.
11850 // Offset must start in the lowest 128-bit lane or at the start of an
11851 // upper lane.
11852 // FIXME: Is it ever worth allowing a negative base offset?
11853 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
11854 (Offset % NumEltsPerLane) == 0))
11855 return SDValue();
11857 // If we are offsetting, all referenced entries must come from the same
11858 // lane.
11859 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
11860 return SDValue();
11862 if ((M % NumElements) != (Offset + (i / Scale)))
11863 return SDValue(); // Non-consecutive strided elements.
11864 Matches++;
11867 // If we fail to find an input, we have a zero-shuffle which should always
11868 // have already been handled.
11869 // FIXME: Maybe handle this here in case during blending we end up with one?
11870 if (!InputV)
11871 return SDValue();
11873 // If we are offsetting, don't extend if we only match a single input; we
11874 // can always do better by using a basic PSHUF or PUNPCK.
11875 if (Offset != 0 && Matches < 2)
11876 return SDValue();
11878 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
11879 InputV, Mask, Subtarget, DAG);
11882 // The widest scale possible for extending is to a 64-bit integer.
11883 assert(Bits % 64 == 0 &&
11884 "The number of bits in a vector must be divisible by 64 on x86!");
11885 int NumExtElements = Bits / 64;
11887 // Each iteration, try extending the elements half as much, but into twice as
11888 // many elements.
11889 for (; NumExtElements < NumElements; NumExtElements *= 2) {
11890 assert(NumElements % NumExtElements == 0 &&
11891 "The input vector size must be divisible by the extended size.");
11892 if (SDValue V = Lower(NumElements / NumExtElements))
11893 return V;
11896 // General extends failed, but 128-bit vectors may be able to use MOVQ.
11897 if (Bits != 128)
11898 return SDValue();
11900 // Returns one of the source operands if the shuffle can be reduced to a
11901 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
11902 auto CanZExtLowHalf = [&]() {
11903 for (int i = NumElements / 2; i != NumElements; ++i)
11904 if (!Zeroable[i])
11905 return SDValue();
11906 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
11907 return V1;
11908 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
11909 return V2;
11910 return SDValue();
11913 if (SDValue V = CanZExtLowHalf()) {
11914 V = DAG.getBitcast(MVT::v2i64, V);
11915 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
11916 return DAG.getBitcast(VT, V);
11919 // No viable ext lowering found.
11920 return SDValue();
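// Illustrative sketch (not part of the original source; helper name is made
// up): the extension scales attempted by the loop above, widest first. For a
// 128-bit v16i8 shuffle this produces {8, 4, 2}, i.e. i8->i64, i8->i32 and
// i8->i16 candidate extensions.
static void exampleCandidateExtScales(int Bits, int NumElements,
                                      SmallVectorImpl<int> &Scales) {
  for (int NumExt = Bits / 64; NumExt < NumElements; NumExt *= 2)
    Scales.push_back(NumElements / NumExt);
}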
11923 /// Try to get a scalar value for a specific element of a vector.
11925 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
11926 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
11927 SelectionDAG &DAG) {
11928 MVT VT = V.getSimpleValueType();
11929 MVT EltVT = VT.getVectorElementType();
11930 V = peekThroughBitcasts(V);
11932 // If the bitcasts shift the element size, we can't extract an equivalent
11933 // element from it.
11934 MVT NewVT = V.getSimpleValueType();
11935 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
11936 return SDValue();
11938 if (V.getOpcode() == ISD::BUILD_VECTOR ||
11939 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
11940 // Ensure the scalar operand is the same size as the destination.
11941 // FIXME: Add support for scalar truncation where possible.
11942 SDValue S = V.getOperand(Idx);
11943 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
11944 return DAG.getBitcast(EltVT, S);
11947 return SDValue();
11950 /// Helper to test for a load that can be folded with x86 shuffles.
11952 /// This is particularly important because the set of instructions varies
11953 /// significantly based on whether the operand is a load or not.
11954 static bool isShuffleFoldableLoad(SDValue V) {
11955 return V->hasOneUse() &&
11956 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
11959 template<typename T>
11960 static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
11961 T EltVT = VT.getScalarType();
11962 return EltVT == MVT::bf16 || (EltVT == MVT::f16 && !Subtarget.hasFP16());
11965 /// Try to lower insertion of a single element into a zero vector.
11967 /// This is a common pattern for which we have especially efficient lowerings
11968 /// across all subtarget feature sets.
11969 static SDValue lowerShuffleAsElementInsertion(
11970 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11971 const APInt &Zeroable, const X86Subtarget &Subtarget,
11972 SelectionDAG &DAG) {
11973 MVT ExtVT = VT;
11974 MVT EltVT = VT.getVectorElementType();
11975 unsigned NumElts = VT.getVectorNumElements();
11976 unsigned EltBits = VT.getScalarSizeInBits();
11978 if (isSoftF16(EltVT, Subtarget))
11979 return SDValue();
11981 int V2Index =
11982 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
11983 Mask.begin();
11984 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
11985 bool IsV1Zeroable = true;
11986 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11987 if (i != V2Index && !Zeroable[i]) {
11988 IsV1Zeroable = false;
11989 break;
11992 // Bail if a non-zero V1 isn't used in place.
11993 if (!IsV1Zeroable) {
11994 SmallVector<int, 8> V1Mask(Mask);
11995 V1Mask[V2Index] = -1;
11996 if (!isNoopShuffleMask(V1Mask))
11997 return SDValue();
12000 // Check for a single input from a SCALAR_TO_VECTOR node.
12001 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12002 // all the smarts here sunk into that routine. However, the current
12003 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12004 // vector shuffle lowering is dead.
12005 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12006 DAG);
12007 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12008 // We need to zext the scalar if it is smaller than an i32.
12009 V2S = DAG.getBitcast(EltVT, V2S);
12010 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12011 // Using zext to expand a narrow element won't work for non-zero
12012 // insertions. But we can use a masked constant vector if we're
12013 // inserting V2 into the bottom of V1.
12014 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12015 return SDValue();
12017 // Zero-extend directly to i32.
12018 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12019 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12021 // If we're inserting into a constant, mask off the inserted index
12022 // and OR with the zero-extended scalar.
12023 if (!IsV1Zeroable) {
12024 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12025 Bits[V2Index] = APInt::getZero(EltBits);
12026 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12027 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12028 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12029 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12030 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12033 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12034 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12035 EltVT == MVT::i16) {
12036 // Either not inserting from the low element of the input or the input
12037 // element size is too small to use VZEXT_MOVL to clear the high bits.
12038 return SDValue();
12041 if (!IsV1Zeroable) {
12042 // If V1 can't be treated as a zero vector we have fewer options to lower
12043 // this. We can't support integer vectors or non-zero targets cheaply.
12044 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12045 if (!VT.isFloatingPoint() || V2Index != 0)
12046 return SDValue();
12047 if (!VT.is128BitVector())
12048 return SDValue();
12050 // Otherwise, use MOVSD, MOVSS or MOVSH.
12051 unsigned MovOpc = 0;
12052 if (EltVT == MVT::f16)
12053 MovOpc = X86ISD::MOVSH;
12054 else if (EltVT == MVT::f32)
12055 MovOpc = X86ISD::MOVSS;
12056 else if (EltVT == MVT::f64)
12057 MovOpc = X86ISD::MOVSD;
12058 else
12059 llvm_unreachable("Unsupported floating point element type to handle!");
12060 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12063 // This lowering only works for the low element with floating point vectors.
12064 if (VT.isFloatingPoint() && V2Index != 0)
12065 return SDValue();
12067 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12068 if (ExtVT != VT)
12069 V2 = DAG.getBitcast(VT, V2);
12071 if (V2Index != 0) {
12072 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12073 // the desired position. Otherwise it is more efficient to do a vector
12074 // shift left. We know that we can do a vector shift left because all
12075 // the inputs are zero.
12076 if (VT.isFloatingPoint() || NumElts <= 4) {
12077 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12078 V2Shuffle[V2Index] = 0;
12079 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12080 } else {
12081 V2 = DAG.getBitcast(MVT::v16i8, V2);
12082 V2 = DAG.getNode(
12083 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12084 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12085 V2 = DAG.getBitcast(VT, V2);
12088 return V2;
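// Illustrative sketch (not part of the original source; helper name is made
// up): when the inserted element must land above index 0 and the vector has
// more than four integer lanes, the code above moves it with a whole-vector
// byte shift; the amount is just the element index scaled to bytes (e.g.
// element 5 of a v8i16 needs a 10-byte VSHLDQ).
static unsigned exampleInsertionByteShift(unsigned V2Index, unsigned EltBits) {
  return V2Index * EltBits / 8;
}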
12091 /// Try to lower broadcast of a single - truncated - integer element,
12092 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12094 /// This assumes we have AVX2.
12095 static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12096 int BroadcastIdx,
12097 const X86Subtarget &Subtarget,
12098 SelectionDAG &DAG) {
12099 assert(Subtarget.hasAVX2() &&
12100 "We can only lower integer broadcasts with AVX2!");
12102 MVT EltVT = VT.getVectorElementType();
12103 MVT V0VT = V0.getSimpleValueType();
12105 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12106 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12108 MVT V0EltVT = V0VT.getVectorElementType();
12109 if (!V0EltVT.isInteger())
12110 return SDValue();
12112 const unsigned EltSize = EltVT.getSizeInBits();
12113 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12115 // This is only a truncation if the original element type is larger.
12116 if (V0EltSize <= EltSize)
12117 return SDValue();
12119 assert(((V0EltSize % EltSize) == 0) &&
12120 "Scalar type sizes must all be powers of 2 on x86!");
12122 const unsigned V0Opc = V0.getOpcode();
12123 const unsigned Scale = V0EltSize / EltSize;
12124 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12126 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12127 V0Opc != ISD::BUILD_VECTOR)
12128 return SDValue();
12130 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12132 // If we're extracting non-least-significant bits, shift so we can truncate.
12133 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12134 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12135 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12136 if (const int OffsetIdx = BroadcastIdx % Scale)
12137 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12138 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12140 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12141 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
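// Illustrative sketch (not part of the original source; helper name is made
// up): how the truncated-broadcast index maps back onto the wider source
// scalar. Broadcasting i8 element 5 out of a v4i32 build_vector gives
// Scale = 4, so the scalar is operand 1 and it must be shifted right by
// 8 bits before the truncate.
static void exampleTruncBroadcastIdx(unsigned BroadcastIdx, unsigned EltSize,
                                     unsigned V0EltSize, unsigned &V0Idx,
                                     unsigned &ShiftBits) {
  unsigned Scale = V0EltSize / EltSize; // Small elements per source scalar.
  V0Idx = BroadcastIdx / Scale;         // Which source operand to use.
  ShiftBits = (BroadcastIdx % Scale) * EltSize; // SRL amount, if any.
}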
12144 /// Test whether this can be lowered with a single SHUFPS instruction.
12146 /// This is used to disable more specialized lowerings when the shufps lowering
12147 /// will happen to be efficient.
12148 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12149 // This routine only handles 128-bit shufps.
12150 assert(Mask.size() == 4 && "Unsupported mask size!");
12151 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12152 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12153 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12154 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12156 // To lower with a single SHUFPS we need to have the low half and high half
12157 // each requiring a single input.
12158 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12159 return false;
12160 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12161 return false;
12163 return true;
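// Illustrative examples (not part of the original source): [0, 1, 4, 5] and
// [2, 2, 7, 4] are single-SHUFPS masks because each half of the result reads
// from only one input, while [0, 4, 1, 5] is not since its low half mixes
// both inputs.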
12166 /// Test whether the specified input (0 or 1) is in-place blended by the
12167 /// given mask.
12169 /// This returns true if the elements from a particular input are already in the
12170 /// slot required by the given mask and require no permutation.
12171 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12172 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12173 int Size = Mask.size();
12174 for (int i = 0; i < Size; ++i)
12175 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12176 return false;
12178 return true;
12181 /// If we are extracting two 128-bit halves of a vector and shuffling the
12182 /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12183 /// multi-shuffle lowering.
12184 static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12185 SDValue N1, ArrayRef<int> Mask,
12186 SelectionDAG &DAG) {
12187 MVT VT = N0.getSimpleValueType();
12188 assert((VT.is128BitVector() &&
12189 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12190 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12192 // Check that both sources are extracts of the same source vector.
12193 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12194 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12195 N0.getOperand(0) != N1.getOperand(0) ||
12196 !N0.hasOneUse() || !N1.hasOneUse())
12197 return SDValue();
12199 SDValue WideVec = N0.getOperand(0);
12200 MVT WideVT = WideVec.getSimpleValueType();
12201 if (!WideVT.is256BitVector())
12202 return SDValue();
12204 // Match extracts of each half of the wide source vector. Commute the shuffle
12205 // if the extract of the low half is N1.
12206 unsigned NumElts = VT.getVectorNumElements();
12207 SmallVector<int, 4> NewMask(Mask);
12208 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12209 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12210 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12211 ShuffleVectorSDNode::commuteMask(NewMask);
12212 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12213 return SDValue();
12215 // Final bailout: if the mask is simple, we are better off using an extract
12216 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12217 // because that avoids a constant load from memory.
12218 if (NumElts == 4 &&
12219 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12220 return SDValue();
12222 // Extend the shuffle mask with undef elements.
12223 NewMask.append(NumElts, -1);
12225 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12226 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12227 NewMask);
12228 // This is free: ymm -> xmm.
12229 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12230 DAG.getIntPtrConstant(0, DL));
12233 /// Try to lower broadcast of a single element.
12235 /// For convenience, this code also bundles all of the subtarget feature set
12236 /// filtering. While a little annoying to re-dispatch on type here, there isn't
12237 /// a convenient way to factor it out.
12238 static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
12239 SDValue V2, ArrayRef<int> Mask,
12240 const X86Subtarget &Subtarget,
12241 SelectionDAG &DAG) {
12242 MVT EltVT = VT.getVectorElementType();
12243 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12244 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12245 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
12246 return SDValue();
12248 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12249 // we can only broadcast from a register with AVX2.
12250 unsigned NumEltBits = VT.getScalarSizeInBits();
12251 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12252 ? X86ISD::MOVDDUP
12253 : X86ISD::VBROADCAST;
12254 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12256 // Check that the mask is a broadcast.
12257 int BroadcastIdx = getSplatIndex(Mask);
12258 if (BroadcastIdx < 0)
12259 return SDValue();
12260 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12261 "a sorted mask where the broadcast "
12262 "comes from V1.");
12264 // Go up the chain of (vector) values to find a scalar load that we can
12265 // combine with the broadcast.
12266 // TODO: Combine this logic with findEltLoadSrc() used by
12267 // EltsFromConsecutiveLoads().
12268 int BitOffset = BroadcastIdx * NumEltBits;
12269 SDValue V = V1;
12270 for (;;) {
12271 switch (V.getOpcode()) {
12272 case ISD::BITCAST: {
12273 V = V.getOperand(0);
12274 continue;
12276 case ISD::CONCAT_VECTORS: {
12277 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12278 int OpIdx = BitOffset / OpBitWidth;
12279 V = V.getOperand(OpIdx);
12280 BitOffset %= OpBitWidth;
12281 continue;
12283 case ISD::EXTRACT_SUBVECTOR: {
12284 // The extraction index adds to the existing offset.
12285 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12286 unsigned Idx = V.getConstantOperandVal(1);
12287 unsigned BeginOffset = Idx * EltBitWidth;
12288 BitOffset += BeginOffset;
12289 V = V.getOperand(0);
12290 continue;
12292 case ISD::INSERT_SUBVECTOR: {
12293 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12294 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12295 int Idx = (int)V.getConstantOperandVal(2);
12296 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12297 int BeginOffset = Idx * EltBitWidth;
12298 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12299 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12300 BitOffset -= BeginOffset;
12301 V = VInner;
12302 } else {
12303 V = VOuter;
12305 continue;
12308 break;
12310 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12311 BroadcastIdx = BitOffset / NumEltBits;
12313 // Do we need to bitcast the source to retrieve the original broadcast index?
12314 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12316 // Check if this is a broadcast of a scalar. We special case lowering
12317 // for scalars so that we can more effectively fold with loads.
12318 // If the original value has a larger element type than the shuffle, the
12319 // broadcast element is in essence truncated. Make that explicit to ease
12320 // folding.
12321 if (BitCastSrc && VT.isInteger())
12322 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12323 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12324 return TruncBroadcast;
12326 // Also check the simpler case, where we can directly reuse the scalar.
12327 if (!BitCastSrc &&
12328 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12329 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12330 V = V.getOperand(BroadcastIdx);
12332 // If we can't broadcast from a register, check that the input is a load.
12333 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
12334 return SDValue();
12335 } else if (ISD::isNormalLoad(V.getNode()) &&
12336 cast<LoadSDNode>(V)->isSimple()) {
12337 // We do not check for one-use of the vector load because a broadcast load
12338 // is expected to be a win for code size, register pressure, and possibly
12339 // uops even if the original vector load is not eliminated.
12341 // Reduce the vector load and shuffle to a broadcasted scalar load.
12342 LoadSDNode *Ld = cast<LoadSDNode>(V);
12343 SDValue BaseAddr = Ld->getOperand(1);
12344 MVT SVT = VT.getScalarType();
12345 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
12346 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12347 SDValue NewAddr =
12348 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
12350 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
12351 // than MOVDDUP.
12352 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12353 if (Opcode == X86ISD::VBROADCAST) {
12354 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
12355 SDValue Ops[] = {Ld->getChain(), NewAddr};
12356 V = DAG.getMemIntrinsicNode(
12357 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
12358 DAG.getMachineFunction().getMachineMemOperand(
12359 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12360 DAG.makeEquivalentMemoryOrdering(Ld, V);
12361 return DAG.getBitcast(VT, V);
12363 assert(SVT == MVT::f64 && "Unexpected VT!");
12364 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12365 DAG.getMachineFunction().getMachineMemOperand(
12366 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12367 DAG.makeEquivalentMemoryOrdering(Ld, V);
12368 } else if (!BroadcastFromReg) {
12369 // We can't broadcast from a vector register.
12370 return SDValue();
12371 } else if (BitOffset != 0) {
12372 // We can only broadcast from the zero-element of a vector register,
12373 // but it can be advantageous to broadcast from the zero-element of a
12374 // subvector.
12375 if (!VT.is256BitVector() && !VT.is512BitVector())
12376 return SDValue();
12378 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12379 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12380 return SDValue();
12382 // Only broadcast the zero-element of a 128-bit subvector.
12383 if ((BitOffset % 128) != 0)
12384 return SDValue();
12386 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12387 "Unexpected bit-offset");
12388 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12389 "Unexpected vector size");
12390 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12391 V = extract128BitVector(V, ExtractIdx, DAG, DL);
12394 // On AVX we can use VBROADCAST directly for scalar sources.
12395 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
12396 V = DAG.getBitcast(MVT::f64, V);
12397 if (Subtarget.hasAVX()) {
12398 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
12399 return DAG.getBitcast(VT, V);
12401 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
12404 // If this is a scalar, do the broadcast on this type and bitcast.
12405 if (!V.getValueType().isVector()) {
12406 assert(V.getScalarValueSizeInBits() == NumEltBits &&
12407 "Unexpected scalar size");
12408 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
12409 VT.getVectorNumElements());
12410 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
12413 // We only support broadcasting from 128-bit vectors to minimize the
12414 // number of patterns we need to deal with in isel. So extract down to
12415 // 128-bits, removing as many bitcasts as possible.
12416 if (V.getValueSizeInBits() > 128)
12417 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
12419 // Otherwise cast V to a vector with the same element type as VT, but
12420 // possibly narrower than VT. Then perform the broadcast.
12421 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
12422 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
12423 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
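// Illustrative sketch (not part of the original source; helper name is made
// up): how the bit-offset walk above re-indexes the broadcast element when
// peeking through a CONCAT_VECTORS. Broadcasting element 5 of a v8i32 built
// from two v4i32 halves gives BitOffset = 160, so the search continues in
// operand 1 at element 1.
static void exampleConcatBroadcastIdx(int BroadcastIdx, int NumEltBits,
                                      int OpBitWidth, int &OpIdx,
                                      int &NewIdx) {
  int BitOffset = BroadcastIdx * NumEltBits;
  OpIdx = BitOffset / OpBitWidth;                 // Concat operand to follow.
  NewIdx = (BitOffset % OpBitWidth) / NumEltBits; // Element index inside it.
}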
12426 // Check for whether we can use INSERTPS to perform the shuffle. We only use
12427 // INSERTPS when the V1 elements are already in the correct locations
12428 // because otherwise we can just always use two SHUFPS instructions which
12429 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
12430 // perform INSERTPS if a single V1 element is out of place and all V2
12431 // elements are zeroable.
12432 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
12433 unsigned &InsertPSMask,
12434 const APInt &Zeroable,
12435 ArrayRef<int> Mask, SelectionDAG &DAG) {
12436 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
12437 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
12438 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12440 // Attempt to match INSERTPS with one element from VA or VB being
12441 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
12442 // are updated.
12443 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
12444 ArrayRef<int> CandidateMask) {
12445 unsigned ZMask = 0;
12446 int VADstIndex = -1;
12447 int VBDstIndex = -1;
12448 bool VAUsedInPlace = false;
12450 for (int i = 0; i < 4; ++i) {
12451 // Synthesize a zero mask from the zeroable elements (includes undefs).
12452 if (Zeroable[i]) {
12453 ZMask |= 1 << i;
12454 continue;
12457 // Flag if we use any VA inputs in place.
12458 if (i == CandidateMask[i]) {
12459 VAUsedInPlace = true;
12460 continue;
12463 // We can only insert a single non-zeroable element.
12464 if (VADstIndex >= 0 || VBDstIndex >= 0)
12465 return false;
12467 if (CandidateMask[i] < 4) {
12468 // VA input out of place for insertion.
12469 VADstIndex = i;
12470 } else {
12471 // VB input for insertion.
12472 VBDstIndex = i;
12476 // Don't bother if we have no (non-zeroable) element for insertion.
12477 if (VADstIndex < 0 && VBDstIndex < 0)
12478 return false;
12480 // Determine element insertion src/dst indices. The src index is from the
12481 // start of the inserted vector, not the start of the concatenated vector.
12482 unsigned VBSrcIndex = 0;
12483 if (VADstIndex >= 0) {
12484 // If we have a VA input out of place, we use VA as the V2 element
12485 // insertion and don't use the original V2 at all.
12486 VBSrcIndex = CandidateMask[VADstIndex];
12487 VBDstIndex = VADstIndex;
12488 VB = VA;
12489 } else {
12490 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
12493 // If no V1 inputs are used in place, then the result is created only from
12494 // the zero mask and the V2 insertion - so remove V1 dependency.
12495 if (!VAUsedInPlace)
12496 VA = DAG.getUNDEF(MVT::v4f32);
12498 // Update V1, V2 and InsertPSMask accordingly.
12499 V1 = VA;
12500 V2 = VB;
12502 // Insert the V2 element into the desired position.
12503 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
12504 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
12505 return true;
12508 if (matchAsInsertPS(V1, V2, Mask))
12509 return true;
12511 // Commute and try again.
12512 SmallVector<int, 4> CommutedMask(Mask);
12513 ShuffleVectorSDNode::commuteMask(CommutedMask);
12514 if (matchAsInsertPS(V2, V1, CommutedMask))
12515 return true;
12517 return false;
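// Illustrative sketch (not part of the original source; helper name is made
// up): the INSERTPS immediate layout produced above - bits [7:6] select the
// source element of V2, bits [5:4] the destination slot in V1, and bits
// [3:0] force result elements to zero. Inserting V2[2] into slot 1 while
// zeroing slot 3 encodes as 0x98.
static unsigned exampleInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                   unsigned ZMask) {
  return (SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xf);
}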
12520 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
12521 ArrayRef<int> Mask, const APInt &Zeroable,
12522 SelectionDAG &DAG) {
12523 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12524 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12526 // Attempt to match the insertps pattern.
12527 unsigned InsertPSMask = 0;
12528 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
12529 return SDValue();
12531 // Insert the V2 element into the desired position.
12532 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
12533 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
12536 /// Handle lowering of 2-lane 64-bit floating point shuffles.
12538 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
12539 /// support for floating point shuffles but not integer shuffles. These
12540 /// instructions will incur a domain crossing penalty on some chips though so
12541 /// it is better to avoid lowering through this for integer vectors where
12542 /// possible.
12543 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12544 const APInt &Zeroable, SDValue V1, SDValue V2,
12545 const X86Subtarget &Subtarget,
12546 SelectionDAG &DAG) {
12547 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12548 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12549 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12551 if (V2.isUndef()) {
12552 // Check for being able to broadcast a single element.
12553 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
12554 Mask, Subtarget, DAG))
12555 return Broadcast;
12557 // Straight shuffle of a single input vector. Simulate this by using the
12558 // single input as both of the "inputs" to this instruction.
12559 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
12561 if (Subtarget.hasAVX()) {
12562 // If we have AVX, we can use VPERMILPS which will allow folding a load
12563 // into the shuffle.
12564 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
12565 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12568 return DAG.getNode(
12569 X86ISD::SHUFP, DL, MVT::v2f64,
12570 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12571 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12572 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12574 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12575 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12576 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12577 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12579 if (Subtarget.hasAVX2())
12580 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12581 return Extract;
12583 // When loading a scalar and then shuffling it into a vector we can often do
12584 // the insertion cheaply.
12585 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12586 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12587 return Insertion;
12588 // Try inverting the insertion since for v2 masks it is easy to do and we
12589 // can't reliably sort the mask one way or the other.
12590 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
12591 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
12592 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12593 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12594 return Insertion;
12596 // Try to use one of the special instruction patterns to handle two common
12597 // blend patterns if a zero-blend above didn't work.
12598 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
12599 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
12600 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
12601 // We can either use a special instruction to load over the low double or
12602 // to move just the low double.
12603 return DAG.getNode(
12604 X86ISD::MOVSD, DL, MVT::v2f64, V2,
12605 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
12607 if (Subtarget.hasSSE41())
12608 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
12609 Zeroable, Subtarget, DAG))
12610 return Blend;
12612 // Use dedicated unpack instructions for masks that match their pattern.
12613 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
12614 return V;
12616 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
12617 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
12618 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
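// Illustrative sketch (not part of the original source; helper name is made
// up): the two-input SHUFPD immediate built above. Bit 0 selects which
// element of V1 becomes result element 0, bit 1 selects which element of V2
// becomes result element 1 (V2's elements appear as mask values 2 and 3), so
// the mask [1, 3] encodes as 3 and [0, 2] as 0.
static unsigned exampleShufpdImm(int M0, int M1) {
  return (unsigned)(M0 == 1) | ((unsigned)(M1 == 3) << 1);
}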
12621 /// Handle lowering of 2-lane 64-bit integer shuffles.
12623 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
12624 /// the integer unit to minimize domain crossing penalties. However, for blends
12625 /// it falls back to the floating point shuffle operation with appropriate bit
12626 /// casting.
12627 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12628 const APInt &Zeroable, SDValue V1, SDValue V2,
12629 const X86Subtarget &Subtarget,
12630 SelectionDAG &DAG) {
12631 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12632 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12633 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12635 if (V2.isUndef()) {
12636 // Check for being able to broadcast a single element.
12637 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
12638 Mask, Subtarget, DAG))
12639 return Broadcast;
12641 // Straight shuffle of a single input vector. For everything from SSE2
12642 // onward this has a single fast instruction with no scary immediates.
12643 // We have to map the mask as it is actually a v4i32 shuffle instruction.
12644 V1 = DAG.getBitcast(MVT::v4i32, V1);
12645 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
12646 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
12647 Mask[1] < 0 ? -1 : (Mask[1] * 2),
12648 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
12649 return DAG.getBitcast(
12650 MVT::v2i64,
12651 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
12652 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
12654 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
12655 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
12656 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12657 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12659 if (Subtarget.hasAVX2())
12660 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12661 return Extract;
12663 // Try to use shift instructions.
12664 if (SDValue Shift =
12665 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
12666 DAG, /*BitwiseOnly*/ false))
12667 return Shift;
12669 // When loading a scalar and then shuffling it into a vector we can often do
12670 // the insertion cheaply.
12671 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12672 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12673 return Insertion;
12674 // Try inverting the insertion since for v2 masks it is easy to do and we
12675 // can't reliably sort the mask one way or the other.
12676 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
12677 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12678 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12679 return Insertion;
12681 // We have different paths for blend lowering, but they all must use the
12682 // *exact* same predicate.
12683 bool IsBlendSupported = Subtarget.hasSSE41();
12684 if (IsBlendSupported)
12685 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
12686 Zeroable, Subtarget, DAG))
12687 return Blend;
12689 // Use dedicated unpack instructions for masks that match their pattern.
12690 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
12691 return V;
12693 // Try to use byte rotation instructions.
12694 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
12695 if (Subtarget.hasSSSE3()) {
12696 if (Subtarget.hasVLX())
12697 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
12698 Zeroable, Subtarget, DAG))
12699 return Rotate;
12701 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
12702 Subtarget, DAG))
12703 return Rotate;
12706 // If we have direct support for blends, we should lower by decomposing into
12707 // a permute. That will be faster than the domain cross.
12708 if (IsBlendSupported)
12709 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
12710 Subtarget, DAG);
12712 // We implement this with SHUFPD which is pretty lame because it will likely
12713 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
12714 // However, all the alternatives are still more cycles and newer chips don't
12715 // have this problem. It would be really nice if x86 had better shuffles here.
12716 V1 = DAG.getBitcast(MVT::v2f64, V1);
12717 V2 = DAG.getBitcast(MVT::v2f64, V2);
12718 return DAG.getBitcast(MVT::v2i64,
12719 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
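// Illustrative sketch (not part of the original source; helper name is made
// up): the single-input v2i64 path above rewrites the mask for a v4i32
// PSHUFD by splitting each 64-bit element into its two 32-bit halves, so
// [1, 0] becomes [2, 3, 0, 1] and undef lanes stay undef.
static void exampleWidenV2MaskToV4(const int Mask[2], int Widened[4]) {
  for (int i = 0; i != 2; ++i) {
    Widened[2 * i + 0] = Mask[i] < 0 ? -1 : Mask[i] * 2;
    Widened[2 * i + 1] = Mask[i] < 0 ? -1 : Mask[i] * 2 + 1;
  }
}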
12722 /// Lower a vector shuffle using the SHUFPS instruction.
12724 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
12725 /// It makes no assumptions about whether this is the *best* lowering; it simply
12726 /// uses it.
12727 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
12728 ArrayRef<int> Mask, SDValue V1,
12729 SDValue V2, SelectionDAG &DAG) {
12730 SDValue LowV = V1, HighV = V2;
12731 SmallVector<int, 4> NewMask(Mask);
12732 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
12734 if (NumV2Elements == 1) {
12735 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
12737 // Compute the index adjacent to V2Index and in the same half by toggling
12738 // the low bit.
12739 int V2AdjIndex = V2Index ^ 1;
12741 if (Mask[V2AdjIndex] < 0) {
12742 // Handles all the cases where we have a single V2 element and an undef.
12743 // This will only ever happen in the high lanes because we commute the
12744 // vector otherwise.
12745 if (V2Index < 2)
12746 std::swap(LowV, HighV);
12747 NewMask[V2Index] -= 4;
12748 } else {
12749 // Handle the case where the V2 element ends up adjacent to a V1 element.
12750 // To make this work, blend them together as the first step.
12751 int V1Index = V2AdjIndex;
12752 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
12753 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
12754 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
12756 // Now proceed to reconstruct the final blend as we have the necessary
12757 // high or low half formed.
12758 if (V2Index < 2) {
12759 LowV = V2;
12760 HighV = V1;
12761 } else {
12762 HighV = V2;
12764 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
12765 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
12767 } else if (NumV2Elements == 2) {
12768 if (Mask[0] < 4 && Mask[1] < 4) {
12769 // Handle the easy case where we have V1 in the low lanes and V2 in the
12770 // high lanes.
12771 NewMask[2] -= 4;
12772 NewMask[3] -= 4;
12773 } else if (Mask[2] < 4 && Mask[3] < 4) {
12774 // We also handle the reversed case because this utility may get called
12775 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
12776 // arrange things in the right direction.
12777 NewMask[0] -= 4;
12778 NewMask[1] -= 4;
12779 HighV = V1;
12780 LowV = V2;
12781 } else {
12782 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
12783 // trying to place elements directly, just blend them and set up the final
12784 // shuffle to place them.
12786 // The first two blend mask elements are for V1, the second two are for
12787 // V2.
12788 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
12789 Mask[2] < 4 ? Mask[2] : Mask[3],
12790 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
12791 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
12792 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12793 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
12795 // Now we do a normal shuffle of V1 by giving V1 as both operands to
12796 // a blend.
12797 LowV = HighV = V1;
12798 NewMask[0] = Mask[0] < 4 ? 0 : 2;
12799 NewMask[1] = Mask[0] < 4 ? 2 : 0;
12800 NewMask[2] = Mask[2] < 4 ? 1 : 3;
12801 NewMask[3] = Mask[2] < 4 ? 3 : 1;
12803 } else if (NumV2Elements == 3) {
12804 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
12805 // we can get here due to other paths (e.g. repeated mask matching) that we
12806 // don't want to do another round of lowerVECTOR_SHUFFLE.
12807 ShuffleVectorSDNode::commuteMask(NewMask);
12808 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
12810 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
12811 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
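// Illustrative sketch (not part of the original source; helper name is made
// up): a simplified version of the 4-element immediate used by SHUFPS and
// PSHUFD - two bits per result element, element 0 in the low bits. Undef
// lanes are mapped to 0 here, whereas the real getV4X86ShuffleImm8ForMask
// helper may choose undef replacements differently.
static unsigned exampleV4ShuffleImm8(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= (unsigned)(Mask[i] < 0 ? 0 : (Mask[i] & 0x3)) << (2 * i);
  return Imm;
}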
12814 /// Lower 4-lane 32-bit floating point shuffles.
12816 /// Uses instructions exclusively from the floating point unit to minimize
12817 /// domain crossing penalties, as these are sufficient to implement all v4f32
12818 /// shuffles.
12819 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12820 const APInt &Zeroable, SDValue V1, SDValue V2,
12821 const X86Subtarget &Subtarget,
12822 SelectionDAG &DAG) {
12823 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12824 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12825 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12827 if (Subtarget.hasSSE41())
12828 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
12829 Zeroable, Subtarget, DAG))
12830 return Blend;
12832 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
12834 if (NumV2Elements == 0) {
12835 // Check for being able to broadcast a single element.
12836 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
12837 Mask, Subtarget, DAG))
12838 return Broadcast;
12840 // Use even/odd duplicate instructions for masks that match their pattern.
12841 if (Subtarget.hasSSE3()) {
12842 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
12843 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
12844 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
12845 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
12848 if (Subtarget.hasAVX()) {
12849 // If we have AVX, we can use VPERMILPS which will allow folding a load
12850 // into the shuffle.
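// For instance, a reversal mask {3, 2, 1, 0} encodes as immediate 0x1B, so a
// single 'vpermilps $0x1b, (%rdi), %xmm0' suffices when V1 comes straight
// from memory (the address register here is purely illustrative).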
12851 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
12852 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12855 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
12856 // in SSE1 because otherwise they are widened to v2f64 and never get here.
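// For example, with V1 = [a, b, c, d]:
//   MOVLHPS(V1, V1) -> [a, b, a, b]   (mask {0, 1, 0, 1})
//   MOVHLPS(V1, V1) -> [c, d, c, d]   (mask {2, 3, 2, 3})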
12857 if (!Subtarget.hasSSE2()) {
12858 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
12859 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
12860 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
12861 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
12864 // Otherwise, use a straight shuffle of a single input vector. We pass the
12865 // input vector to both operands to simulate this with a SHUFPS.
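// For example (illustrative only): Mask = {3, 1, 0, 2} encodes as
// imm8 = 3 | (1 << 2) | (0 << 4) | (2 << 6) = 0x87, and SHUFPS of V1 with
// itself under that immediate produces [V1[3], V1[1], V1[0], V1[2]].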
12866 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
12867 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12870 if (Subtarget.hasSSE2())
12871 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
12872 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
12873 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
12874 return ZExt;
12877 if (Subtarget.hasAVX2())
12878 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12879 return Extract;
12881 // There are special ways we can lower some single-element blends. However, we
12882 // have custom lowerings for more complex single-element blends below that we
12883 // defer to if both this and BLENDPS fail to match, so restrict this to when
12884 // the V2 input is targeting element 0 of the mask -- that is the fast case
12885 // here.
12886 if (NumV2Elements == 1 && Mask[0] >= 4)
12887 if (SDValue V = lowerShuffleAsElementInsertion(
12888 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12889 return V;
12891 if (Subtarget.hasSSE41()) {
12892 // Use INSERTPS if we can complete the shuffle efficiently.
12893 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
12894 return V;
12896 if (!isSingleSHUFPSMask(Mask))
12897 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
12898 V2, Mask, DAG))
12899 return BlendPerm;
12902 // Use low/high mov instructions. These are only valid in SSE1 because
12903 // otherwise they are widened to v2f64 and never get here.
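// For example, with V1 = [a, b, c, d] and V2 = [e, f, g, h]:
//   MOVLHPS(V1, V2) -> [a, b, e, f]   (mask {0, 1, 4, 5})
//   MOVHLPS(V2, V1) -> [c, d, g, h]   (mask {2, 3, 6, 7})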
12904 if (!Subtarget.hasSSE2()) {
12905 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
12906 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
12907 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
12908 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
12911 // Use dedicated unpack instructions for masks that match their pattern.
12912 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
12913 return V;
12915 // Otherwise fall back to a SHUFPS lowering strategy.
12916 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
12919 /// Lower 4-lane i32 vector shuffles.
12921 /// We try to handle these with integer-domain shuffles where we can, but for
12922 /// blends we use the floating point domain blend instructions.
12923 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12924 const APInt &Zeroable, SDValue V1, SDValue V2,
12925 const X86Subtarget &Subtarget,
12926 SelectionDAG &DAG) {
12927 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
12928 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
12929 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12931 // Whenever we can lower this as a zext, that instruction is strictly faster
12932 // than any alternative. It also allows us to fold memory operands into the
12933 // shuffle in many cases.
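// For example, a mask like {0, Z, 1, Z} (where Z is a zeroable element) is
// effectively a PMOVZXDQ of the low two elements of V1.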
12934 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
12935 Zeroable, Subtarget, DAG))
12936 return ZExt;
12938 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
12940 // Try to use shift instructions if fast.
12941 if (Subtarget.preferLowerShuffleAsShift()) {
12942 if (SDValue Shift =
12943 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
12944 Subtarget, DAG, /*BitwiseOnly*/ true))
12945 return Shift;
12946 if (NumV2Elements == 0)
12947 if (SDValue Rotate =
12948 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
12949 return Rotate;
12952 if (NumV2Elements == 0) {
12953 // Try to use broadcast unless the mask only has one non-undef element.
12954 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
12955 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
12956 Mask, Subtarget, DAG))
12957 return Broadcast;
12960 // Straight shuffle of a single input vector. For everything from SSE2
12961 // onward this has a single fast instruction with no scary immediates.
12962 // We coerce the shuffle pattern to be compatible with UNPCK instructions
12963 // but we aren't actually going to use the UNPCK instruction because doing
12964 // so prevents folding a load into this instruction or making a copy.
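// For example, PSHUFD with immediate 0x50 gives [V1[0], V1[0], V1[1], V1[1]],
// the same result as UNPCKLDQ(V1, V1), but with a single use of V1 so a load
// of V1 can still be folded.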
12965 const int UnpackLoMask[] = {0, 0, 1, 1};
12966 const int UnpackHiMask[] = {2, 2, 3, 3};
12967 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
12968 Mask = UnpackLoMask;
12969 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
12970 Mask = UnpackHiMask;
12972 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
12973 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12976 if (Subtarget.hasAVX2())
12977 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12978 return Extract;
12980 // Try to use shift instructions.
12981 if (SDValue Shift =
12982 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
12983 DAG, /*BitwiseOnly*/ false))
12984 return Shift;
12986 // There are special ways we can lower some single-element blends.
12987 if (NumV2Elements == 1)
12988 if (SDValue V = lowerShuffleAsElementInsertion(
12989 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12990 return V;
12992 // We have different paths for blend lowering, but they all must use the
12993 // *exact* same predicate.
12994 bool IsBlendSupported = Subtarget.hasSSE41();
12995 if (IsBlendSupported)
12996 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
12997 Zeroable, Subtarget, DAG))
12998 return Blend;
13000 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13001 Zeroable, Subtarget, DAG))
13002 return Masked;
13004 // Use dedicated unpack instructions for masks that match their pattern.
13005 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
13006 return V;
13008 // Try to use byte rotation instructions.
13009 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13010 if (Subtarget.hasSSSE3()) {
13011 if (Subtarget.hasVLX())
13012 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13013 Zeroable, Subtarget, DAG))
13014 return Rotate;
13016 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13017 Subtarget, DAG))
13018 return Rotate;
13021 // Assume that a single SHUFPS is faster than an alternative sequence of
13022 // multiple instructions (even if the CPU has a domain penalty).
13023 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13024 if (!isSingleSHUFPSMask(Mask)) {
13025 // If we have direct support for blends, we should lower by decomposing into
13026 // a permute. That will be faster than the domain cross.
13027 if (IsBlendSupported)
13028 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13029 Subtarget, DAG);
13031 // Try to lower by permuting the inputs into an unpack instruction.
13032 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13033 Mask, Subtarget, DAG))
13034 return Unpack;
13037 // We implement this with SHUFPS because it can blend from two vectors.
13038 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13039 // up the inputs, bypassing domain shift penalties that we would incur if we
13040 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13041 // relevant.
13042 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13043 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13044 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13045 return DAG.getBitcast(MVT::v4i32, ShufPS);
13048 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13049 /// shuffle lowering, and the most complex part.
13051 /// The lowering strategy is to try to form pairs of input lanes which are
13052 /// targeted at the same half of the final vector, and then use a dword shuffle
13053 /// to place them onto the right half, and finally unpack the paired lanes into
13054 /// their final position.
13056 /// The exact breakdown of how to form these dword pairs and align them on the
13057 /// correct sides is really tricky. See the comments within the function for
13058 /// more of the details.
13060 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13061 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13062 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13063 /// vector, form the analogous 128-bit 8-element Mask.
13064 static SDValue lowerV8I16GeneralSingleInputShuffle(
13065 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13066 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13067 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13068 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13070 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13071 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13072 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13074 // Attempt to directly match PSHUFLW or PSHUFHW.
13075 if (isUndefOrInRange(LoMask, 0, 4) &&
13076 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13077 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13078 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13080 if (isUndefOrInRange(HiMask, 4, 8) &&
13081 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13082 for (int i = 0; i != 4; ++i)
13083 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13084 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13085 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13088 SmallVector<int, 4> LoInputs;
13089 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13090 array_pod_sort(LoInputs.begin(), LoInputs.end());
13091 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
13092 SmallVector<int, 4> HiInputs;
13093 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13094 array_pod_sort(HiInputs.begin(), HiInputs.end());
13095 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
13096 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13097 int NumHToL = LoInputs.size() - NumLToL;
13098 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13099 int NumHToH = HiInputs.size() - NumLToH;
13100 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13101 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13102 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13103 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13105 // If we are shuffling values from one half, check how many different DWORD
13106 // pairs we need to create. If there are only 1 or 2, we can perform this as a
13107 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
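// For illustration: Mask = {1, 0, 1, 0, 3, 2, 3, 2} only needs the dword
// pairs (1, 0) and (3, 2), so PSHUFLW {1, 0, 3, 2} followed by
// PSHUFD {0, 0, 1, 1} reproduces it.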
13108 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13109 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13110 V = DAG.getNode(ShufWOp, DL, VT, V,
13111 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13112 V = DAG.getBitcast(PSHUFDVT, V);
13113 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13114 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13115 return DAG.getBitcast(VT, V);
13118 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13119 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13120 SmallVector<std::pair<int, int>, 4> DWordPairs;
13121 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13123 // Collect the different DWORD pairs.
13124 for (int DWord = 0; DWord != 4; ++DWord) {
13125 int M0 = Mask[2 * DWord + 0];
13126 int M1 = Mask[2 * DWord + 1];
13127 M0 = (M0 >= 0 ? M0 % 4 : M0);
13128 M1 = (M1 >= 0 ? M1 % 4 : M1);
13129 if (M0 < 0 && M1 < 0)
13130 continue;
13132 bool Match = false;
13133 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13134 auto &DWordPair = DWordPairs[j];
13135 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13136 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13137 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13138 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13139 PSHUFDMask[DWord] = DOffset + j;
13140 Match = true;
13141 break;
13144 if (!Match) {
13145 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13146 DWordPairs.push_back(std::make_pair(M0, M1));
13150 if (DWordPairs.size() <= 2) {
13151 DWordPairs.resize(2, std::make_pair(-1, -1));
13152 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13153 DWordPairs[1].first, DWordPairs[1].second};
13154 if ((NumHToL + NumHToH) == 0)
13155 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13156 if ((NumLToL + NumLToH) == 0)
13157 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13161 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13162 // such inputs we can swap two of the dwords across the half mark and end up
13163 // with <=2 inputs to each half in each half. Once there, we can fall through
13164 // to the generic code below. For example:
13166 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13167 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13169 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13170 // and an existing 2-into-2 on the other half. In this case we may have to
13171 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13172 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13173 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13174 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13175 // half than the one we target for fixing) will be fixed when we re-enter this
13176 // path. We will also combine any resulting sequence of PSHUFD instructions
13177 // into a single instruction. Here is an example of the tricky case:
13179 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13180 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13182 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13184 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13185 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13187 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13188 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13190 // The result is fine to be handled by the generic logic.
13191 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13192 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13193 int AOffset, int BOffset) {
13194 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13195 "Must call this with A having 3 or 1 inputs from the A half.");
13196 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13197 "Must call this with B having 1 or 3 inputs from the B half.");
13198 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13199 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13201 bool ThreeAInputs = AToAInputs.size() == 3;
13203 // Compute the index of dword with only one word among the three inputs in
13204 // a half by taking the sum of the half with three inputs and subtracting
13205 // the sum of the actual three inputs. The difference is the remaining
13206 // slot.
13207 int ADWord = 0, BDWord = 0;
13208 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13209 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13210 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13211 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13212 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13213 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13214 int TripleNonInputIdx =
13215 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13216 TripleDWord = TripleNonInputIdx / 2;
13218 // We use xor with one to compute the adjacent DWord to whichever one the
13219 // OneInput is in.
13220 OneInputDWord = (OneInput / 2) ^ 1;
13222 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13223 // and BToA inputs. If there is also such a problem with the BToB and AToB
13224 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13225 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13226 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13227 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13228 // Compute how many inputs will be flipped by swapping these DWords. We need
13229 // to balance this to ensure we don't form a 3-1 shuffle in the other half.
13232 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
13233 llvm::count(AToBInputs, 2 * ADWord + 1);
13234 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
13235 llvm::count(BToBInputs, 2 * BDWord + 1);
13236 if ((NumFlippedAToBInputs == 1 &&
13237 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13238 (NumFlippedBToBInputs == 1 &&
13239 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13240 // We choose whether to fix the A half or B half based on whether that
13241 // half has zero flipped inputs. At zero, we may not be able to fix it
13242 // with that half. We also bias towards fixing the B half because that
13243 // will more commonly be the high half, and we have to bias one way.
13244 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13245 ArrayRef<int> Inputs) {
13246 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13247 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13248 // Determine whether the free index is in the flipped dword or the
13249 // unflipped dword based on where the pinned index is. We use this bit
13250 // in an xor to conditionally select the adjacent dword.
13251 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13252 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13253 if (IsFixIdxInput == IsFixFreeIdxInput)
13254 FixFreeIdx += 1;
13255 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13256 assert(IsFixIdxInput != IsFixFreeIdxInput &&
13257 "We need to be changing the number of flipped inputs!");
13258 int PSHUFHalfMask[] = {0, 1, 2, 3};
13259 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13260 V = DAG.getNode(
13261 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
13262 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
13263 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13265 for (int &M : Mask)
13266 if (M >= 0 && M == FixIdx)
13267 M = FixFreeIdx;
13268 else if (M >= 0 && M == FixFreeIdx)
13269 M = FixIdx;
13271 if (NumFlippedBToBInputs != 0) {
13272 int BPinnedIdx =
13273 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13274 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13275 } else {
13276 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13277 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13278 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13283 int PSHUFDMask[] = {0, 1, 2, 3};
13284 PSHUFDMask[ADWord] = BDWord;
13285 PSHUFDMask[BDWord] = ADWord;
13286 V = DAG.getBitcast(
13288 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13289 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13291 // Adjust the mask to match the new locations of A and B.
13292 for (int &M : Mask)
13293 if (M >= 0 && M/2 == ADWord)
13294 M = 2 * BDWord + M % 2;
13295 else if (M >= 0 && M/2 == BDWord)
13296 M = 2 * ADWord + M % 2;
13298 // Recurse back into this routine to re-compute state now that this isn't
13299 // a 3 and 1 problem.
13300 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
13302 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13303 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13304 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13305 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13307 // At this point there are at most two inputs to the low and high halves from
13308 // each half. That means the inputs can always be grouped into dwords and
13309 // those dwords can then be moved to the correct half with a dword shuffle.
13310 // We use at most one low and one high word shuffle to collect these paired
13311 // inputs into dwords, and finally a dword shuffle to place them.
13312 int PSHUFLMask[4] = {-1, -1, -1, -1};
13313 int PSHUFHMask[4] = {-1, -1, -1, -1};
13314 int PSHUFDMask[4] = {-1, -1, -1, -1};
13316 // First fix the masks for all the inputs that are staying in their
13317 // original halves. This will then dictate the targets of the cross-half
13318 // shuffles.
13319 auto fixInPlaceInputs =
13320 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
13321 MutableArrayRef<int> SourceHalfMask,
13322 MutableArrayRef<int> HalfMask, int HalfOffset) {
13323 if (InPlaceInputs.empty())
13324 return;
13325 if (InPlaceInputs.size() == 1) {
13326 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13327 InPlaceInputs[0] - HalfOffset;
13328 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13329 return;
13331 if (IncomingInputs.empty()) {
13332 // Just fix all of the in place inputs.
13333 for (int Input : InPlaceInputs) {
13334 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13335 PSHUFDMask[Input / 2] = Input / 2;
13337 return;
13340 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13341 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13342 InPlaceInputs[0] - HalfOffset;
13343 // Put the second input next to the first so that they are packed into
13344 // a dword. We find the adjacent index by toggling the low bit.
13345 int AdjIndex = InPlaceInputs[0] ^ 1;
13346 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13347 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13348 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13350 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13351 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13353 // Now gather the cross-half inputs and place them into a free dword of
13354 // their target half.
13355 // FIXME: This operation could almost certainly be simplified dramatically to
13356 // look more like the 3-1 fixing operation.
13357 auto moveInputsToRightHalf = [&PSHUFDMask](
13358 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
13359 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
13360 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
13361 int DestOffset) {
13362 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13363 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13365 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13366 int Word) {
13367 int LowWord = Word & ~1;
13368 int HighWord = Word | 1;
13369 return isWordClobbered(SourceHalfMask, LowWord) ||
13370 isWordClobbered(SourceHalfMask, HighWord);
13373 if (IncomingInputs.empty())
13374 return;
13376 if (ExistingInputs.empty()) {
13377 // Map any dwords with inputs from them into the right half.
13378 for (int Input : IncomingInputs) {
13379 // If the source half mask maps over the inputs, turn those into
13380 // swaps and use the swapped lane.
13381 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13382 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13383 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13384 Input - SourceOffset;
13385 // We have to swap the uses in our half mask in one sweep.
13386 for (int &M : HalfMask)
13387 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13388 M = Input;
13389 else if (M == Input)
13390 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13391 } else {
13392 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13393 Input - SourceOffset &&
13394 "Previous placement doesn't match!");
13396 // Note that this correctly re-maps both when we do a swap and when
13397 // we observe the other side of the swap above. We rely on that to
13398 // avoid swapping the members of the input list directly.
13399 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13402 // Map the input's dword into the correct half.
13403 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13404 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13405 else
13406 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13407 Input / 2 &&
13408 "Previous placement doesn't match!");
13411 // And just directly shift any other-half mask elements to be same-half
13412 // as we will have mirrored the dword containing the element into the
13413 // same position within that half.
13414 for (int &M : HalfMask)
13415 if (M >= SourceOffset && M < SourceOffset + 4) {
13416 M = M - SourceOffset + DestOffset;
13417 assert(M >= 0 && "This should never wrap below zero!");
13419 return;
13422 // Ensure we have the input in a viable dword of its current half. This
13423 // is particularly tricky because the original position may be clobbered
13424 // by inputs being moved and *staying* in that half.
13425 if (IncomingInputs.size() == 1) {
13426 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13427 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13428 SourceOffset;
13429 SourceHalfMask[InputFixed - SourceOffset] =
13430 IncomingInputs[0] - SourceOffset;
13431 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13432 InputFixed);
13433 IncomingInputs[0] = InputFixed;
13435 } else if (IncomingInputs.size() == 2) {
13436 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13437 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13438 // We have two non-adjacent or clobbered inputs we need to extract from
13439 // the source half. To do this, we need to map them into some adjacent
13440 // dword slot in the source mask.
13441 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13442 IncomingInputs[1] - SourceOffset};
13444 // If there is a free slot in the source half mask adjacent to one of
13445 // the inputs, place the other input in it. We use (Index XOR 1) to
13446 // compute an adjacent index.
13447 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
13448 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
13449 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
13450 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13451 InputsFixed[1] = InputsFixed[0] ^ 1;
13452 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
13453 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
13454 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
13455 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
13456 InputsFixed[0] = InputsFixed[1] ^ 1;
13457 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
13458 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
13459 // The two inputs are in the same DWord but it is clobbered and the
13460 // adjacent DWord isn't used at all. Move both inputs to the free
13461 // slot.
13462 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
13463 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
13464 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
13465 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
13466 } else {
13467 // The only way we hit this point is if there is no clobbering
13468 // (because there are no off-half inputs to this half) and there is no
13469 // free slot adjacent to one of the inputs. In this case, we have to
13470 // swap an input with a non-input.
13471 for (int i = 0; i < 4; ++i)
13472 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13473 "We can't handle any clobbers here!");
13474 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13475 "Cannot have adjacent inputs here!");
13477 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13478 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
13480 // We also have to update the final source mask in this case because
13481 // it may need to undo the above swap.
13482 for (int &M : FinalSourceHalfMask)
13483 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
13484 M = InputsFixed[1] + SourceOffset;
13485 else if (M == InputsFixed[1] + SourceOffset)
13486 M = (InputsFixed[0] ^ 1) + SourceOffset;
13488 InputsFixed[1] = InputsFixed[0] ^ 1;
13491 // Point everything at the fixed inputs.
13492 for (int &M : HalfMask)
13493 if (M == IncomingInputs[0])
13494 M = InputsFixed[0] + SourceOffset;
13495 else if (M == IncomingInputs[1])
13496 M = InputsFixed[1] + SourceOffset;
13498 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
13499 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
13501 } else {
13502 llvm_unreachable("Unhandled input size!");
13505 // Now hoist the DWord down to the right half.
13506 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
13507 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
13508 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
13509 for (int &M : HalfMask)
13510 for (int Input : IncomingInputs)
13511 if (M == Input)
13512 M = FreeDWord * 2 + Input % 2;
13514 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
13515 /*SourceOffset*/ 4, /*DestOffset*/ 0);
13516 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
13517 /*SourceOffset*/ 0, /*DestOffset*/ 4);
13519 // Now enact all the shuffles we've computed to move the inputs into their
13520 // target half.
13521 if (!isNoopShuffleMask(PSHUFLMask))
13522 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13523 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
13524 if (!isNoopShuffleMask(PSHUFHMask))
13525 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13526 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
13527 if (!isNoopShuffleMask(PSHUFDMask))
13528 V = DAG.getBitcast(
13530 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13531 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13533 // At this point, each half should contain all its inputs, and we can then
13534 // just shuffle them into their final position.
13535 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
13536 "Failed to lift all the high half inputs to the low mask!");
13537 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
13538 "Failed to lift all the low half inputs to the high mask!");
13540 // Do a half shuffle for the low mask.
13541 if (!isNoopShuffleMask(LoMask))
13542 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13543 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13545 // Do a half shuffle with the high mask after shifting its values down.
13546 for (int &M : HiMask)
13547 if (M >= 0)
13548 M -= 4;
13549 if (!isNoopShuffleMask(HiMask))
13550 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13551 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13553 return V;
13556 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
13557 /// blend if only one input is used.
13558 static SDValue lowerShuffleAsBlendOfPSHUFBs(
13559 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13560 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
13561 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
13562 "Lane crossing shuffle masks not supported");
13564 int NumBytes = VT.getSizeInBits() / 8;
13565 int Size = Mask.size();
13566 int Scale = NumBytes / Size;
13568 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13569 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13570 V1InUse = false;
13571 V2InUse = false;
13573 for (int i = 0; i < NumBytes; ++i) {
13574 int M = Mask[i / Scale];
13575 if (M < 0)
13576 continue;
13578 const int ZeroMask = 0x80;
13579 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
13580 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
13581 if (Zeroable[i / Scale])
13582 V1Idx = V2Idx = ZeroMask;
13584 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
13585 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
13586 V1InUse |= (ZeroMask != V1Idx);
13587 V2InUse |= (ZeroMask != V2Idx);
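// For example (v8i16 case, so Scale == 2): a mask element M == 9 at position
// i becomes V1 selector bytes {0x80, 0x80} (force zero) and V2 selector bytes
// {2, 3} (the bytes of V2 word 1) at byte positions 2*i and 2*i+1.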
13590 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
13591 if (V1InUse)
13592 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
13593 DAG.getBuildVector(ShufVT, DL, V1Mask));
13594 if (V2InUse)
13595 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
13596 DAG.getBuildVector(ShufVT, DL, V2Mask));
13598 // If we need shuffled inputs from both, blend the two.
13599 SDValue V;
13600 if (V1InUse && V2InUse)
13601 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
13602 else
13603 V = V1InUse ? V1 : V2;
13605 // Cast the result back to the correct type.
13606 return DAG.getBitcast(VT, V);
13609 /// Generic lowering of 8-lane i16 shuffles.
13611 /// This handles both single-input shuffles and combined shuffle/blends with
13612 /// two inputs. The single input shuffles are immediately delegated to
13613 /// a dedicated lowering routine.
13615 /// The blends are lowered in one of three fundamental ways. If there are few
13616 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
13617 /// of the input is significantly cheaper when lowered as an interleaving of
13618 /// the two inputs, try to interleave them. Otherwise, blend the low and high
13619 /// halves of the inputs separately (making them have relatively few inputs)
13620 /// and then concatenate them.
13621 static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13622 const APInt &Zeroable, SDValue V1, SDValue V2,
13623 const X86Subtarget &Subtarget,
13624 SelectionDAG &DAG) {
13625 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13626 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13627 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13629 // Whenever we can lower this as a zext, that instruction is strictly faster
13630 // than any alternative.
13631 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
13632 Zeroable, Subtarget, DAG))
13633 return ZExt;
13635 // Try to lower using a truncation.
13636 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13637 Subtarget, DAG))
13638 return V;
13640 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
13642 if (NumV2Inputs == 0) {
13643 // Try to use shift instructions.
13644 if (SDValue Shift =
13645 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
13646 Subtarget, DAG, /*BitwiseOnly*/ false))
13647 return Shift;
13649 // Check for being able to broadcast a single element.
13650 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
13651 Mask, Subtarget, DAG))
13652 return Broadcast;
13654 // Try to use bit rotation instructions.
13655 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
13656 Subtarget, DAG))
13657 return Rotate;
13659 // Use dedicated unpack instructions for masks that match their pattern.
13660 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13661 return V;
13663 // Use dedicated pack instructions for masks that match their pattern.
13664 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13665 Subtarget))
13666 return V;
13668 // Try to use byte rotation instructions.
13669 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
13670 Subtarget, DAG))
13671 return Rotate;
13673 // Make a copy of the mask so it can be modified.
13674 SmallVector<int, 8> MutableMask(Mask);
13675 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
13676 Subtarget, DAG);
13679 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
13680 "All single-input shuffles should be canonicalized to be V1-input "
13681 "shuffles.");
13683 // Try to use shift instructions.
13684 if (SDValue Shift =
13685 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
13686 DAG, /*BitwiseOnly*/ false))
13687 return Shift;
13689 // See if we can use SSE4A Extraction / Insertion.
13690 if (Subtarget.hasSSE4A())
13691 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
13692 Zeroable, DAG))
13693 return V;
13695 // There are special ways we can lower some single-element blends.
13696 if (NumV2Inputs == 1)
13697 if (SDValue V = lowerShuffleAsElementInsertion(
13698 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13699 return V;
13701 // We have different paths for blend lowering, but they all must use the
13702 // *exact* same predicate.
13703 bool IsBlendSupported = Subtarget.hasSSE41();
13704 if (IsBlendSupported)
13705 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
13706 Zeroable, Subtarget, DAG))
13707 return Blend;
13709 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
13710 Zeroable, Subtarget, DAG))
13711 return Masked;
13713 // Use dedicated unpack instructions for masks that match their pattern.
13714 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13715 return V;
13717 // Use dedicated pack instructions for masks that match their pattern.
13718 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13719 Subtarget))
13720 return V;
13722 // Try to lower using a truncation.
13723 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13724 Subtarget, DAG))
13725 return V;
13727 // Try to use byte rotation instructions.
13728 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
13729 Subtarget, DAG))
13730 return Rotate;
13732 if (SDValue BitBlend =
13733 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
13734 return BitBlend;
13736 // Try to use byte shift instructions to mask.
13737 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
13738 Zeroable, Subtarget, DAG))
13739 return V;
13741 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
13742 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
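// For example, with SSE41: Mask = {0, 2, 4, 6, 8, 10, 12, 14} has
// NumEvenDrops == 1; clearing the high word of every dword in both inputs and
// PACKUSDW-ing them yields exactly the even words of V1 followed by those of
// V2.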
13743 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
13744 !Subtarget.hasVLX()) {
13745 // Check if this is part of a 256-bit vector truncation.
13746 unsigned PackOpc = 0;
13747 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
13748 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13749 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
13750 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
13751 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
13752 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
13753 DAG.getTargetConstant(0xEE, DL, MVT::i8));
13754 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
13755 V1 = extract128BitVector(V1V2, 0, DAG, DL);
13756 V2 = extract128BitVector(V1V2, 4, DAG, DL);
13757 PackOpc = X86ISD::PACKUS;
13758 } else if (Subtarget.hasSSE41()) {
13759 SmallVector<SDValue, 4> DWordClearOps(4,
13760 DAG.getConstant(0, DL, MVT::i32));
13761 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
13762 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
13763 SDValue DWordClearMask =
13764 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
13765 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
13766 DWordClearMask);
13767 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
13768 DWordClearMask);
13769 PackOpc = X86ISD::PACKUS;
13770 } else if (!Subtarget.hasSSSE3()) {
13771 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
13772 V1 = DAG.getBitcast(MVT::v4i32, V1);
13773 V2 = DAG.getBitcast(MVT::v4i32, V2);
13774 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
13775 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
13776 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
13777 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
13778 PackOpc = X86ISD::PACKSS;
13780 if (PackOpc) {
13781 // Now pack things back together.
13782 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
13783 if (NumEvenDrops == 2) {
13784 Result = DAG.getBitcast(MVT::v4i32, Result);
13785 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
13787 return Result;
13791 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
13792 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
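// For example, Mask = {1, 3, 5, 7, 9, 11, 13, 15}: shifting every dword right
// by 16 and packing keeps only the odd (upper) words of each input.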
13793 if (NumOddDrops == 1) {
13794 bool HasSSE41 = Subtarget.hasSSE41();
13795 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
13796 DAG.getBitcast(MVT::v4i32, V1),
13797 DAG.getTargetConstant(16, DL, MVT::i8));
13798 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
13799 DAG.getBitcast(MVT::v4i32, V2),
13800 DAG.getTargetConstant(16, DL, MVT::i8));
13801 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
13802 MVT::v8i16, V1, V2);
13805 // Try to lower by permuting the inputs into an unpack instruction.
13806 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
13807 Mask, Subtarget, DAG))
13808 return Unpack;
13810 // If we can't directly blend but can use PSHUFB, that will be better, as it
13811 // can both shuffle and set up the inputs to the inefficient blend.
13812 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
13813 bool V1InUse, V2InUse;
13814 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
13815 Zeroable, DAG, V1InUse, V2InUse);
13818 // We can always bit-blend if we have to so the fallback strategy is to
13819 // decompose into single-input permutes and blends/unpacks.
13820 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
13821 Mask, Subtarget, DAG);
13824 /// Lower 8-lane 16-bit floating point shuffles.
13825 static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13826 const APInt &Zeroable, SDValue V1, SDValue V2,
13827 const X86Subtarget &Subtarget,
13828 SelectionDAG &DAG) {
13829 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
13830 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
13831 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13832 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
13834 if (Subtarget.hasFP16()) {
13835 if (NumV2Elements == 0) {
13836 // Check for being able to broadcast a single element.
13837 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
13838 Mask, Subtarget, DAG))
13839 return Broadcast;
13841 if (NumV2Elements == 1 && Mask[0] >= 8)
13842 if (SDValue V = lowerShuffleAsElementInsertion(
13843 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13844 return V;
13847 V1 = DAG.getBitcast(MVT::v8i16, V1);
13848 V2 = DAG.getBitcast(MVT::v8i16, V2);
13849 return DAG.getBitcast(MVT::v8f16,
13850 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
13853 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
13854 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
13855 // the active subvector is extracted.
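// For example, a v16i8 shuffle on a non-VLX target is widened to v64i8
// (Scale == 4), so a mask index M >= 16 that referred to V2 becomes M + 48,
// which selects the matching element of the second VPERMV3 operand.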
13856 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
13857 ArrayRef<int> Mask, SDValue V1, SDValue V2,
13858 const X86Subtarget &Subtarget,
13859 SelectionDAG &DAG) {
13860 MVT MaskVT = VT.changeTypeToInteger();
13861 SDValue MaskNode;
13862 MVT ShuffleVT = VT;
13863 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
13864 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
13865 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
13866 ShuffleVT = V1.getSimpleValueType();
13868 // Adjust mask to correct indices for the second input.
13869 int NumElts = VT.getVectorNumElements();
13870 unsigned Scale = 512 / VT.getSizeInBits();
13871 SmallVector<int, 32> AdjustedMask(Mask);
13872 for (int &M : AdjustedMask)
13873 if (NumElts <= M)
13874 M += (Scale - 1) * NumElts;
13875 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
13876 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
13877 } else {
13878 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
13881 SDValue Result;
13882 if (V2.isUndef())
13883 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
13884 else
13885 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
13887 if (VT != ShuffleVT)
13888 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
13890 return Result;
13893 /// Generic lowering of v16i8 shuffles.
13895 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
13896 /// detect any complexity reducing interleaving. If that doesn't help, it uses
13897 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
13898 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
13899 /// back together.
13900 static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13901 const APInt &Zeroable, SDValue V1, SDValue V2,
13902 const X86Subtarget &Subtarget,
13903 SelectionDAG &DAG) {
13904 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
13905 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
13906 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13908 // Try to use shift instructions.
13909 if (SDValue Shift =
13910 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
13911 DAG, /*BitwiseOnly*/ false))
13912 return Shift;
13914 // Try to use byte rotation instructions.
13915 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
13916 Subtarget, DAG))
13917 return Rotate;
13919 // Use dedicated pack instructions for masks that match their pattern.
13920 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
13921 Subtarget))
13922 return V;
13924 // Try to use a zext lowering.
13925 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
13926 Zeroable, Subtarget, DAG))
13927 return ZExt;
13929 // Try to lower using a truncation.
13930 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
13931 Subtarget, DAG))
13932 return V;
13934 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
13935 Subtarget, DAG))
13936 return V;
13938 // See if we can use SSE4A Extraction / Insertion.
13939 if (Subtarget.hasSSE4A())
13940 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
13941 Zeroable, DAG))
13942 return V;
13944 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
13946 // For single-input shuffles, there are some nicer lowering tricks we can use.
13947 if (NumV2Elements == 0) {
13948 // Check for being able to broadcast a single element.
13949 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
13950 Mask, Subtarget, DAG))
13951 return Broadcast;
13953 // Try to use bit rotation instructions.
13954 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
13955 Subtarget, DAG))
13956 return Rotate;
13958 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
13959 return V;
13961 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
13962 // Notably, this handles splat and partial-splat shuffles more efficiently.
13963 // However, it only makes sense if the pre-duplication shuffle simplifies
13964 // things significantly. Currently, this means we need to be able to
13965 // express the pre-duplication shuffle as an i16 shuffle.
13967 // FIXME: We should check for other patterns which can be widened into an
13968 // i16 shuffle as well.
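// For illustration, a byte splat such as Mask = {5, 5, ..., 5}: keep word 2
// in place, UNPCKL the vector with itself (so byte 5 fills both bytes of
// word 5), and finish with a v8i16 splat of word 5.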
13969 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
13970 for (int i = 0; i < 16; i += 2)
13971 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
13972 return false;
13974 return true;
13976 auto tryToWidenViaDuplication = [&]() -> SDValue {
13977 if (!canWidenViaDuplication(Mask))
13978 return SDValue();
13979 SmallVector<int, 4> LoInputs;
13980 copy_if(Mask, std::back_inserter(LoInputs),
13981 [](int M) { return M >= 0 && M < 8; });
13982 array_pod_sort(LoInputs.begin(), LoInputs.end());
13983 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
13984 LoInputs.end());
13985 SmallVector<int, 4> HiInputs;
13986 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
13987 array_pod_sort(HiInputs.begin(), HiInputs.end());
13988 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
13989 HiInputs.end());
13991 bool TargetLo = LoInputs.size() >= HiInputs.size();
13992 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
13993 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
13995 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
13996 SmallDenseMap<int, int, 8> LaneMap;
13997 for (int I : InPlaceInputs) {
13998 PreDupI16Shuffle[I/2] = I/2;
13999 LaneMap[I] = I;
14001 int j = TargetLo ? 0 : 4, je = j + 4;
14002 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14003 // Check if j is already a shuffle of this input. This happens when
14004 // there are two adjacent bytes after we move the low one.
14005 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14006 // If we haven't yet mapped the input, search for a slot into which
14007 // we can map it.
14008 while (j < je && PreDupI16Shuffle[j] >= 0)
14009 ++j;
14011 if (j == je)
14012 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14013 return SDValue();
14015 // Map this input with the i16 shuffle.
14016 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14019 // Update the lane map based on the mapping we ended up with.
14020 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14022 V1 = DAG.getBitcast(
14023 MVT::v16i8,
14024 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14025 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14027 // Unpack the bytes to form the i16s that will be shuffled into place.
14028 bool EvenInUse = false, OddInUse = false;
14029 for (int i = 0; i < 16; i += 2) {
14030 EvenInUse |= (Mask[i + 0] >= 0);
14031 OddInUse |= (Mask[i + 1] >= 0);
14032 if (EvenInUse && OddInUse)
14033 break;
14035 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14036 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14037 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14039 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14040 for (int i = 0; i < 16; ++i)
14041 if (Mask[i] >= 0) {
14042 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14043 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14044 if (PostDupI16Shuffle[i / 2] < 0)
14045 PostDupI16Shuffle[i / 2] = MappedMask;
14046 else
14047 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14048 "Conflicting entries in the original shuffle!");
14050 return DAG.getBitcast(
14051 MVT::v16i8,
14052 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14053 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14055 if (SDValue V = tryToWidenViaDuplication())
14056 return V;
14059 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14060 Zeroable, Subtarget, DAG))
14061 return Masked;
14063 // Use dedicated unpack instructions for masks that match their pattern.
14064 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14065 return V;
14067 // Try to use byte shift instructions to mask.
14068 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14069 Zeroable, Subtarget, DAG))
14070 return V;
14072 // Check for compaction patterns.
14073 bool IsSingleInput = V2.isUndef();
14074 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14076 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14077 // with PSHUFB. It is important to do this before we attempt to generate any
14078 // blends but after all of the single-input lowerings. If the single input
14079 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14080 // want to preserve that and we can DAG combine any longer sequences into
14081 // a PSHUFB in the end. But once we start blending from multiple inputs,
14082 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14083 // and there are *very* few patterns that would actually be faster than the
14084 // PSHUFB approach because of its ability to zero lanes.
14086 // If the mask is a binary compaction, we can more efficiently perform this
14087 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14089 // FIXME: The only exceptions to the above are blends which are exact
14090 // interleavings with direct instructions supporting them. We currently don't
14091 // handle those well here.
14092 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14093 bool V1InUse = false;
14094 bool V2InUse = false;
14096 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14097 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14099 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14100 // do so. This avoids using them to handle blends-with-zero which is
14101 // important as a single pshufb is significantly faster for that.
14102 if (V1InUse && V2InUse) {
14103 if (Subtarget.hasSSE41())
14104 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14105 Zeroable, Subtarget, DAG))
14106 return Blend;
14108 // We can use an unpack to do the blending rather than an or in some
14109 // cases. Even though the or may be (very slightly) more efficient, we
14110 // prefer this lowering because there are common cases where part of
14111 // the complexity of the shuffles goes away when we do the final blend as
14112 // an unpack.
14113 // FIXME: It might be worth trying to detect if the unpack-feeding
14114 // shuffles will both be pshufb, in which case we shouldn't bother with
14115 // this.
14116 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14117 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14118 return Unpack;
14120 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14121 if (Subtarget.hasVBMI())
14122 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14123 DAG);
14125 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14126 if (Subtarget.hasXOP()) {
14127 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14128 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14131 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14132 // PALIGNR will be cheaper than the second PSHUFB+OR.
14133 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14134 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14135 return V;
14138 return PSHUFB;
14141 // There are special ways we can lower some single-element blends.
14142 if (NumV2Elements == 1)
14143 if (SDValue V = lowerShuffleAsElementInsertion(
14144 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14145 return V;
14147 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14148 return Blend;
14150 // Check whether a compaction lowering can be done. This handles shuffles
14151 // which take every Nth element for some even N. See the helper function for
14152 // details.
14154 // We special case these as they can be particularly efficiently handled with
14155 the PACKUSWB instruction on x86, and they show up in common patterns of
14156 // rearranging bytes to truncate wide elements.
14157 if (NumEvenDrops) {
14158 // NumEvenDrops is the power of two stride of the elements. Another way of
14159 // thinking about it is that we need to drop the even elements this many
14160 // times to get the original input.
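// For instance, NumEvenDrops == 1 keeps every 2nd byte, == 2 keeps every 4th
// byte, and == 3 keeps every 8th byte (the assert below caps this at 3 for a
// 16-byte vector).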
14162 // First we need to zero all the dropped bytes.
14163 assert(NumEvenDrops <= 3 &&
14164 "No support for dropping even elements more than 3 times.");
14165 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14166 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14167 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14168 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14169 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14170 WordClearMask);
14171 if (!IsSingleInput)
14172 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14173 WordClearMask);
14175 // Now pack things back together.
14176 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14177 IsSingleInput ? V1 : V2);
14178 for (int i = 1; i < NumEvenDrops; ++i) {
14179 Result = DAG.getBitcast(MVT::v8i16, Result);
14180 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14182 return Result;
14185 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14186 if (NumOddDrops == 1) {
14187 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14188 DAG.getBitcast(MVT::v8i16, V1),
14189 DAG.getTargetConstant(8, DL, MVT::i8));
14190 if (!IsSingleInput)
14191 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14192 DAG.getBitcast(MVT::v8i16, V2),
14193 DAG.getTargetConstant(8, DL, MVT::i8));
14194 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14195 IsSingleInput ? V1 : V2);
14198 // Handle multi-input cases by blending/unpacking single-input shuffles.
14199 if (NumV2Elements > 0)
14200 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
14201 Subtarget, DAG);
14203 // The fallback path for single-input shuffles widens this into two v8i16
14204 // vectors with unpacks, shuffles those, and then pulls them back together
14205 // with a pack.
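// A sketch of the sequence this fallback builds (illustrative only):
//   lo  = UNPCKL V, zero          ; zero-extend the low 8 bytes to v8i16
//   hi  = UNPCKH V, zero          ; zero-extend the high 8 bytes to v8i16
//   lo' = shuffle v8i16 lo, hi, LoBlendMask
//   hi' = shuffle v8i16 lo, hi, HiBlendMask
//   r   = PACKUS v16i8 lo', hi'   ; squash the i16s back down to bytes
// (when no odd source bytes are referenced, a single AND replaces the unpacks).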
14206 SDValue V = V1;
14208 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14209 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14210 for (int i = 0; i < 16; ++i)
14211 if (Mask[i] >= 0)
14212 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14214 SDValue VLoHalf, VHiHalf;
14215 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14216 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14217 // i16s.
14218 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14219 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14220 // Use a mask to drop the high bytes.
14221 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14222 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14223 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14225 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14226 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14228 // Squash the masks to point directly into VLoHalf.
14229 for (int &M : LoBlendMask)
14230 if (M >= 0)
14231 M /= 2;
14232 for (int &M : HiBlendMask)
14233 if (M >= 0)
14234 M /= 2;
14235 } else {
14236 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14237 // VHiHalf so that we can blend them as i16s.
14238 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14240 VLoHalf = DAG.getBitcast(
14241 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14242 VHiHalf = DAG.getBitcast(
14243 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14246 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14247 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14249 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14252 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
14254 /// This routine breaks down the specific type of 128-bit shuffle and
14255 /// dispatches to the lowering routines accordingly.
14256 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14257 MVT VT, SDValue V1, SDValue V2,
14258 const APInt &Zeroable,
14259 const X86Subtarget &Subtarget,
14260 SelectionDAG &DAG) {
14261 switch (VT.SimpleTy) {
14262 case MVT::v2i64:
14263 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14264 case MVT::v2f64:
14265 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14266 case MVT::v4i32:
14267 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14268 case MVT::v4f32:
14269 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14270 case MVT::v8i16:
14271 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14272 case MVT::v8f16:
14273 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14274 case MVT::v16i8:
14275 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14277 default:
14278 llvm_unreachable("Unimplemented!");
14282 /// Generic routine to split vector shuffle into half-sized shuffles.
14284 /// This routine just extracts two subvectors, shuffles them independently, and
14285 /// then concatenates them back together. This should work effectively with all
14286 /// AVX vector shuffle types.
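/// For example, a 256-bit v8i32 shuffle is handled as two independent v4i32
/// shuffles built from the four half-width pieces LoV1/HiV1/LoV2/HiV2 (at most
/// three narrow shuffles per half), re-joined with a CONCAT_VECTORS.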
14287 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14288 SDValue V2, ArrayRef<int> Mask,
14289 SelectionDAG &DAG, bool SimpleOnly) {
14290 assert(VT.getSizeInBits() >= 256 &&
14291 "Only for 256-bit or wider vector shuffles!");
14292 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14293 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14295 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14296 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14298 int NumElements = VT.getVectorNumElements();
14299 int SplitNumElements = NumElements / 2;
14300 MVT ScalarVT = VT.getVectorElementType();
14301 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
14303 // Use splitVector/extractSubVector so that split build-vectors just build two
14304 // narrower build vectors. This helps shuffling with splats and zeros.
14305 auto SplitVector = [&](SDValue V) {
14306 SDValue LoV, HiV;
14307 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
14308 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14309 DAG.getBitcast(SplitVT, HiV));
14312 SDValue LoV1, HiV1, LoV2, HiV2;
14313 std::tie(LoV1, HiV1) = SplitVector(V1);
14314 std::tie(LoV2, HiV2) = SplitVector(V2);
14316 // Now create two 4-way blends of these half-width vectors.
14317 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
14318 bool &UseHiV1, bool &UseLoV2,
14319 bool &UseHiV2) {
14320 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
14321 for (int i = 0; i < SplitNumElements; ++i) {
14322 int M = HalfMask[i];
14323 if (M >= NumElements) {
14324 if (M >= NumElements + SplitNumElements)
14325 UseHiV2 = true;
14326 else
14327 UseLoV2 = true;
14328 } else if (M >= 0) {
14329 if (M >= SplitNumElements)
14330 UseHiV1 = true;
14331 else
14332 UseLoV1 = true;
14337 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14338 if (!SimpleOnly)
14339 return true;
14341 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14342 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14344 return !(UseHiV1 || UseHiV2);
14347 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14348 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14349 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14350 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14351 for (int i = 0; i < SplitNumElements; ++i) {
14352 int M = HalfMask[i];
14353 if (M >= NumElements) {
14354 V2BlendMask[i] = M - NumElements;
14355 BlendMask[i] = SplitNumElements + i;
14356 } else if (M >= 0) {
14357 V1BlendMask[i] = M;
14358 BlendMask[i] = i;
14362 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14363 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14365 // Because the lowering happens after all combining takes place, we need to
14366 // manually combine these blend masks as much as possible so that we create
14367 // a minimal number of high-level vector shuffle nodes.
14368 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
14370 // First try just blending the halves of V1 or V2.
14371 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14372 return DAG.getUNDEF(SplitVT);
14373 if (!UseLoV2 && !UseHiV2)
14374 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14375 if (!UseLoV1 && !UseHiV1)
14376 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14378 SDValue V1Blend, V2Blend;
14379 if (UseLoV1 && UseHiV1) {
14380 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14381 } else {
14382 // We only use half of V1 so map the usage down into the final blend mask.
14383 V1Blend = UseLoV1 ? LoV1 : HiV1;
14384 for (int i = 0; i < SplitNumElements; ++i)
14385 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14386 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14388 if (UseLoV2 && UseHiV2) {
14389 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14390 } else {
14391 // We only use half of V2 so map the usage down into the final blend mask.
14392 V2Blend = UseLoV2 ? LoV2 : HiV2;
14393 for (int i = 0; i < SplitNumElements; ++i)
14394 if (BlendMask[i] >= SplitNumElements)
14395 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
14397 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
14400 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
14401 return SDValue();
14403 SDValue Lo = HalfBlend(LoMask);
14404 SDValue Hi = HalfBlend(HiMask);
14405 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14408 /// Either split a vector in halves or decompose the shuffles and the
14409 /// blend/unpack.
14411 /// This is provided as a good fallback for many lowerings of non-single-input
14412 /// shuffles with more than one 128-bit lane. In those cases, we want to select
14413 /// between splitting the shuffle into 128-bit components and stitching those
14414 /// back together vs. extracting the single-input shuffles and blending those
14415 /// results.
14416 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14417 SDValue V2, ArrayRef<int> Mask,
14418 const X86Subtarget &Subtarget,
14419 SelectionDAG &DAG) {
14420 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14421 "shuffles as it could then recurse on itself.");
14422 int Size = Mask.size();
14424 // If this can be modeled as a broadcast of two elements followed by a blend,
14425 // prefer that lowering. This is especially important because broadcasts can
14426 // often fold with memory operands.
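// e.g. the v4f64 mask <2,6,2,6> decomposes into a broadcast of V1[2], a
// broadcast of V2[2] and a blend, where each broadcast can potentially fold a
// load.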
14427 auto DoBothBroadcast = [&] {
14428 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14429 for (int M : Mask)
14430 if (M >= Size) {
14431 if (V2BroadcastIdx < 0)
14432 V2BroadcastIdx = M - Size;
14433 else if (M - Size != V2BroadcastIdx)
14434 return false;
14435 } else if (M >= 0) {
14436 if (V1BroadcastIdx < 0)
14437 V1BroadcastIdx = M;
14438 else if (M != V1BroadcastIdx)
14439 return false;
14441 return true;
14443 if (DoBothBroadcast())
14444 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14445 DAG);
14447 // If the inputs all stem from a single 128-bit lane of each input, then we
14448 // split them rather than blending because the split will decompose to
14449 // unusually few instructions.
14450 int LaneCount = VT.getSizeInBits() / 128;
14451 int LaneSize = Size / LaneCount;
14452 SmallBitVector LaneInputs[2];
14453 LaneInputs[0].resize(LaneCount, false);
14454 LaneInputs[1].resize(LaneCount, false);
14455 for (int i = 0; i < Size; ++i)
14456 if (Mask[i] >= 0)
14457 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
14458 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
14459 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14460 /*SimpleOnly*/ false);
14462 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
14463 // requires that the decomposed single-input shuffles don't end up here.
14464 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14465 DAG);
14468 // Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14469 // TODO: Extend to support v8f32 (+ 512-bit shuffles).
14470 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
14471 SDValue V1, SDValue V2,
14472 ArrayRef<int> Mask,
14473 SelectionDAG &DAG) {
14474 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
14476 int LHSMask[4] = {-1, -1, -1, -1};
14477 int RHSMask[4] = {-1, -1, -1, -1};
14478 unsigned SHUFPMask = 0;
14480 // As SHUFPD uses a single LHS/RHS element per lane, we can always
14481 // perform the shuffle once the lanes have been shuffled in place.
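// e.g. for the v4f64 mask <2,6,1,5> this builds LHSMask = <2,u,u,1> and
// RHSMask = <6,u,u,5> (each of which only needs to move whole 128-bit lanes of
// V1/V2) with SHUFPMask = 0b1100, so the final SHUFPD selects elements
// 2, 6, 1 and 5 as required.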
14482 for (int i = 0; i != 4; ++i) {
14483 int M = Mask[i];
14484 if (M < 0)
14485 continue;
14486 int LaneBase = i & ~1;
14487 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
14488 LaneMask[LaneBase + (M & 1)] = M;
14489 SHUFPMask |= (M & 1) << i;
14492 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
14493 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
14494 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
14495 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
14498 /// Lower a vector shuffle crossing multiple 128-bit lanes as
14499 /// a lane permutation followed by a per-lane permutation.
14501 /// This is mainly for cases where we can have non-repeating permutes
14502 /// in each lane.
14504 /// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
14505 /// we should investigate merging them.
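/// e.g. the single-input v8i32 reversal <7,6,5,4,3,2,1,0> becomes the
/// cross-lane permute <4,5,6,7,0,1,2,3> (swapping the 128-bit lanes) followed
/// by the in-lane permute <3,2,1,0,7,6,5,4>.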
14506 static SDValue lowerShuffleAsLanePermuteAndPermute(
14507 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14508 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14509 int NumElts = VT.getVectorNumElements();
14510 int NumLanes = VT.getSizeInBits() / 128;
14511 int NumEltsPerLane = NumElts / NumLanes;
14512 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
14514 /// Attempts to find a sublane permute with the given size
14515 /// that gets all elements into their target lanes.
14517 /// If successful, fills CrossLaneMask and InLaneMask and returns the lowered
14518 /// shuffle; if unsuccessful, returns an empty SDValue and may overwrite InLaneMask.
14519 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
14520 int NumSublanesPerLane = NumSublanes / NumLanes;
14521 int NumEltsPerSublane = NumElts / NumSublanes;
14523 SmallVector<int, 16> CrossLaneMask;
14524 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
14525 // CrossLaneMask but one entry == one sublane.
14526 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
14528 for (int i = 0; i != NumElts; ++i) {
14529 int M = Mask[i];
14530 if (M < 0)
14531 continue;
14533 int SrcSublane = M / NumEltsPerSublane;
14534 int DstLane = i / NumEltsPerLane;
14536 // We only need to get the elements into the right lane, not sublane.
14537 // So search all sublanes that make up the destination lane.
14538 bool Found = false;
14539 int DstSubStart = DstLane * NumSublanesPerLane;
14540 int DstSubEnd = DstSubStart + NumSublanesPerLane;
14541 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
14542 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
14543 continue;
14545 Found = true;
14546 CrossLaneMaskLarge[DstSublane] = SrcSublane;
14547 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
14548 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
14549 break;
14551 if (!Found)
14552 return SDValue();
14555 // Fill CrossLaneMask using CrossLaneMaskLarge.
14556 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
14558 if (!CanUseSublanes) {
14559 // If we're only shuffling a single lowest lane and the rest are identity
14560 // then don't bother.
14561 // TODO - isShuffleMaskInputInPlace could be extended to something like
14562 // this.
14563 int NumIdentityLanes = 0;
14564 bool OnlyShuffleLowestLane = true;
14565 for (int i = 0; i != NumLanes; ++i) {
14566 int LaneOffset = i * NumEltsPerLane;
14567 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
14568 i * NumEltsPerLane))
14569 NumIdentityLanes++;
14570 else if (CrossLaneMask[LaneOffset] != 0)
14571 OnlyShuffleLowestLane = false;
14573 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
14574 return SDValue();
14577 // Avoid returning the same shuffle operation. For example,
14578 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
14579 // undef:v16i16
14580 if (CrossLaneMask == Mask || InLaneMask == Mask)
14581 return SDValue();
14583 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
14584 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
14585 InLaneMask);
14588 // First attempt a solution with full lanes.
14589 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
14590 return V;
14592 // The rest of the solutions use sublanes.
14593 if (!CanUseSublanes)
14594 return SDValue();
14596 // Then attempt a solution with 64-bit sublanes (vpermq).
14597 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
14598 return V;
14600 // If that doesn't work and we have fast variable cross-lane shuffle,
14601 // attempt 32-bit sublanes (vpermd).
14602 if (!Subtarget.hasFastVariableCrossLaneShuffle())
14603 return SDValue();
14605 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
14608 /// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
14609 static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
14610 SmallVector<int> &InLaneMask) {
14611 int Size = Mask.size();
14612 InLaneMask.assign(Mask.begin(), Mask.end());
14613 for (int i = 0; i < Size; ++i) {
14614 int &M = InLaneMask[i];
14615 if (M < 0)
14616 continue;
14617 if (((M % Size) / LaneSize) != (i / LaneSize))
14618 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
14622 /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
14623 /// source with a lane permutation.
14625 /// This lowering strategy results in four instructions in the worst case for a
14626 /// single-input cross-lane shuffle, which is fewer than any other fully general
14627 /// cross-lane shuffle strategy I'm aware of. Special cases for each particular
14628 /// shuffle pattern should be handled prior to trying this lowering.
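/// e.g. for a single-input v8f32 mask in which every element crosses a 128-bit
/// lane boundary, V1's lanes are first swapped with a <2,3,0,1> shuffle of the
/// bitcast v4f64/v4i64 value, and the remaining work is a purely in-lane
/// shuffle of V1 and the flipped copy.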
14629 static SDValue lowerShuffleAsLanePermuteAndShuffle(
14630 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14631 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14632 // FIXME: This should probably be generalized for 512-bit vectors as well.
14633 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
14634 int Size = Mask.size();
14635 int LaneSize = Size / 2;
14637 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14638 // Only do this if the elements aren't all from the lower lane,
14639 // otherwise we're (probably) better off doing a split.
14640 if (VT == MVT::v4f64 &&
14641 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
14642 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
14644 // If there are only inputs from one 128-bit lane, splitting will in fact be
14645 // less expensive. The flags track whether the given lane contains an element
14646 // that crosses to another lane.
14647 bool AllLanes;
14648 if (!Subtarget.hasAVX2()) {
14649 bool LaneCrossing[2] = {false, false};
14650 for (int i = 0; i < Size; ++i)
14651 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
14652 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
14653 AllLanes = LaneCrossing[0] && LaneCrossing[1];
14654 } else {
14655 bool LaneUsed[2] = {false, false};
14656 for (int i = 0; i < Size; ++i)
14657 if (Mask[i] >= 0)
14658 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
14659 AllLanes = LaneUsed[0] && LaneUsed[1];
14662 // TODO - we could support shuffling V2 in the Flipped input.
14663 assert(V2.isUndef() &&
14664 "This last part of this routine only works on single input shuffles");
14666 SmallVector<int> InLaneMask;
14667 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
14669 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
14670 "In-lane shuffle mask expected");
14672 // If the mask doesn't use (or, pre-AVX2, cross out of) both 128-bit lanes and
14673 // the in-lane mask is not repeating, then we're better off splitting.
14674 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
14675 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14676 /*SimpleOnly*/ false);
14678 // Flip the lanes, and shuffle the results which should now be in-lane.
14679 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
14680 SDValue Flipped = DAG.getBitcast(PVT, V1);
14681 Flipped =
14682 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
14683 Flipped = DAG.getBitcast(VT, Flipped);
14684 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
14687 /// Handle lowering 2-lane 128-bit shuffles.
14688 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
14689 SDValue V2, ArrayRef<int> Mask,
14690 const APInt &Zeroable,
14691 const X86Subtarget &Subtarget,
14692 SelectionDAG &DAG) {
14693 if (V2.isUndef()) {
14694 // Attempt to match VBROADCAST*128 subvector broadcast load.
14695 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
14696 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
14697 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
14698 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
14699 MVT MemVT = VT.getHalfNumVectorElementsVT();
14700 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
14701 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
14702 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
14703 VT, MemVT, Ld, Ofs, DAG))
14704 return BcstLd;
14707 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
14708 if (Subtarget.hasAVX2())
14709 return SDValue();
14712 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
14714 SmallVector<int, 4> WidenedMask;
14715 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
14716 return SDValue();
14718 bool IsLowZero = (Zeroable & 0x3) == 0x3;
14719 bool IsHighZero = (Zeroable & 0xc) == 0xc;
14721 // Try to use an insert into a zero vector.
14722 if (WidenedMask[0] == 0 && IsHighZero) {
14723 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14724 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
14725 DAG.getIntPtrConstant(0, DL));
14726 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14727 getZeroVector(VT, Subtarget, DAG, DL), LoV,
14728 DAG.getIntPtrConstant(0, DL));
14731 // TODO: If minimizing size and one of the inputs is a zero vector and the
14732 // zero vector has only one use, we could use a VPERM2X128 to save the
14733 // instruction bytes needed to explicitly generate the zero vector.
14735 // Blends are faster and handle all the non-lane-crossing cases.
14736 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
14737 Subtarget, DAG))
14738 return Blend;
14740 // If either input operand is a zero vector, use VPERM2X128 because its mask
14741 // allows us to replace the zero input with an implicit zero.
14742 if (!IsLowZero && !IsHighZero) {
14743 // Check for patterns which can be matched with a single insert of a 128-bit
14744 // subvector.
14745 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
14746 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
14748 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
14749 // this will likely become vinsertf128 which can't fold a 256-bit memop.
14750 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
14751 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14752 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
14753 OnlyUsesV1 ? V1 : V2,
14754 DAG.getIntPtrConstant(0, DL));
14755 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
14756 DAG.getIntPtrConstant(2, DL));
14760 // Try to use SHUF128 if possible.
14761 if (Subtarget.hasVLX()) {
14762 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
14763 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
14764 ((WidenedMask[1] % 2) << 1);
14765 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
14766 DAG.getTargetConstant(PermMask, DL, MVT::i8));
14771 // Otherwise form a 128-bit permutation. After accounting for undefs,
14772 // convert the 64-bit shuffle mask selection values into 128-bit
14773 // selection bits by dividing the indexes by 2 and shifting into positions
14774 // defined by a vperm2*128 instruction's immediate control byte.
14776 // The immediate permute control byte looks like this:
14777 // [1:0] - select 128 bits from sources for low half of destination
14778 // [2] - ignore
14779 // [3] - zero low half of destination
14780 // [5:4] - select 128 bits from sources for high half of destination
14781 // [6] - ignore
14782 // [7] - zero high half of destination
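// e.g. the v4f64 mask <2,3,6,7> widens to <1,3>, giving PermMask =
// (1 << 0) | (3 << 4) = 0x31: the high half of V1 feeds the low half of the
// result and the high half of V2 feeds the high half.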
14784 assert((WidenedMask[0] >= 0 || IsLowZero) &&
14785 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
14787 unsigned PermMask = 0;
14788 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
14789 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
14791 // Check the immediate mask and replace unused sources with undef.
14792 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
14793 V1 = DAG.getUNDEF(VT);
14794 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
14795 V2 = DAG.getUNDEF(VT);
14797 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
14798 DAG.getTargetConstant(PermMask, DL, MVT::i8));
14801 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
14802 /// shuffling each lane.
14804 /// This attempts to create a repeated lane shuffle where each lane uses one
14805 /// or two of the lanes of the inputs. The lanes of the input vectors are
14806 /// shuffled in one or two independent shuffles to get the lanes into the
14807 /// position needed by the final shuffle.
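/// e.g. for the two-input v8f32 mask <4,12,6,14,0,8,2,10> both lanes repeat
/// the per-lane pattern <0,8,2,10>, so V1/V2 are first lane-permuted into
/// <4,5,6,7,0,1,2,3> and <12,13,14,15,8,9,10,11> respectively, and the final
/// shuffle applies the repeated mask <0,8,2,10,4,12,6,14> to those results.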
14808 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
14809 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14810 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14811 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
14813 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
14814 return SDValue();
14816 int NumElts = Mask.size();
14817 int NumLanes = VT.getSizeInBits() / 128;
14818 int NumLaneElts = 128 / VT.getScalarSizeInBits();
14819 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
14820 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
14822 // First pass will try to fill in the RepeatMask from lanes that need two
14823 // sources.
14824 for (int Lane = 0; Lane != NumLanes; ++Lane) {
14825 int Srcs[2] = {-1, -1};
14826 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
14827 for (int i = 0; i != NumLaneElts; ++i) {
14828 int M = Mask[(Lane * NumLaneElts) + i];
14829 if (M < 0)
14830 continue;
14831 // Determine which of the possible input lanes (NumLanes from each source)
14832 // this element comes from. Assign that as one of the sources for this
14833 // lane. We can assign up to 2 sources for this lane. If we run out of
14834 // sources we can't do anything.
14835 int LaneSrc = M / NumLaneElts;
14836 int Src;
14837 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
14838 Src = 0;
14839 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
14840 Src = 1;
14841 else
14842 return SDValue();
14844 Srcs[Src] = LaneSrc;
14845 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
14848 // If this lane has two sources, see if it fits with the repeat mask so far.
14849 if (Srcs[1] < 0)
14850 continue;
14852 LaneSrcs[Lane][0] = Srcs[0];
14853 LaneSrcs[Lane][1] = Srcs[1];
14855 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
14856 assert(M1.size() == M2.size() && "Unexpected mask size");
14857 for (int i = 0, e = M1.size(); i != e; ++i)
14858 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
14859 return false;
14860 return true;
14863 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
14864 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
14865 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
14866 int M = Mask[i];
14867 if (M < 0)
14868 continue;
14869 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
14870 "Unexpected mask element");
14871 MergedMask[i] = M;
14875 if (MatchMasks(InLaneMask, RepeatMask)) {
14876 // Merge this lane mask into the final repeat mask.
14877 MergeMasks(InLaneMask, RepeatMask);
14878 continue;
14881 // Didn't find a match. Swap the operands and try again.
14882 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
14883 ShuffleVectorSDNode::commuteMask(InLaneMask);
14885 if (MatchMasks(InLaneMask, RepeatMask)) {
14886 // Merge this lane mask into the final repeat mask.
14887 MergeMasks(InLaneMask, RepeatMask);
14888 continue;
14891 // Couldn't find a match with the operands in either order.
14892 return SDValue();
14895 // Now handle any lanes with only one source.
14896 for (int Lane = 0; Lane != NumLanes; ++Lane) {
14897 // If this lane has already been processed, skip it.
14898 if (LaneSrcs[Lane][0] >= 0)
14899 continue;
14901 for (int i = 0; i != NumLaneElts; ++i) {
14902 int M = Mask[(Lane * NumLaneElts) + i];
14903 if (M < 0)
14904 continue;
14906 // If RepeatMask isn't defined yet we can define it ourselves.
14907 if (RepeatMask[i] < 0)
14908 RepeatMask[i] = M % NumLaneElts;
14910 if (RepeatMask[i] < NumElts) {
14911 if (RepeatMask[i] != M % NumLaneElts)
14912 return SDValue();
14913 LaneSrcs[Lane][0] = M / NumLaneElts;
14914 } else {
14915 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
14916 return SDValue();
14917 LaneSrcs[Lane][1] = M / NumLaneElts;
14921 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
14922 return SDValue();
14925 SmallVector<int, 16> NewMask(NumElts, -1);
14926 for (int Lane = 0; Lane != NumLanes; ++Lane) {
14927 int Src = LaneSrcs[Lane][0];
14928 for (int i = 0; i != NumLaneElts; ++i) {
14929 int M = -1;
14930 if (Src >= 0)
14931 M = Src * NumLaneElts + i;
14932 NewMask[Lane * NumLaneElts + i] = M;
14935 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
14936 // Ensure we didn't get back the shuffle we started with.
14937 // FIXME: This is a hack to make up for some splat handling code in
14938 // getVectorShuffle.
14939 if (isa<ShuffleVectorSDNode>(NewV1) &&
14940 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
14941 return SDValue();
14943 for (int Lane = 0; Lane != NumLanes; ++Lane) {
14944 int Src = LaneSrcs[Lane][1];
14945 for (int i = 0; i != NumLaneElts; ++i) {
14946 int M = -1;
14947 if (Src >= 0)
14948 M = Src * NumLaneElts + i;
14949 NewMask[Lane * NumLaneElts + i] = M;
14952 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
14953 // Ensure we didn't get back the shuffle we started with.
14954 // FIXME: This is a hack to make up for some splat handling code in
14955 // getVectorShuffle.
14956 if (isa<ShuffleVectorSDNode>(NewV2) &&
14957 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
14958 return SDValue();
14960 for (int i = 0; i != NumElts; ++i) {
14961 if (Mask[i] < 0) {
14962 NewMask[i] = -1;
14963 continue;
14965 NewMask[i] = RepeatMask[i % NumLaneElts];
14966 if (NewMask[i] < 0)
14967 continue;
14969 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
14971 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
14974 /// If the input shuffle mask results in a vector that is undefined in all upper
14975 /// or lower half elements and that mask accesses only 2 halves of the
14976 /// shuffle's operands, return true. A mask of half the width with mask indexes
14977 /// adjusted to access the extracted halves of the original shuffle operands is
14978 /// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which half of which
14979 /// input operand is accessed: 0 = lower V1, 1 = upper V1, 2 = lower V2, 3 = upper V2.
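/// e.g. the v8i32 mask <u,u,u,u,0,1,8,9> (lower half undef) produces
/// HalfMask = <0,1,4,5> with HalfIdx1 = 0 (lower half of V1) and HalfIdx2 = 2
/// (lower half of V2).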
14980 static bool
14981 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
14982 int &HalfIdx1, int &HalfIdx2) {
14983 assert((Mask.size() == HalfMask.size() * 2) &&
14984 "Expected input mask to be twice as long as output");
14986 // Exactly one half of the result must be undef to allow narrowing.
14987 bool UndefLower = isUndefLowerHalf(Mask);
14988 bool UndefUpper = isUndefUpperHalf(Mask);
14989 if (UndefLower == UndefUpper)
14990 return false;
14992 unsigned HalfNumElts = HalfMask.size();
14993 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
14994 HalfIdx1 = -1;
14995 HalfIdx2 = -1;
14996 for (unsigned i = 0; i != HalfNumElts; ++i) {
14997 int M = Mask[i + MaskIndexOffset];
14998 if (M < 0) {
14999 HalfMask[i] = M;
15000 continue;
15003 // Determine which of the 4 half vectors this element is from.
15004 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15005 int HalfIdx = M / HalfNumElts;
15007 // Determine the element index into its half vector source.
15008 int HalfElt = M % HalfNumElts;
15010 // We can shuffle with up to 2 half vectors; set the new 'half'
15011 // shuffle mask accordingly.
15012 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15013 HalfMask[i] = HalfElt;
15014 HalfIdx1 = HalfIdx;
15015 continue;
15017 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15018 HalfMask[i] = HalfElt + HalfNumElts;
15019 HalfIdx2 = HalfIdx;
15020 continue;
15023 // Too many half vectors referenced.
15024 return false;
15027 return true;
15030 /// Given the output values from getHalfShuffleMask(), create a half width
15031 /// shuffle of extracted vectors followed by an insert back to full width.
15032 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15033 ArrayRef<int> HalfMask, int HalfIdx1,
15034 int HalfIdx2, bool UndefLower,
15035 SelectionDAG &DAG, bool UseConcat = false) {
15036 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15037 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15039 MVT VT = V1.getSimpleValueType();
15040 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15041 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15043 auto getHalfVector = [&](int HalfIdx) {
15044 if (HalfIdx < 0)
15045 return DAG.getUNDEF(HalfVT);
15046 SDValue V = (HalfIdx < 2 ? V1 : V2);
15047 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15048 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15049 DAG.getIntPtrConstant(HalfIdx, DL));
15052 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15053 SDValue Half1 = getHalfVector(HalfIdx1);
15054 SDValue Half2 = getHalfVector(HalfIdx2);
15055 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15056 if (UseConcat) {
15057 SDValue Op0 = V;
15058 SDValue Op1 = DAG.getUNDEF(HalfVT);
15059 if (UndefLower)
15060 std::swap(Op0, Op1);
15061 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15064 unsigned Offset = UndefLower ? HalfNumElts : 0;
15065 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15066 DAG.getIntPtrConstant(Offset, DL));
15069 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15070 /// This allows for fast cases such as subvector extraction/insertion
15071 /// or shuffling smaller vector types which can lower more efficiently.
15072 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15073 SDValue V2, ArrayRef<int> Mask,
15074 const X86Subtarget &Subtarget,
15075 SelectionDAG &DAG) {
15076 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15077 "Expected 256-bit or 512-bit vector");
15079 bool UndefLower = isUndefLowerHalf(Mask);
15080 if (!UndefLower && !isUndefUpperHalf(Mask))
15081 return SDValue();
15083 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15084 "Completely undef shuffle mask should have been simplified already");
15086 // Upper half is undef and lower half is whole upper subvector.
15087 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15088 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15089 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15090 if (!UndefLower &&
15091 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15092 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15093 DAG.getIntPtrConstant(HalfNumElts, DL));
15094 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15095 DAG.getIntPtrConstant(0, DL));
15098 // Lower half is undef and upper half is whole lower subvector.
15099 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15100 if (UndefLower &&
15101 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15102 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15103 DAG.getIntPtrConstant(0, DL));
15104 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15105 DAG.getIntPtrConstant(HalfNumElts, DL));
15108 int HalfIdx1, HalfIdx2;
15109 SmallVector<int, 8> HalfMask(HalfNumElts);
15110 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15111 return SDValue();
15113 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15115 // Only shuffle the halves of the inputs when useful.
15116 unsigned NumLowerHalves =
15117 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15118 unsigned NumUpperHalves =
15119 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15120 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15122 // Determine the larger pattern of undef/halves, then decide if it's worth
15123 // splitting the shuffle based on subtarget capabilities and types.
15124 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15125 if (!UndefLower) {
15126 // XXXXuuuu: no insert is needed.
15127 // Always extract lowers when setting lower - these are all free subreg ops.
15128 if (NumUpperHalves == 0)
15129 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15130 UndefLower, DAG);
15132 if (NumUpperHalves == 1) {
15133 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15134 if (Subtarget.hasAVX2()) {
15135 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
15136 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15137 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15138 (!isSingleSHUFPSMask(HalfMask) ||
15139 Subtarget.hasFastVariableCrossLaneShuffle()))
15140 return SDValue();
15141 // If this is a unary shuffle (assume that the 2nd operand is
15142 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15143 // are better off extracting the upper half of 1 operand and using a
15144 // narrow shuffle.
15145 if (EltWidth == 64 && V2.isUndef())
15146 return SDValue();
15148 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15149 if (Subtarget.hasAVX512() && VT.is512BitVector())
15150 return SDValue();
15151 // Extract + narrow shuffle is better than the wide alternative.
15152 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15153 UndefLower, DAG);
15156 // Don't extract both uppers; instead shuffle and then extract.
15157 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15158 return SDValue();
15161 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15162 if (NumUpperHalves == 0) {
15163 // AVX2 has efficient 64-bit element cross-lane shuffles.
15164 // TODO: Refine to account for unary shuffle, splat, and other masks?
15165 if (Subtarget.hasAVX2() && EltWidth == 64)
15166 return SDValue();
15167 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15168 if (Subtarget.hasAVX512() && VT.is512BitVector())
15169 return SDValue();
15170 // Narrow shuffle + insert is better than the wide alternative.
15171 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15172 UndefLower, DAG);
15175 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
15176 return SDValue();
15179 /// Handle case where shuffle sources are coming from the same 128-bit lane and
15180 /// every lane can be represented as the same repeating mask - allowing us to
15181 /// shuffle the sources with the repeating shuffle and then permute the result
15182 /// to the destination lanes.
15183 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
15184 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15185 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15186 int NumElts = VT.getVectorNumElements();
15187 int NumLanes = VT.getSizeInBits() / 128;
15188 int NumLaneElts = NumElts / NumLanes;
15190 // On AVX2 we may be able to just shuffle the lowest elements and then
15191 // broadcast the result.
15192 if (Subtarget.hasAVX2()) {
15193 for (unsigned BroadcastSize : {16, 32, 64}) {
15194 if (BroadcastSize <= VT.getScalarSizeInBits())
15195 continue;
15196 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
15198 // Attempt to match a repeating pattern every NumBroadcastElts,
15199 // accounting for UNDEFs but only references the lowest 128-bit
15200 // lane of the inputs.
15201 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
15202 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15203 for (int j = 0; j != NumBroadcastElts; ++j) {
15204 int M = Mask[i + j];
15205 if (M < 0)
15206 continue;
15207 int &R = RepeatMask[j];
15208 if (0 != ((M % NumElts) / NumLaneElts))
15209 return false;
15210 if (0 <= R && R != M)
15211 return false;
15212 R = M;
15214 return true;
15217 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15218 if (!FindRepeatingBroadcastMask(RepeatMask))
15219 continue;
15221 // Shuffle the (lowest) repeated elements in place for broadcast.
15222 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
15224 // Shuffle the actual broadcast.
15225 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15226 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15227 for (int j = 0; j != NumBroadcastElts; ++j)
15228 BroadcastMask[i + j] = j;
15230 // Avoid returning the same shuffle operation. For example,
15231 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
15232 if (BroadcastMask == Mask)
15233 return SDValue();
15235 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
15236 BroadcastMask);
15240 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15241 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
15242 return SDValue();
15244 // Bail if we already have a repeated lane shuffle mask.
15245 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15246 return SDValue();
15248 // Helper to look for a repeated mask in each split sublane, and to check that
15249 // those sublanes can then be permuted into place.
15250 auto ShuffleSubLanes = [&](int SubLaneScale) {
15251 int NumSubLanes = NumLanes * SubLaneScale;
15252 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15254 // Check that all the sources are coming from the same lane and see if we
15255 // can form a repeating shuffle mask (local to each sub-lane). At the same
15256 // time, determine the source sub-lane for each destination sub-lane.
15257 int TopSrcSubLane = -1;
15258 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15259 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
15260 SubLaneScale,
15261 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
15263 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15264 // Extract the sub-lane mask, check that it all comes from the same lane
15265 // and normalize the mask entries to come from the first lane.
15266 int SrcLane = -1;
15267 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15268 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15269 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15270 if (M < 0)
15271 continue;
15272 int Lane = (M % NumElts) / NumLaneElts;
15273 if ((0 <= SrcLane) && (SrcLane != Lane))
15274 return SDValue();
15275 SrcLane = Lane;
15276 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15277 SubLaneMask[Elt] = LocalM;
15280 // Whole sub-lane is UNDEF.
15281 if (SrcLane < 0)
15282 continue;
15284 // Attempt to match against the candidate repeated sub-lane masks.
15285 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15286 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
15287 for (int i = 0; i != NumSubLaneElts; ++i) {
15288 if (M1[i] < 0 || M2[i] < 0)
15289 continue;
15290 if (M1[i] != M2[i])
15291 return false;
15293 return true;
15296 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15297 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15298 continue;
15300 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15301 for (int i = 0; i != NumSubLaneElts; ++i) {
15302 int M = SubLaneMask[i];
15303 if (M < 0)
15304 continue;
15305 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15306 "Unexpected mask element");
15307 RepeatedSubLaneMask[i] = M;
15310 // Track the topmost source sub-lane - by setting the remaining to
15311 // UNDEF we can greatly simplify shuffle matching.
15312 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15313 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15314 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15315 break;
15318 // Bail if we failed to find a matching repeated sub-lane mask.
15319 if (Dst2SrcSubLanes[DstSubLane] < 0)
15320 return SDValue();
15322 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15323 "Unexpected source lane");
15325 // Create a repeating shuffle mask for the entire vector.
15326 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15327 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15328 int Lane = SubLane / SubLaneScale;
15329 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15330 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15331 int M = RepeatedSubLaneMask[Elt];
15332 if (M < 0)
15333 continue;
15334 int Idx = (SubLane * NumSubLaneElts) + Elt;
15335 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15339 // Shuffle each source sub-lane to its destination.
15340 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15341 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15342 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15343 if (SrcSubLane < 0)
15344 continue;
15345 for (int j = 0; j != NumSubLaneElts; ++j)
15346 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15349 // Avoid returning the same shuffle operation.
15350 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
15351 if (RepeatedMask == Mask || SubLaneMask == Mask)
15352 return SDValue();
15354 SDValue RepeatedShuffle =
15355 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
15357 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
15358 SubLaneMask);
15361 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15362 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
15363 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
15364 // Otherwise we can only permute whole 128-bit lanes.
15365 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
15366 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
15367 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
15368 MinSubLaneScale = 2;
15369 MaxSubLaneScale =
15370 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
15372 if (Subtarget.hasBWI() && VT == MVT::v64i8)
15373 MinSubLaneScale = MaxSubLaneScale = 4;
15375 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
15376 if (SDValue Shuffle = ShuffleSubLanes(Scale))
15377 return Shuffle;
15379 return SDValue();
15382 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
15383 bool &ForceV1Zero, bool &ForceV2Zero,
15384 unsigned &ShuffleImm, ArrayRef<int> Mask,
15385 const APInt &Zeroable) {
15386 int NumElts = VT.getVectorNumElements();
15387 assert(VT.getScalarSizeInBits() == 64 &&
15388 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15389 "Unexpected data type for VSHUFPD");
15390 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
15391 "Illegal shuffle mask");
15393 bool ZeroLane[2] = { true, true };
15394 for (int i = 0; i < NumElts; ++i)
15395 ZeroLane[i & 1] &= Zeroable[i];
15397 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
15398 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
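// e.g. the v4f64 mask <0,5,2,7> matches directly with ShuffleImm = 0b1010,
// i.e. SHUFPD selecting V1[0], V2[1], V1[2], V2[3].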
15399 ShuffleImm = 0;
15400 bool ShufpdMask = true;
15401 bool CommutableMask = true;
15402 for (int i = 0; i < NumElts; ++i) {
15403 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
15404 continue;
15405 if (Mask[i] < 0)
15406 return false;
15407 int Val = (i & 6) + NumElts * (i & 1);
15408 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15409 if (Mask[i] < Val || Mask[i] > Val + 1)
15410 ShufpdMask = false;
15411 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15412 CommutableMask = false;
15413 ShuffleImm |= (Mask[i] % 2) << i;
15416 if (!ShufpdMask && !CommutableMask)
15417 return false;
15419 if (!ShufpdMask && CommutableMask)
15420 std::swap(V1, V2);
15422 ForceV1Zero = ZeroLane[0];
15423 ForceV2Zero = ZeroLane[1];
15424 return true;
15427 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
15428 SDValue V2, ArrayRef<int> Mask,
15429 const APInt &Zeroable,
15430 const X86Subtarget &Subtarget,
15431 SelectionDAG &DAG) {
15432 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15433 "Unexpected data type for VSHUFPD");
15435 unsigned Immediate = 0;
15436 bool ForceV1Zero = false, ForceV2Zero = false;
15437 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
15438 Mask, Zeroable))
15439 return SDValue();
15441 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15442 if (ForceV1Zero)
15443 V1 = getZeroVector(VT, Subtarget, DAG, DL);
15444 if (ForceV2Zero)
15445 V2 = getZeroVector(VT, Subtarget, DAG, DL);
15447 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15448 DAG.getTargetConstant(Immediate, DL, MVT::i8));
15451 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
15452 // by zeroable elements in the remaining 24 elements. Turn this into two
15453 // vpmovqb (VTRUNC) instructions shuffled together.
15454 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
15455 SDValue V1, SDValue V2,
15456 ArrayRef<int> Mask,
15457 const APInt &Zeroable,
15458 SelectionDAG &DAG) {
15459 assert(VT == MVT::v32i8 && "Unexpected type!");
15461 // The first 8 indices should be every 8th element.
15462 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
15463 return SDValue();
15465 // Remaining elements need to be zeroable.
15466 if (Zeroable.countl_one() < (Mask.size() - 8))
15467 return SDValue();
15469 V1 = DAG.getBitcast(MVT::v4i64, V1);
15470 V2 = DAG.getBitcast(MVT::v4i64, V2);
15472 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
15473 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
15475 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
15476 // the upper bits of the result using an unpckldq.
15477 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
15478 { 0, 1, 2, 3, 16, 17, 18, 19,
15479 4, 5, 6, 7, 20, 21, 22, 23 });
15480 // Insert the unpckldq into a zero vector to widen to v32i8.
15481 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
15482 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
15483 DAG.getIntPtrConstant(0, DL));
15486 // a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
15487 // b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
15488 // =>
15489 // ul = unpckl v1, v2
15490 // uh = unpckh v1, v2
15491 // a = vperm ul, uh
15492 // b = vperm ul, uh
15494 // Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
15495 // and permute. We cannot directly match v3 because it is split into two
15496 // 256-bit vectors in earlier isel stages. Therefore, this function matches a
15497 // pair of 256-bit shuffles and makes sure the masks are consecutive.
15499 // Once unpck and permute nodes are created, the permute corresponding to this
15500 // shuffle is returned, while the other permute replaces the other half of the
15501 // shuffle in the selection dag.
15502 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
15503 SDValue V1, SDValue V2,
15504 ArrayRef<int> Mask,
15505 SelectionDAG &DAG) {
15506 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
15507 VT != MVT::v32i8)
15508 return SDValue();
15509 // <B0, B1, B0+1, B1+1, ..., >
15510 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
15511 unsigned Begin1) {
15512 size_t Size = Mask.size();
15513 assert(Size % 2 == 0 && "Expected even mask size");
15514 for (unsigned I = 0; I < Size; I += 2) {
15515 if (Mask[I] != (int)(Begin0 + I / 2) ||
15516 Mask[I + 1] != (int)(Begin1 + I / 2))
15517 return false;
15519 return true;
15521 // Check which half of the interleave this shuffle node is.
15522 int NumElts = VT.getVectorNumElements();
15523 size_t FirstQtr = NumElts / 2;
15524 size_t ThirdQtr = NumElts + NumElts / 2;
15525 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
15526 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
15527 if (!IsFirstHalf && !IsSecondHalf)
15528 return SDValue();
15530 // Find the intersection between shuffle users of V1 and V2.
15531 SmallVector<SDNode *, 2> Shuffles;
15532 for (SDNode *User : V1->uses())
15533 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
15534 User->getOperand(1) == V2)
15535 Shuffles.push_back(User);
15536 // Limit the number of shuffle users to two for now.
15537 if (Shuffles.size() != 2)
15538 return SDValue();
15539 // Find out which half of the 512-bit shuffle each of the smaller shuffles is
15540 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
15541 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
15542 SDNode *FirstHalf;
15543 SDNode *SecondHalf;
15544 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
15545 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
15546 FirstHalf = Shuffles[0];
15547 SecondHalf = Shuffles[1];
15548 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
15549 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
15550 FirstHalf = Shuffles[1];
15551 SecondHalf = Shuffles[0];
15552 } else {
15553 return SDValue();
15555 // Lower into unpck and perm. Return the perm of this shuffle and replace
15556 // the other.
15557 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
15558 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
15559 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15560 DAG.getTargetConstant(0x20, DL, MVT::i8));
15561 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15562 DAG.getTargetConstant(0x31, DL, MVT::i8));
15563 if (IsFirstHalf) {
15564 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
15565 return Perm1;
15567 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
15568 return Perm2;
15571 /// Handle lowering of 4-lane 64-bit floating point shuffles.
15573 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
15574 /// isn't available.
15575 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15576 const APInt &Zeroable, SDValue V1, SDValue V2,
15577 const X86Subtarget &Subtarget,
15578 SelectionDAG &DAG) {
15579 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15580 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15581 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15583 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
15584 Subtarget, DAG))
15585 return V;
15587 if (V2.isUndef()) {
15588 // Check for being able to broadcast a single element.
15589 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
15590 Mask, Subtarget, DAG))
15591 return Broadcast;
15593 // Use low duplicate instructions for masks that match their pattern.
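// MOVDDUP duplicates the even-indexed double of each 128-bit lane, which is
// exactly the <0, 0, 2, 2> pattern for v4f64.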
15594 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15595 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
15597 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
15598 // Non-half-crossing single input shuffles can be lowered with an
15599 // interleaved permutation.
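// For VPERMILPD each immediate bit selects the low (0) or high (1) double
// within the corresponding 128-bit lane, e.g. mask <1, 0, 3, 2> encodes as
// 0b0101.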
15600 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
15601 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
15602 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
15603 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
15606 // With AVX2 we have direct support for this permutation.
15607 if (Subtarget.hasAVX2())
15608 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
15609 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15611 // Try to create an in-lane repeating shuffle mask and then shuffle the
15612 // results into the target lanes.
15613 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15614 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15615 return V;
15617 // Try to permute the lanes and then use a per-lane permute.
15618 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
15619 Mask, DAG, Subtarget))
15620 return V;
15622 // Otherwise, fall back.
15623 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
15624 DAG, Subtarget);
15627 // Use dedicated unpack instructions for masks that match their pattern.
15628 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
15629 return V;
15631 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
15632 Zeroable, Subtarget, DAG))
15633 return Blend;
15635 // Check if the blend happens to exactly fit that of SHUFPD.
15636 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
15637 Zeroable, Subtarget, DAG))
15638 return Op;
15640 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
15641 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
15643 // If we have lane crossing shuffles AND they don't all come from the lower
15644 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15645 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
15646 // canonicalizes to a blend of splats, which isn't necessary for this combine.
15647 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
15648 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
15649 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
15650 (V2.getOpcode() != ISD::BUILD_VECTOR))
15651 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
15653 // If we have one input in place, then we can permute the other input and
15654 // blend the result.
15655 if (V1IsInPlace || V2IsInPlace)
15656 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15657 Subtarget, DAG);
15659 // Try to create an in-lane repeating shuffle mask and then shuffle the
15660 // results into the target lanes.
15661 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15662 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15663 return V;
15665 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15666 // shuffle. However, if we have AVX2 and either input is already in place,
15667 // we will be able to shuffle the other input even across lanes in a single
15668 // instruction, so skip this pattern.
15669 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
15670 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
15671 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15672 return V;
15674 // If we have VLX support, we can use VEXPAND.
15675 if (Subtarget.hasVLX())
15676 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
15677 DAG, Subtarget))
15678 return V;
15680 // If we have AVX2 then we always want to lower with a blend because at v4 we
15681 // can fully permute the elements.
15682 if (Subtarget.hasAVX2())
15683 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15684 Subtarget, DAG);
15686 // Otherwise fall back on generic lowering.
15687 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
15688 Subtarget, DAG);
15691 /// Handle lowering of 4-lane 64-bit integer shuffles.
15693 /// This routine is only called when we have AVX2 and thus a reasonable
15694 /// instruction set for v4i64 shuffling.
15695 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15696 const APInt &Zeroable, SDValue V1, SDValue V2,
15697 const X86Subtarget &Subtarget,
15698 SelectionDAG &DAG) {
15699 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15700 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15701 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15702 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
15704 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15705 Subtarget, DAG))
15706 return V;
15708 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
15709 Zeroable, Subtarget, DAG))
15710 return Blend;
15712 // Check for being able to broadcast a single element.
15713 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
15714 Subtarget, DAG))
15715 return Broadcast;
15717 // Try to use shift instructions if fast.
15718 if (Subtarget.preferLowerShuffleAsShift())
15719 if (SDValue Shift =
15720 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15721 Subtarget, DAG, /*BitwiseOnly*/ true))
15722 return Shift;
15724 if (V2.isUndef()) {
15725 // When the shuffle is mirrored between the 128-bit lanes of the vector, we
15726 // can use lower-latency instructions that will operate on both lanes.
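// A lane-repeated v4i64 mask can be expressed as a v8i32 PSHUFD by splitting
// each 64-bit index i into the pair {2*i, 2*i+1}; e.g. the repeated mask
// <1, 0> becomes the PSHUFD mask <2, 3, 0, 1>.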
15727 SmallVector<int, 2> RepeatedMask;
15728 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
15729 SmallVector<int, 4> PSHUFDMask;
15730 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
15731 return DAG.getBitcast(
15732 MVT::v4i64,
15733 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
15734 DAG.getBitcast(MVT::v8i32, V1),
15735 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15738 // AVX2 provides a direct instruction for permuting a single input across
15739 // lanes.
15740 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
15741 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15744 // Try to use shift instructions.
15745 if (SDValue Shift =
15746 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
15747 DAG, /*BitwiseOnly*/ false))
15748 return Shift;
15750 // If we have VLX support, we can use VALIGN or VEXPAND.
15751 if (Subtarget.hasVLX()) {
15752 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
15753 Zeroable, Subtarget, DAG))
15754 return Rotate;
15756 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
15757 DAG, Subtarget))
15758 return V;
15761 // Try to use PALIGNR.
15762 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
15763 Subtarget, DAG))
15764 return Rotate;
15766 // Use dedicated unpack instructions for masks that match their pattern.
15767 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
15768 return V;
15770 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
15771 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
15773 // If we have one input in place, then we can permute the other input and
15774 // blend the result.
15775 if (V1IsInPlace || V2IsInPlace)
15776 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
15777 Subtarget, DAG);
15779 // Try to create an in-lane repeating shuffle mask and then shuffle the
15780 // results into the target lanes.
15781 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15782 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
15783 return V;
15785 // Try to lower to PERMQ(BLENDD(V1,V2)).
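// i.e. blend the two sources while keeping every element at its original
// index, then move the blended elements into place with a single cross-lane
// VPERMQ.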
15786 if (SDValue V =
15787 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
15788 return V;
15790 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15791 // shuffle. However, if we have AVX2 and either input is already in place,
15792 // we will be able to shuffle the other input even across lanes in a single
15793 // instruction, so skip this pattern.
15794 if (!V1IsInPlace && !V2IsInPlace)
15795 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
15796 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
15797 return Result;
15799 // Otherwise fall back on generic blend lowering.
15800 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
15801 Subtarget, DAG);
15804 /// Handle lowering of 8-lane 32-bit floating point shuffles.
15806 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
15807 /// isn't available.
15808 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15809 const APInt &Zeroable, SDValue V1, SDValue V2,
15810 const X86Subtarget &Subtarget,
15811 SelectionDAG &DAG) {
15812 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
15813 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
15814 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15816 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
15817 Zeroable, Subtarget, DAG))
15818 return Blend;
15820 // Check for being able to broadcast a single element.
15821 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
15822 Subtarget, DAG))
15823 return Broadcast;
15825 if (!Subtarget.hasAVX2()) {
15826 SmallVector<int> InLaneMask;
15827 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15829 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
15830 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
15831 /*SimpleOnly*/ true))
15832 return R;
15834 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
15835 Zeroable, Subtarget, DAG))
15836 return DAG.getBitcast(MVT::v8f32, ZExt);
15838 // If the shuffle mask is repeated in each 128-bit lane, we have many more
15839 // options to efficiently lower the shuffle.
15840 SmallVector<int, 4> RepeatedMask;
15841 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
15842 assert(RepeatedMask.size() == 4 &&
15843 "Repeated masks must be half the mask width!");
15845 // Use even/odd duplicate instructions for masks that match their pattern.
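// MOVSLDUP duplicates the even-indexed float of each pair (<0, 0, 2, 2> per
// lane) and MOVSHDUP the odd-indexed one (<1, 1, 3, 3> per lane).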
15846 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
15847 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
15848 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
15849 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
15851 if (V2.isUndef())
15852 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
15853 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
15855 // Use dedicated unpack instructions for masks that match their pattern.
15856 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
15857 return V;
15859 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
15860 // have already handled any direct blends.
15861 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
15864 // Try to create an in-lane repeating shuffle mask and then shuffle the
15865 // results into the target lanes.
15866 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15867 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
15868 return V;
15870 // If we have a single input shuffle with different shuffle patterns in the
15871 // two 128-bit lanes, use a variable-mask VPERMILPS.
15872 if (V2.isUndef()) {
15873 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
15874 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
15875 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
15877 if (Subtarget.hasAVX2()) {
15878 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
15879 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
15881 // Otherwise, fall back.
15882 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
15883 DAG, Subtarget);
15886 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15887 // shuffle.
15888 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
15889 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
15890 return Result;
15892 // If we have VLX support, we can use VEXPAND.
15893 if (Subtarget.hasVLX())
15894 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
15895 DAG, Subtarget))
15896 return V;
15898 // Try to match an interleave of two v8f32s and lower them as unpck and
15899 // permutes using ymms. This needs to go before we try to split the vectors.
15901 // TODO: Expand this to AVX1. Currently v8i32 is cast to v8f32 and hits
15902 // this path inadvertently.
15903 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
15904 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
15905 Mask, DAG))
15906 return V;
15908 // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern then
15909 // try to split, since after the split we get more efficient code using
15910 // vpunpcklwd and vpunpckhwd than with vblend.
15911 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
15912 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
15913 DAG);
15915 // If we have AVX2 then we always want to lower with a blend because at v8 we
15916 // can fully permute the elements.
15917 if (Subtarget.hasAVX2())
15918 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
15919 Subtarget, DAG);
15921 // Otherwise fall back on generic lowering.
15922 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
15923 Subtarget, DAG);
15926 /// Handle lowering of 8-lane 32-bit integer shuffles.
15928 /// This routine is only called when we have AVX2 and thus a reasonable
15929 /// instruction set for v8i32 shuffling.
15930 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15931 const APInt &Zeroable, SDValue V1, SDValue V2,
15932 const X86Subtarget &Subtarget,
15933 SelectionDAG &DAG) {
15934 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
15935 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
15936 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15937 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
15939 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
15941 // Whenever we can lower this as a zext, that instruction is strictly faster
15942 // than any alternative. It also allows us to fold memory operands into the
15943 // shuffle in many cases.
15944 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
15945 Zeroable, Subtarget, DAG))
15946 return ZExt;
15948 // Try to match an interleave of two v8i32s and lower them as unpck and
15949 // permutes using ymms. This needs to go before we try to split the vectors.
15950 if (!Subtarget.hasAVX512())
15951 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
15952 Mask, DAG))
15953 return V;
15955 // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern then
15956 // try to split, since after the split we get more efficient code than vblend
15957 // by using the vpunpcklwd and vpunpckhwd instructions.
15958 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
15959 !Subtarget.hasAVX512())
15960 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
15961 DAG);
15963 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
15964 Zeroable, Subtarget, DAG))
15965 return Blend;
15967 // Check for being able to broadcast a single element.
15968 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
15969 Subtarget, DAG))
15970 return Broadcast;
15972 // Try to use shift instructions if fast.
15973 if (Subtarget.preferLowerShuffleAsShift()) {
15974 if (SDValue Shift =
15975 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
15976 Subtarget, DAG, /*BitwiseOnly*/ true))
15977 return Shift;
15978 if (NumV2Elements == 0)
15979 if (SDValue Rotate =
15980 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
15981 return Rotate;
15984 // If the shuffle mask is repeated in each 128-bit lane we can use more
15985 // efficient instructions that mirror the shuffles across the two 128-bit
15986 // lanes.
15987 SmallVector<int, 4> RepeatedMask;
15988 bool Is128BitLaneRepeatedShuffle =
15989 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
15990 if (Is128BitLaneRepeatedShuffle) {
15991 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
15992 if (V2.isUndef())
15993 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
15994 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
15996 // Use dedicated unpack instructions for masks that match their pattern.
15997 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
15998 return V;
16001 // Try to use shift instructions.
16002 if (SDValue Shift =
16003 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16004 DAG, /*BitwiseOnly*/ false))
16005 return Shift;
16007 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16008 if (SDValue Rotate =
16009 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16010 return Rotate;
16012 // If we have VLX support, we can use VALIGN or EXPAND.
16013 if (Subtarget.hasVLX()) {
16014 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16015 Zeroable, Subtarget, DAG))
16016 return Rotate;
16018 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
16019 DAG, Subtarget))
16020 return V;
16023 // Try to use byte rotation instructions.
16024 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16025 Subtarget, DAG))
16026 return Rotate;
16028 // Try to create an in-lane repeating shuffle mask and then shuffle the
16029 // results into the target lanes.
16030 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16031 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16032 return V;
16034 if (V2.isUndef()) {
16035 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16036 // because that should be faster than the variable permute alternatives.
16037 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
16038 return V;
16040 // If the shuffle patterns aren't repeated but it's a single input, directly
16041 // generate a cross-lane VPERMD instruction.
16042 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16043 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16046 // Assume that a single SHUFPS is faster than an alternative sequence of
16047 // multiple instructions (even if the CPU has a domain penalty).
16048 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
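// SHUFPS takes its two low result elements from the first operand and its
// two high result elements from the second; the check below tests whether
// the repeated mask has that shape.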
16049 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16050 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16051 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16052 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16053 CastV1, CastV2, DAG);
16054 return DAG.getBitcast(MVT::v8i32, ShufPS);
16057 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16058 // shuffle.
16059 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16060 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16061 return Result;
16063 // Otherwise fall back on generic blend lowering.
16064 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16065 Subtarget, DAG);
16068 /// Handle lowering of 16-lane 16-bit integer shuffles.
16070 /// This routine is only called when we have AVX2 and thus a reasonable
16071 /// instruction set for v16i16 shuffling.
16072 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16073 const APInt &Zeroable, SDValue V1, SDValue V2,
16074 const X86Subtarget &Subtarget,
16075 SelectionDAG &DAG) {
16076 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16077 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16078 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16079 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16081 // Whenever we can lower this as a zext, that instruction is strictly faster
16082 // than any alternative. It also allows us to fold memory operands into the
16083 // shuffle in many cases.
16084 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16085 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16086 return ZExt;
16088 // Check for being able to broadcast a single element.
16089 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16090 Subtarget, DAG))
16091 return Broadcast;
16093 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16094 Zeroable, Subtarget, DAG))
16095 return Blend;
16097 // Use dedicated unpack instructions for masks that match their pattern.
16098 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
16099 return V;
16101 // Use dedicated pack instructions for masks that match their pattern.
16102 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
16103 Subtarget))
16104 return V;
16106 // Try to lower using a truncation.
16107 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16108 Subtarget, DAG))
16109 return V;
16111 // Try to use shift instructions.
16112 if (SDValue Shift =
16113 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16114 Subtarget, DAG, /*BitwiseOnly*/ false))
16115 return Shift;
16117 // Try to use byte rotation instructions.
16118 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16119 Subtarget, DAG))
16120 return Rotate;
16122 // Try to create an in-lane repeating shuffle mask and then shuffle the
16123 // results into the target lanes.
16124 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16125 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16126 return V;
16128 if (V2.isUndef()) {
16129 // Try to use bit rotation instructions.
16130 if (SDValue Rotate =
16131 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16132 return Rotate;
16134 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16135 // because that should be faster than the variable permute alternatives.
16136 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
16137 return V;
16139 // There are no generalized cross-lane shuffle operations available on i16
16140 // element types.
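// (AVX2's cross-lane permutes - VPERMD/VPERMPS and VPERMQ/VPERMPD - only
// handle 32/64-bit elements; a full VPERMW requires AVX-512 BWI and is
// handled further below.)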
16141 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16142 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16143 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16144 return V;
16146 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16147 DAG, Subtarget);
16150 SmallVector<int, 8> RepeatedMask;
16151 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16152 // As this is a single-input shuffle, the repeated mask should be
16153 // a strictly valid v8i16 mask that we can pass through to the v8i16
16154 // lowering to handle even the v16 case.
16155 return lowerV8I16GeneralSingleInputShuffle(
16156 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16160 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16161 Zeroable, Subtarget, DAG))
16162 return PSHUFB;
16164 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16165 if (Subtarget.hasBWI())
16166 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
16168 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16169 // shuffle.
16170 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16171 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16172 return Result;
16174 // Try to permute the lanes and then use a per-lane permute.
16175 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16176 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16177 return V;
16179 // Try to match an interleave of two v16i16s and lower them as unpck and
16180 // permutes using ymms.
16181 if (!Subtarget.hasAVX512())
16182 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
16183 Mask, DAG))
16184 return V;
16186 // Otherwise fall back on generic lowering.
16187 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
16188 Subtarget, DAG);
16191 /// Handle lowering of 32-lane 8-bit integer shuffles.
16193 /// This routine is only called when we have AVX2 and thus a reasonable
16194 /// instruction set for v32i8 shuffling.
16195 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16196 const APInt &Zeroable, SDValue V1, SDValue V2,
16197 const X86Subtarget &Subtarget,
16198 SelectionDAG &DAG) {
16199 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16200 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16201 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16202 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16204 // Whenever we can lower this as a zext, that instruction is strictly faster
16205 // than any alternative. It also allows us to fold memory operands into the
16206 // shuffle in many cases.
16207 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16208 Zeroable, Subtarget, DAG))
16209 return ZExt;
16211 // Check for being able to broadcast a single element.
16212 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16213 Subtarget, DAG))
16214 return Broadcast;
16216 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16217 Zeroable, Subtarget, DAG))
16218 return Blend;
16220 // Use dedicated unpack instructions for masks that match their pattern.
16221 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
16222 return V;
16224 // Use dedicated pack instructions for masks that match their pattern.
16225 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
16226 Subtarget))
16227 return V;
16229 // Try to lower using a truncation.
16230 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16231 Subtarget, DAG))
16232 return V;
16234 // Try to use shift instructions.
16235 if (SDValue Shift =
16236 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
16237 DAG, /*BitwiseOnly*/ false))
16238 return Shift;
16240 // Try to use byte rotation instructions.
16241 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16242 Subtarget, DAG))
16243 return Rotate;
16245 // Try to use bit rotation instructions.
16246 if (V2.isUndef())
16247 if (SDValue Rotate =
16248 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16249 return Rotate;
16251 // Try to create an in-lane repeating shuffle mask and then shuffle the
16252 // results into the target lanes.
16253 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16254 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16255 return V;
16257 // There are no generalized cross-lane shuffle operations available on i8
16258 // element types.
16259 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16260 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16261 // because that should be faster than the variable permute alternatives.
16262 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
16263 return V;
16265 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16266 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16267 return V;
16269 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16270 DAG, Subtarget);
16273 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16274 Zeroable, Subtarget, DAG))
16275 return PSHUFB;
16277 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16278 if (Subtarget.hasVBMI())
16279 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
16281 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16282 // shuffle.
16283 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16284 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16285 return Result;
16287 // Try to permute the lanes and then use a per-lane permute.
16288 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16289 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16290 return V;
16292 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16293 // by zeroable elements in the remaining 24 elements. Turn this into two
16294 // vmovqb instructions shuffled together.
16295 if (Subtarget.hasVLX())
16296 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16297 Mask, Zeroable, DAG))
16298 return V;
16300 // Try to match an interleave of two v32i8s and lower them as unpck and
16301 // permutes using ymms.
16302 if (!Subtarget.hasAVX512())
16303 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
16304 Mask, DAG))
16305 return V;
16307 // Otherwise fall back on generic lowering.
16308 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
16309 Subtarget, DAG);
16312 /// High-level routine to lower various 256-bit x86 vector shuffles.
16314 /// This routine either breaks down the specific type of a 256-bit x86 vector
16315 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
16316 /// together based on the available instructions.
16317 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16318 SDValue V1, SDValue V2, const APInt &Zeroable,
16319 const X86Subtarget &Subtarget,
16320 SelectionDAG &DAG) {
16321 // If we have a single input to the zero element, insert that into V1 if we
16322 // can do so cheaply.
16323 int NumElts = VT.getVectorNumElements();
16324 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16326 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16327 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16328 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16329 return Insertion;
16331 // Handle special cases where the lower or upper half is UNDEF.
16332 if (SDValue V =
16333 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16334 return V;
16336 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16337 // can check for those subtargets here and avoid much of the subtarget
16338 // querying in the per-vector-type lowering routines. With AVX1 we have
16339 // essentially *zero* ability to manipulate a 256-bit vector with integer
16340 // types. Since we'll use floating point types there eventually, just
16341 // immediately cast everything to a float and operate entirely in that domain.
16342 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16343 int ElementBits = VT.getScalarSizeInBits();
16344 if (ElementBits < 32) {
16345 // No floating point type available; if we can't use the bit operations
16346 // for masking/blending then decompose into 128-bit vectors.
16347 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16348 Subtarget, DAG))
16349 return V;
16350 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16351 return V;
16352 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16355 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16356 VT.getVectorNumElements());
16357 V1 = DAG.getBitcast(FpVT, V1);
16358 V2 = DAG.getBitcast(FpVT, V2);
16359 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16362 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
16363 V1 = DAG.getBitcast(MVT::v16i16, V1);
16364 V2 = DAG.getBitcast(MVT::v16i16, V2);
16365 return DAG.getBitcast(VT,
16366 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
16369 switch (VT.SimpleTy) {
16370 case MVT::v4f64:
16371 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16372 case MVT::v4i64:
16373 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16374 case MVT::v8f32:
16375 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16376 case MVT::v8i32:
16377 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16378 case MVT::v16i16:
16379 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16380 case MVT::v32i8:
16381 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16383 default:
16384 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16388 /// Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
16389 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
16390 const APInt &Zeroable, SDValue V1, SDValue V2,
16391 const X86Subtarget &Subtarget,
16392 SelectionDAG &DAG) {
16393 assert(VT.getScalarSizeInBits() == 64 &&
16394 "Unexpected element type size for 128bit shuffle.");
16396 // Handling a 256-bit vector requires VLX, and the lowerV2X128VectorShuffle()
16397 // function is most probably a better solution for that case.
16398 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16400 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
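// Each 128-bit chunk covers two 64-bit elements, so the 8-element mask is
// widened to a 4-element mask of chunks; e.g. <0, 1, 10, 11, 4, 5, 14, 15>
// widens to <0, 5, 2, 7>.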
16401 SmallVector<int, 4> Widened128Mask;
16402 if (!canWidenShuffleElements(Mask, Widened128Mask))
16403 return SDValue();
16404 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
16406 // Try to use an insert into a zero vector.
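// Zeroable has one bit per 64-bit element: (Zeroable & 0xf0) == 0xf0 means
// the upper four elements (256 bits) are zeroable, and (Zeroable & 0x0c) ==
// 0x0c means elements 2-3 are zeroable as well, so only a 128-bit subvector
// needs to be inserted.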
16407 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16408 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16409 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16410 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16411 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16412 DAG.getIntPtrConstant(0, DL));
16413 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16414 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16415 DAG.getIntPtrConstant(0, DL));
16418 // Check for patterns which can be matched with a single insert of a 256-bit
16419 // subvector.
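// The mask <0, 1, 2, 3, 0, 1, 2, 3> duplicates V1's low 256 bits, and
// <0, 1, 2, 3, 8, 9, 10, 11> keeps V1's low 256 bits with V2's low 256 bits
// above them; both amount to a single 256-bit subvector insert.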
16420 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
16421 if (OnlyUsesV1 ||
16422 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
16423 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
16424 SDValue SubVec =
16425 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
16426 DAG.getIntPtrConstant(0, DL));
16427 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16428 DAG.getIntPtrConstant(4, DL));
16431 // See if this is an insertion of the lower 128-bits of V2 into V1.
16432 bool IsInsert = true;
16433 int V2Index = -1;
16434 for (int i = 0; i < 4; ++i) {
16435 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16436 if (Widened128Mask[i] < 0)
16437 continue;
16439 // Make sure all V1 subvectors are in place.
16440 if (Widened128Mask[i] < 4) {
16441 if (Widened128Mask[i] != i) {
16442 IsInsert = false;
16443 break;
16445 } else {
16446 // Make sure we only have a single V2 index and it's the lowest 128 bits.
16447 if (V2Index >= 0 || Widened128Mask[i] != 4) {
16448 IsInsert = false;
16449 break;
16451 V2Index = i;
16454 if (IsInsert && V2Index >= 0) {
16455 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16456 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
16457 DAG.getIntPtrConstant(0, DL));
16458 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
16461 // See if we can widen to a 256-bit lane shuffle; we're going to lose the
16462 // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
16463 // widening where possible we at least ensure the lanes stay sequential to
16464 // help later combines.
16465 SmallVector<int, 2> Widened256Mask;
16466 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
16467 Widened128Mask.clear();
16468 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
16471 // Try to lower to vshuf64x2/vshuf32x4.
16472 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
16473 int PermMask[4] = {-1, -1, -1, -1};
16474 // Ensure elements came from the same Op.
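// VSHUF64X2/VSHUF32X4 take the two low 128-bit chunks of the result from the
// first operand and the two high chunks from the second, each selected by a
// 2-bit immediate field, hence the OpIndex = i / 2 grouping and the modulo-4
// lane index below.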
16475 for (int i = 0; i < 4; ++i) {
16476 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16477 if (Widened128Mask[i] < 0)
16478 continue;
16480 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
16481 unsigned OpIndex = i / 2;
16482 if (Ops[OpIndex].isUndef())
16483 Ops[OpIndex] = Op;
16484 else if (Ops[OpIndex] != Op)
16485 return SDValue();
16487 PermMask[i] = Widened128Mask[i] % 4;
16490 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
16491 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
16494 /// Handle lowering of 8-lane 64-bit floating point shuffles.
16495 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16496 const APInt &Zeroable, SDValue V1, SDValue V2,
16497 const X86Subtarget &Subtarget,
16498 SelectionDAG &DAG) {
16499 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16500 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16501 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16503 if (V2.isUndef()) {
16504 // Use low duplicate instructions for masks that match their pattern.
16505 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
16506 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
16508 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
16509 // Non-half-crossing single input shuffles can be lowered with an
16510 // interleaved permutation.
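// Same encoding as the v4f64 case: each of the 8 immediate bits selects the
// low (0) or high (1) double within its 128-bit lane.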
16511 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16512 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
16513 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
16514 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
16515 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
16516 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16519 SmallVector<int, 4> RepeatedMask;
16520 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
16521 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
16522 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16525 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
16526 V2, Subtarget, DAG))
16527 return Shuf128;
16529 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
16530 return Unpck;
16532 // Check if the blend happens to exactly fit that of SHUFPD.
16533 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
16534 Zeroable, Subtarget, DAG))
16535 return Op;
16537 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
16538 DAG, Subtarget))
16539 return V;
16541 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
16542 Zeroable, Subtarget, DAG))
16543 return Blend;
16545 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
16548 /// Handle lowering of 16-lane 32-bit floating point shuffles.
16549 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16550 const APInt &Zeroable, SDValue V1, SDValue V2,
16551 const X86Subtarget &Subtarget,
16552 SelectionDAG &DAG) {
16553 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16554 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16555 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16557 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16558 // options to efficiently lower the shuffle.
16559 SmallVector<int, 4> RepeatedMask;
16560 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
16561 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16563 // Use even/odd duplicate instructions for masks that match their pattern.
16564 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16565 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
16566 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16567 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
16569 if (V2.isUndef())
16570 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
16571 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16573 // Use dedicated unpack instructions for masks that match their pattern.
16574 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
16575 return V;
16577 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16578 Zeroable, Subtarget, DAG))
16579 return Blend;
16581 // Otherwise, fall back to a SHUFPS sequence.
16582 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
16585 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16586 Zeroable, Subtarget, DAG))
16587 return Blend;
16589 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16590 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16591 return DAG.getBitcast(MVT::v16f32, ZExt);
16593 // Try to create an in-lane repeating shuffle mask and then shuffle the
16594 // results into the target lanes.
16595 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16596 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
16597 return V;
16599 // If we have a single input shuffle with different shuffle patterns in the
16600 // 128-bit lanes and don't cross lanes, use a variable-mask VPERMILPS.
16601 if (V2.isUndef() &&
16602 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
16603 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
16604 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
16607 // If we have AVX512F support, we can use VEXPAND.
16608 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
16609 V1, V2, DAG, Subtarget))
16610 return V;
16612 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
16615 /// Handle lowering of 8-lane 64-bit integer shuffles.
16616 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16617 const APInt &Zeroable, SDValue V1, SDValue V2,
16618 const X86Subtarget &Subtarget,
16619 SelectionDAG &DAG) {
16620 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16621 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16622 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16624 // Try to use shift instructions if fast.
16625 if (Subtarget.preferLowerShuffleAsShift())
16626 if (SDValue Shift =
16627 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
16628 Subtarget, DAG, /*BitwiseOnly*/ true))
16629 return Shift;
16631 if (V2.isUndef()) {
16632 // When the shuffle is mirrored between the 128-bit lanes of the vector, we
16633 // can use lower-latency instructions that will operate on all four
16634 // 128-bit lanes.
16635 SmallVector<int, 2> Repeated128Mask;
16636 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
16637 SmallVector<int, 4> PSHUFDMask;
16638 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
16639 return DAG.getBitcast(
16640 MVT::v8i64,
16641 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
16642 DAG.getBitcast(MVT::v16i32, V1),
16643 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16646 SmallVector<int, 4> Repeated256Mask;
16647 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
16648 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
16649 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
16652 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
16653 V2, Subtarget, DAG))
16654 return Shuf128;
16656 // Try to use shift instructions.
16657 if (SDValue Shift =
16658 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
16659 DAG, /*BitwiseOnly*/ false))
16660 return Shift;
16662 // Try to use VALIGN.
16663 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
16664 Zeroable, Subtarget, DAG))
16665 return Rotate;
16667 // Try to use PALIGNR.
16668 if (Subtarget.hasBWI())
16669 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
16670 Subtarget, DAG))
16671 return Rotate;
16673 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
16674 return Unpck;
16676 // If we have AVX512F support, we can use VEXPAND.
16677 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
16678 DAG, Subtarget))
16679 return V;
16681 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
16682 Zeroable, Subtarget, DAG))
16683 return Blend;
16685 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
16688 /// Handle lowering of 16-lane 32-bit integer shuffles.
16689 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16690 const APInt &Zeroable, SDValue V1, SDValue V2,
16691 const X86Subtarget &Subtarget,
16692 SelectionDAG &DAG) {
16693 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16694 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16695 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16697 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16699 // Whenever we can lower this as a zext, that instruction is strictly faster
16700 // than any alternative. It also allows us to fold memory operands into the
16701 // shuffle in many cases.
16702 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16703 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16704 return ZExt;
16706 // Try to use shift instructions if fast.
16707 if (Subtarget.preferLowerShuffleAsShift()) {
16708 if (SDValue Shift =
16709 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
16710 Subtarget, DAG, /*BitwiseOnly*/ true))
16711 return Shift;
16712 if (NumV2Elements == 0)
16713 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
16714 Subtarget, DAG))
16715 return Rotate;
16718 // If the shuffle mask is repeated in each 128-bit lane we can use more
16719 // efficient instructions that mirror the shuffles across the four 128-bit
16720 // lanes.
16721 SmallVector<int, 4> RepeatedMask;
16722 bool Is128BitLaneRepeatedShuffle =
16723 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
16724 if (Is128BitLaneRepeatedShuffle) {
16725 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16726 if (V2.isUndef())
16727 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
16728 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16730 // Use dedicated unpack instructions for masks that match their pattern.
16731 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
16732 return V;
16735 // Try to use shift instructions.
16736 if (SDValue Shift =
16737 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
16738 Subtarget, DAG, /*BitwiseOnly*/ false))
16739 return Shift;
16741 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
16742 if (SDValue Rotate =
16743 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
16744 return Rotate;
16746 // Try to use VALIGN.
16747 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
16748 Zeroable, Subtarget, DAG))
16749 return Rotate;
16751 // Try to use byte rotation instructions.
16752 if (Subtarget.hasBWI())
16753 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
16754 Subtarget, DAG))
16755 return Rotate;
16757 // Assume that a single SHUFPS is faster than using a permv shuffle.
16758 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16759 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16760 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
16761 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
16762 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
16763 CastV1, CastV2, DAG);
16764 return DAG.getBitcast(MVT::v16i32, ShufPS);
16767 // Try to create an in-lane repeating shuffle mask and then shuffle the
16768 // results into the target lanes.
16769 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16770 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
16771 return V;
16773 // If we have AVX512F support, we can use VEXPAND.
16774 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
16775 DAG, Subtarget))
16776 return V;
16778 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
16779 Zeroable, Subtarget, DAG))
16780 return Blend;
16782 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
16785 /// Handle lowering of 32-lane 16-bit integer shuffles.
16786 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16787 const APInt &Zeroable, SDValue V1, SDValue V2,
16788 const X86Subtarget &Subtarget,
16789 SelectionDAG &DAG) {
16790 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
16791 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
16792 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16793 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
16795 // Whenever we can lower this as a zext, that instruction is strictly faster
16796 // than any alternative. It also allows us to fold memory operands into the
16797 // shuffle in many cases.
16798 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16799 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16800 return ZExt;
16802 // Use dedicated unpack instructions for masks that match their pattern.
16803 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
16804 return V;
16806 // Use dedicated pack instructions for masks that match their pattern.
16807 if (SDValue V =
16808 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
16809 return V;
16811 // Try to use shift instructions.
16812 if (SDValue Shift =
16813 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
16814 Subtarget, DAG, /*BitwiseOnly*/ false))
16815 return Shift;
16817 // Try to use byte rotation instructions.
16818 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
16819 Subtarget, DAG))
16820 return Rotate;
16822 if (V2.isUndef()) {
16823 // Try to use bit rotation instructions.
16824 if (SDValue Rotate =
16825 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
16826 return Rotate;
16828 SmallVector<int, 8> RepeatedMask;
16829 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
16830 // As this is a single-input shuffle, the repeated mask should be
16831 // a strictly valid v8i16 mask that we can pass through to the v8i16
16832 // lowering to handle even the v32 case.
16833 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
16834 RepeatedMask, Subtarget, DAG);
16838 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
16839 Zeroable, Subtarget, DAG))
16840 return Blend;
16842 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
16843 Zeroable, Subtarget, DAG))
16844 return PSHUFB;
16846 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
16849 /// Handle lowering of 64-lane 8-bit integer shuffles.
16850 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16851 const APInt &Zeroable, SDValue V1, SDValue V2,
16852 const X86Subtarget &Subtarget,
16853 SelectionDAG &DAG) {
16854 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
16855 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
16856 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
16857 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
16859 // Whenever we can lower this as a zext, that instruction is strictly faster
16860 // than any alternative. It also allows us to fold memory operands into the
16861 // shuffle in many cases.
16862 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16863 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16864 return ZExt;
16866 // Use dedicated unpack instructions for masks that match their pattern.
16867 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
16868 return V;
16870 // Use dedicated pack instructions for masks that match their pattern.
16871 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
16872 Subtarget))
16873 return V;
16875 // Try to use shift instructions.
16876 if (SDValue Shift =
16877 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
16878 DAG, /*BitwiseOnly*/ false))
16879 return Shift;
16881 // Try to use byte rotation instructions.
16882 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
16883 Subtarget, DAG))
16884 return Rotate;
16886 // Try to use bit rotation instructions.
16887 if (V2.isUndef())
16888 if (SDValue Rotate =
16889 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
16890 return Rotate;
16892 // Lower as AND if possible.
16893 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
16894 Zeroable, Subtarget, DAG))
16895 return Masked;
16897 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
16898 Zeroable, Subtarget, DAG))
16899 return PSHUFB;
16901 // Try to create an in-lane repeating shuffle mask and then shuffle the
16902 // results into the target lanes.
16903 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16904 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
16905 return V;
16907 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
16908 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
16909 return Result;
16911 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
16912 Zeroable, Subtarget, DAG))
16913 return Blend;
16915 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
16916 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16917 // PALIGNR will be cheaper than the second PSHUFB+OR.
16918 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
16919 Mask, Subtarget, DAG))
16920 return V;
16922 // If we can't directly blend but can use PSHUFB, that will be better as it
16923 // can both shuffle and set up the inefficient blend.
16924 bool V1InUse, V2InUse;
16925 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
16926 DAG, V1InUse, V2InUse);
16929 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16930 // shuffle.
16931 if (!V2.isUndef())
16932 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16933 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
16934 return Result;
16936 // VBMI can use VPERMV/VPERMV3 byte shuffles.
16937 if (Subtarget.hasVBMI())
16938 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
16940 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16943 /// High-level routine to lower various 512-bit x86 vector shuffles.
16945 /// This routine either breaks down the specific type of a 512-bit x86 vector
16946 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
16947 /// together based on the available instructions.
16948 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16949 MVT VT, SDValue V1, SDValue V2,
16950 const APInt &Zeroable,
16951 const X86Subtarget &Subtarget,
16952 SelectionDAG &DAG) {
16953 assert(Subtarget.hasAVX512() &&
16954 "Cannot lower 512-bit vectors w/ basic ISA!");
16956 // If we have a single input to the zero element, insert that into V1 if we
16957 // can do so cheaply.
16958 int NumElts = Mask.size();
16959 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16961 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16962 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16963 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16964 return Insertion;
16966 // Handle special cases where the lower or upper half is UNDEF.
16967 if (SDValue V =
16968 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16969 return V;
16971 // Check for being able to broadcast a single element.
16972 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
16973 Subtarget, DAG))
16974 return Broadcast;
16976 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
16977 // Try using bit ops for masking and blending before falling back to
16978 // splitting.
16979 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16980 Subtarget, DAG))
16981 return V;
16982 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16983 return V;
16985 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16988 if (VT == MVT::v32f16) {
16989 if (!Subtarget.hasBWI())
16990 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
16991 /*SimpleOnly*/ false);
16993 V1 = DAG.getBitcast(MVT::v32i16, V1);
16994 V2 = DAG.getBitcast(MVT::v32i16, V2);
16995 return DAG.getBitcast(MVT::v32f16,
16996 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
16999 // Dispatch to each element type for lowering. If we don't have support for
17000 // specific element type shuffles at 512 bits, immediately split them and
17001 // lower them. Each lowering routine of a given type is allowed to assume that
17002 // the requisite ISA extensions for that element type are available.
17003 switch (VT.SimpleTy) {
17004 case MVT::v8f64:
17005 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17006 case MVT::v16f32:
17007 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17008 case MVT::v8i64:
17009 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17010 case MVT::v16i32:
17011 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17012 case MVT::v32i16:
17013 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17014 case MVT::v64i8:
17015 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17017 default:
17018 llvm_unreachable("Not a valid 512-bit x86 vector type!");
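// Try to lower a unary vXi1 shuffle as a single KSHIFTR: every defined mask
// element must satisfy M == i + ShiftAmt for one positive ShiftAmt. For
// example, the v8i1 mask <3,4,5,6,7,u,u,u> becomes a KSHIFTR by 3 on a
// widened mask register, followed by an extract of the low subvector.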
17022 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17023 MVT VT, SDValue V1, SDValue V2,
17024 const X86Subtarget &Subtarget,
17025 SelectionDAG &DAG) {
17026 // Shuffle should be unary.
17027 if (!V2.isUndef())
17028 return SDValue();
17030 int ShiftAmt = -1;
17031 int NumElts = Mask.size();
17032 for (int i = 0; i != NumElts; ++i) {
17033 int M = Mask[i];
17034 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17035 "Unexpected mask index.");
17036 if (M < 0)
17037 continue;
17039 // The first non-undef element determines our shift amount.
17040 if (ShiftAmt < 0) {
17041 ShiftAmt = M - i;
17042 // Need to be shifting right.
17043 if (ShiftAmt <= 0)
17044 return SDValue();
17046 // All non-undef elements must shift by the same amount.
17047 if (ShiftAmt != M - i)
17048 return SDValue();
17050 assert(ShiftAmt >= 0 && "All undef?");
17052 // Great, we found a shift right.
17053 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17054 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17055 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17056 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17057 DAG.getIntPtrConstant(0, DL));
17060 // Determine if this shuffle can be implemented with a KSHIFT instruction.
17061 // Returns the shift amount if possible or -1 if not. This is a simplified
17062 // version of matchShuffleAsShift.
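// For example, with the top two elements zeroable, the v8i1 mask
// <2,3,4,5,6,7,z,z> matches KSHIFTR by 2; with the bottom two elements
// zeroable, <z,z,0,1,2,3,4,5> matches KSHIFTL by 2.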
17063 static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17064 int MaskOffset, const APInt &Zeroable) {
17065 int Size = Mask.size();
17067 auto CheckZeros = [&](int Shift, bool Left) {
17068 for (int j = 0; j < Shift; ++j)
17069 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17070 return false;
17072 return true;
17075 auto MatchShift = [&](int Shift, bool Left) {
17076 unsigned Pos = Left ? Shift : 0;
17077 unsigned Low = Left ? 0 : Shift;
17078 unsigned Len = Size - Shift;
17079 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17082 for (int Shift = 1; Shift != Size; ++Shift)
17083 for (bool Left : {true, false})
17084 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17085 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17086 return Shift;
17089 return -1;
17093 // Lower vXi1 vector shuffles.
17094 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17095 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17096 // vector, shuffle and then truncate it back.
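// For example, a v16i1 shuffle is sign-extended to v16i32 (or v16i16 when
// avoiding 512-bit ops), shuffled in that type, and converted back to a mask
// by a sign-bit compare against zero (or a truncate).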
17097 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17098 MVT VT, SDValue V1, SDValue V2,
17099 const APInt &Zeroable,
17100 const X86Subtarget &Subtarget,
17101 SelectionDAG &DAG) {
17102 assert(Subtarget.hasAVX512() &&
17103 "Cannot lower 512-bit vectors w/o basic ISA!");
17105 int NumElts = Mask.size();
17107 // Try to recognize shuffles that are just padding a subvector with zeros.
17108 int SubvecElts = 0;
17109 int Src = -1;
17110 for (int i = 0; i != NumElts; ++i) {
17111 if (Mask[i] >= 0) {
17112 // Grab the source from the first valid mask element. All subsequent elements
17113 // need to use this same source.
17114 if (Src < 0)
17115 Src = Mask[i] / NumElts;
17116 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17117 break;
17120 ++SubvecElts;
17122 assert(SubvecElts != NumElts && "Identity shuffle?");
17124 // Clip to a power of 2.
17125 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17127 // Make sure the number of zeroable bits in the top at least covers the bits
17128 // not covered by the subvector.
17129 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17130 assert(Src >= 0 && "Expected a source!");
17131 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17132 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
17133 Src == 0 ? V1 : V2,
17134 DAG.getIntPtrConstant(0, DL));
17135 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17136 DAG.getConstant(0, DL, VT),
17137 Extract, DAG.getIntPtrConstant(0, DL));
17140 // Try a simple shift right with undef elements. Later we'll try with zeros.
17141 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
17142 DAG))
17143 return Shift;
17145 // Try to match KSHIFTs.
17146 unsigned Offset = 0;
17147 for (SDValue V : { V1, V2 }) {
17148 unsigned Opcode;
17149 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
17150 if (ShiftAmt >= 0) {
17151 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
17152 MVT WideVT = Res.getSimpleValueType();
17153 // Widened right shifts need two shifts to ensure we shift in zeroes.
17154 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
17155 int WideElts = WideVT.getVectorNumElements();
17156 // Shift left to put the original vector in the MSBs of the new size.
17157 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
17158 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17159 // Increase the shift amount to account for the left shift.
17160 ShiftAmt += WideElts - NumElts;
17163 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17164 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17165 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17166 DAG.getIntPtrConstant(0, DL));
17168 Offset += NumElts; // Increment for next iteration.
17171 // If we're broadcasting a SETCC result, try to broadcast the ops instead.
17172 // TODO: What other unary shuffles would benefit from this?
17173 if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
17174 V1->hasOneUse()) {
17175 SDValue Op0 = V1.getOperand(0);
17176 SDValue Op1 = V1.getOperand(1);
17177 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17178 EVT OpVT = Op0.getValueType();
17179 return DAG.getSetCC(
17180 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
17181 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
17184 MVT ExtVT;
17185 switch (VT.SimpleTy) {
17186 default:
17187 llvm_unreachable("Expected a vector of i1 elements");
17188 case MVT::v2i1:
17189 ExtVT = MVT::v2i64;
17190 break;
17191 case MVT::v4i1:
17192 ExtVT = MVT::v4i32;
17193 break;
17194 case MVT::v8i1:
17195 // Take a 512-bit type: there are more shuffle options on KNL. If we have VLX,
17196 // use a 256-bit shuffle.
17197 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17198 break;
17199 case MVT::v16i1:
17200 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17201 // 256-bit operation available.
17202 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
17203 break;
17204 case MVT::v32i1:
17205 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17206 // 256-bit operation available.
17207 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17208 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
17209 break;
17210 case MVT::v64i1:
17211 // Fall back to scalarization. FIXME: We can do better if the shuffle
17212 // can be partitioned cleanly.
17213 if (!Subtarget.useBWIRegs())
17214 return SDValue();
17215 ExtVT = MVT::v64i8;
17216 break;
17219 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
17220 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
17222 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
17223 // The i1 elements were sign extended, so convert back via compare or truncate.
17224 int NumElems = VT.getVectorNumElements();
17225 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17226 (Subtarget.hasDQI() && (NumElems < 32)))
17227 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
17228 Shuffle, ISD::SETGT);
17230 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
17233 /// Helper function that returns true if the shuffle mask should be
17234 /// commuted to improve canonicalization.
17235 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
17236 int NumElements = Mask.size();
17238 int NumV1Elements = 0, NumV2Elements = 0;
17239 for (int M : Mask)
17240 if (M < 0)
17241 continue;
17242 else if (M < NumElements)
17243 ++NumV1Elements;
17244 else
17245 ++NumV2Elements;
17247 // Commute the shuffle as needed such that more elements come from V1 than
17248 // V2. This allows us to match the shuffle pattern strictly on how many
17249 // elements come from V1 without handling the symmetric cases.
17250 if (NumV2Elements > NumV1Elements)
17251 return true;
17253 assert(NumV1Elements > 0 && "No V1 indices");
17255 if (NumV2Elements == 0)
17256 return false;
17258 // When the number of V1 and V2 elements are the same, try to minimize the
17259 // number of uses of V2 in the low half of the vector. When that is tied,
17260 // ensure that the sum of indices for V1 is equal to or lower than the sum
17261 // of indices for V2. When those are equal, try to ensure that the number of odd
17262 // indices for V1 is lower than the number of odd indices for V2.
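// For example, the 4-element mask <4,5,0,1> takes two elements from each
// source but places both V2 elements in the low half, so it is commuted to
// <0,1,4,5> with the sources swapped.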
17263 if (NumV1Elements == NumV2Elements) {
17264 int LowV1Elements = 0, LowV2Elements = 0;
17265 for (int M : Mask.slice(0, NumElements / 2))
17266 if (M >= NumElements)
17267 ++LowV2Elements;
17268 else if (M >= 0)
17269 ++LowV1Elements;
17270 if (LowV2Elements > LowV1Elements)
17271 return true;
17272 if (LowV2Elements == LowV1Elements) {
17273 int SumV1Indices = 0, SumV2Indices = 0;
17274 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17275 if (Mask[i] >= NumElements)
17276 SumV2Indices += i;
17277 else if (Mask[i] >= 0)
17278 SumV1Indices += i;
17279 if (SumV2Indices < SumV1Indices)
17280 return true;
17281 if (SumV2Indices == SumV1Indices) {
17282 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17283 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17284 if (Mask[i] >= NumElements)
17285 NumV2OddIndices += i % 2;
17286 else if (Mask[i] >= 0)
17287 NumV1OddIndices += i % 2;
17288 if (NumV2OddIndices < NumV1OddIndices)
17289 return true;
17294 return false;
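// Returns true if V is a single-use integer op from a small set that AVX512
// can execute under a mask (i8/i16 element types additionally require BWI and
// a 512-bit vector). Callers use this to avoid widening such shuffles so they
// can still be folded into a masked operation.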
17297 static bool canCombineAsMaskOperation(SDValue V,
17298 const X86Subtarget &Subtarget) {
17299 if (!Subtarget.hasAVX512())
17300 return false;
17302 if (!V.getValueType().isSimple())
17303 return false;
17305 MVT VT = V.getSimpleValueType().getScalarType();
17306 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
17307 return false;
17309 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
17310 // are preferable to blendw/blendvb/masked-mov.
17311 if ((VT == MVT::i16 || VT == MVT::i8) &&
17312 V.getSimpleValueType().getSizeInBits() < 512)
17313 return false;
17315 auto HasMaskOperation = [&](SDValue V) {
17316 // TODO: Currently we only check a limited set of opcodes. We could probably
17317 // extend this to all binary operations by checking TLI.isBinOp().
17318 switch (V->getOpcode()) {
17319 default:
17320 return false;
17321 case ISD::ADD:
17322 case ISD::SUB:
17323 case ISD::AND:
17324 case ISD::XOR:
17325 case ISD::OR:
17326 case ISD::SMAX:
17327 case ISD::SMIN:
17328 case ISD::UMAX:
17329 case ISD::UMIN:
17330 case ISD::ABS:
17331 case ISD::SHL:
17332 case ISD::SRL:
17333 case ISD::SRA:
17334 case ISD::MUL:
17335 break;
17337 if (!V->hasOneUse())
17338 return false;
17340 return true;
17343 if (HasMaskOperation(V))
17344 return true;
17346 return false;
17349 // Forward declaration.
17350 static SDValue canonicalizeShuffleMaskWithHorizOp(
17351 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
17352 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
17353 const X86Subtarget &Subtarget);
17355 /// Top-level lowering for x86 vector shuffles.
17357 /// This handles decomposition, canonicalization, and lowering of all x86
17358 /// vector shuffles. Most of the specific lowering strategies are encapsulated
17359 /// above in helper routines. The canonicalization attempts to widen shuffles
17360 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
17361 /// s.t. only one of the two inputs needs to be tested, etc.
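/// The overall flow: fold fully-undef and all-zero shuffles, try to widen the
/// mask to half as many elements of twice the width, canonicalize any
/// horizontal-op inputs, commute if profitable, and finally dispatch on the
/// vector width (128/256/512-bit or vXi1 masks).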
17362 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
17363 SelectionDAG &DAG) {
17364 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17365 ArrayRef<int> OrigMask = SVOp->getMask();
17366 SDValue V1 = Op.getOperand(0);
17367 SDValue V2 = Op.getOperand(1);
17368 MVT VT = Op.getSimpleValueType();
17369 int NumElements = VT.getVectorNumElements();
17370 SDLoc DL(Op);
17371 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17373 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17374 "Can't lower MMX shuffles");
17376 bool V1IsUndef = V1.isUndef();
17377 bool V2IsUndef = V2.isUndef();
17378 if (V1IsUndef && V2IsUndef)
17379 return DAG.getUNDEF(VT);
17381 // When we create a shuffle node we put the UNDEF node as the second operand,
17382 // but in some cases the first operand may be transformed to UNDEF.
17383 // In this case we should just commute the node.
17384 if (V1IsUndef)
17385 return DAG.getCommutedVectorShuffle(*SVOp);
17387 // Check for non-undef masks pointing at an undef vector and make the masks
17388 // undef as well. This makes it easier to match the shuffle based solely on
17389 // the mask.
17390 if (V2IsUndef &&
17391 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17392 SmallVector<int, 8> NewMask(OrigMask);
17393 for (int &M : NewMask)
17394 if (M >= NumElements)
17395 M = -1;
17396 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17399 // Check for illegal shuffle mask element index values.
17400 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17401 (void)MaskUpperLimit;
17402 assert(llvm::all_of(OrigMask,
17403 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17404 "Out of bounds shuffle index");
17406 // We actually see shuffles that are entirely re-arrangements of a set of
17407 // zero inputs. This mostly happens while decomposing complex shuffles into
17408 // simple ones. Directly lower these as a buildvector of zeros.
17409 APInt KnownUndef, KnownZero;
17410 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17412 APInt Zeroable = KnownUndef | KnownZero;
17413 if (Zeroable.isAllOnes())
17414 return getZeroVector(VT, Subtarget, DAG, DL);
17416 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
17418 // Try to collapse shuffles into using a vector type with fewer elements but
17419 // wider element types. We cap this to not form integers or floating point
17420 // elements wider than 64 bits. It does not seem beneficial to form i128
17421 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
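// For example, the v4i32 mask <0,1,6,7> widens to the v2i64 mask <0,3>.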
17422 SmallVector<int, 16> WidenedMask;
17423 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17424 !canCombineAsMaskOperation(V1, Subtarget) &&
17425 !canCombineAsMaskOperation(V2, Subtarget) &&
17426 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
17427 // Shuffle mask widening should not interfere with a broadcast opportunity
17428 // by obfuscating the operands with bitcasts.
17429 // TODO: Avoid lowering directly from this top-level function: make this
17430 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17431 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
17432 Subtarget, DAG))
17433 return Broadcast;
17435 MVT NewEltVT = VT.isFloatingPoint()
17436 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
17437 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
17438 int NewNumElts = NumElements / 2;
17439 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
17440 // Make sure that the new vector type is legal. For example, v2f64 isn't
17441 // legal on SSE1.
17442 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
17443 if (V2IsZero) {
17444 // Modify the new Mask to take all zeros from the all-zero vector.
17445 // Choose indices that are blend-friendly.
17446 bool UsedZeroVector = false;
17447 assert(is_contained(WidenedMask, SM_SentinelZero) &&
17448 "V2's non-undef elements are used?!");
17449 for (int i = 0; i != NewNumElts; ++i)
17450 if (WidenedMask[i] == SM_SentinelZero) {
17451 WidenedMask[i] = i + NewNumElts;
17452 UsedZeroVector = true;
17454 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
17455 // some elements to be undef.
17456 if (UsedZeroVector)
17457 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
17459 V1 = DAG.getBitcast(NewVT, V1);
17460 V2 = DAG.getBitcast(NewVT, V2);
17461 return DAG.getBitcast(
17462 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
17466 SmallVector<SDValue> Ops = {V1, V2};
17467 SmallVector<int> Mask(OrigMask);
17469 // Canonicalize the shuffle with any horizontal ops inputs.
17470 // NOTE: This may update Ops and Mask.
17471 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
17472 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
17473 return DAG.getBitcast(VT, HOp);
17475 V1 = DAG.getBitcast(VT, Ops[0]);
17476 V2 = DAG.getBitcast(VT, Ops[1]);
17477 assert(NumElements == (int)Mask.size() &&
17478 "canonicalizeShuffleMaskWithHorizOp "
17479 "shouldn't alter the shuffle mask size");
17481 // Commute the shuffle if it will improve canonicalization.
17482 if (canonicalizeShuffleMaskWithCommute(Mask)) {
17483 ShuffleVectorSDNode::commuteMask(Mask);
17484 std::swap(V1, V2);
17487 // For each vector width, delegate to a specialized lowering routine.
17488 if (VT.is128BitVector())
17489 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17491 if (VT.is256BitVector())
17492 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17494 if (VT.is512BitVector())
17495 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17497 if (Is1BitVector)
17498 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17500 llvm_unreachable("Unimplemented!");
17503 /// Try to lower a VSELECT instruction to a vector shuffle.
17504 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
17505 const X86Subtarget &Subtarget,
17506 SelectionDAG &DAG) {
17507 SDValue Cond = Op.getOperand(0);
17508 SDValue LHS = Op.getOperand(1);
17509 SDValue RHS = Op.getOperand(2);
17510 MVT VT = Op.getSimpleValueType();
17512 // Only non-legal VSELECTs reach this lowering; convert those into generic
17513 // shuffles and re-use the shuffle lowering path for blends.
17514 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
17515 SmallVector<int, 32> Mask;
17516 if (createShuffleMaskFromVSELECT(Mask, Cond))
17517 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
17520 return SDValue();
17523 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
17524 SDValue Cond = Op.getOperand(0);
17525 SDValue LHS = Op.getOperand(1);
17526 SDValue RHS = Op.getOperand(2);
17528 SDLoc dl(Op);
17529 MVT VT = Op.getSimpleValueType();
17530 if (isSoftF16(VT, Subtarget)) {
17531 MVT NVT = VT.changeVectorElementTypeToInteger();
17532 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
17533 DAG.getBitcast(NVT, LHS),
17534 DAG.getBitcast(NVT, RHS)));
17537 // A vselect where all conditions and data are constants can be optimized into
17538 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
17539 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
17540 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
17541 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
17542 return SDValue();
17544 // Try to lower this to a blend-style vector shuffle. This can handle all
17545 // constant condition cases.
17546 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
17547 return BlendOp;
17549 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
17550 // with patterns on the mask registers on AVX-512.
17551 MVT CondVT = Cond.getSimpleValueType();
17552 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
17553 if (CondEltSize == 1)
17554 return Op;
17556 // Variable blends are only legal from SSE4.1 onward.
17557 if (!Subtarget.hasSSE41())
17558 return SDValue();
17560 unsigned EltSize = VT.getScalarSizeInBits();
17561 unsigned NumElts = VT.getVectorNumElements();
17563 // Expand v32i16/v64i8 without BWI.
17564 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
17565 return SDValue();
17567 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
17568 // into an i1 condition so that we can use the mask-based 512-bit blend
17569 // instructions.
17570 if (VT.getSizeInBits() == 512) {
17571 // Build a mask by testing the condition against zero.
17572 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
17573 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
17574 DAG.getConstant(0, dl, CondVT),
17575 ISD::SETNE);
17576 // Now return a new VSELECT using the mask.
17577 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
17580 // SEXT/TRUNC cases where the mask doesn't match the destination size.
17581 if (CondEltSize != EltSize) {
17582 // If we don't have a sign splat, rely on the expansion.
17583 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
17584 return SDValue();
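// E.g. a v8i32 VSELECT whose condition is a v8i16 sign splat has the
// condition sign-extended to v8i32 here.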
17586 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
17587 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
17588 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
17589 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
17592 // Only some types will be legal on some subtargets. If we can emit a legal
17593 // VSELECT-matching blend, return Op, but if we need to expand, return
17594 // a null value.
17595 switch (VT.SimpleTy) {
17596 default:
17597 // Most of the vector types have blends past SSE4.1.
17598 return Op;
17600 case MVT::v32i8:
17601 // The byte blends for AVX vectors were introduced only in AVX2.
17602 if (Subtarget.hasAVX2())
17603 return Op;
17605 return SDValue();
17607 case MVT::v8i16:
17608 case MVT::v16i16: {
17609 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
17610 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
17611 Cond = DAG.getBitcast(CastVT, Cond);
17612 LHS = DAG.getBitcast(CastVT, LHS);
17613 RHS = DAG.getBitcast(CastVT, RHS);
17614 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
17615 return DAG.getBitcast(VT, Select);
17620 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
17621 MVT VT = Op.getSimpleValueType();
17622 SDValue Vec = Op.getOperand(0);
17623 SDValue Idx = Op.getOperand(1);
17624 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
17625 SDLoc dl(Op);
17627 if (!Vec.getSimpleValueType().is128BitVector())
17628 return SDValue();
17630 if (VT.getSizeInBits() == 8) {
17631 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
17632 // we're going to zero extend the register or fold the store.
17633 if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
17634 !X86::mayFoldIntoStore(Op))
17635 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
17636 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17637 DAG.getBitcast(MVT::v4i32, Vec), Idx));
17639 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
17640 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
17641 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17642 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17645 if (VT == MVT::f32) {
17646 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
17647 // the result back to an FR32 register. It's only worth matching if the
17648 // result has a single use which is a store or a bitcast to i32. And in
17649 // the case of a store, it's not worth it if the index is a constant 0,
17650 // because a MOVSSmr can be used instead, which is smaller and faster.
17651 if (!Op.hasOneUse())
17652 return SDValue();
17653 SDNode *User = *Op.getNode()->use_begin();
17654 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
17655 (User->getOpcode() != ISD::BITCAST ||
17656 User->getValueType(0) != MVT::i32))
17657 return SDValue();
17658 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17659 DAG.getBitcast(MVT::v4i32, Vec), Idx);
17660 return DAG.getBitcast(MVT::f32, Extract);
17663 if (VT == MVT::i32 || VT == MVT::i64)
17664 return Op;
17666 return SDValue();
17669 /// Extract one bit from mask vector, like v16i1 or v8i1.
17670 /// AVX-512 feature.
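// For a constant index, the mask is widened to a legal width, KSHIFTR moves
// the requested bit into element 0, and that element is extracted. A variable
// index instead goes through a sign-extended integer vector.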
17671 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
17672 const X86Subtarget &Subtarget) {
17673 SDValue Vec = Op.getOperand(0);
17674 SDLoc dl(Vec);
17675 MVT VecVT = Vec.getSimpleValueType();
17676 SDValue Idx = Op.getOperand(1);
17677 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
17678 MVT EltVT = Op.getSimpleValueType();
17680 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
17681 "Unexpected vector type in ExtractBitFromMaskVector");
17683 // A variable index can't be handled in mask registers;
17684 // extend the vector to VR512/VR128.
17685 if (!IdxC) {
17686 unsigned NumElts = VecVT.getVectorNumElements();
17687 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
17688 // than extending to 128/256-bit.
17689 if (NumElts == 1) {
17690 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
17691 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
17692 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
17694 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
17695 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
17696 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
17697 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
17698 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
17701 unsigned IdxVal = IdxC->getZExtValue();
17702 if (IdxVal == 0) // the operation is legal
17703 return Op;
17705 // Extend to natively supported kshift.
17706 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
17708 // Use kshiftr instruction to move to the lower element.
17709 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
17710 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17712 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
17713 DAG.getIntPtrConstant(0, dl));
17716 // Helper to find all the extracted elements from a vector.
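// Walks N's users: constant-index extracts demand a single element, bitcasts
// are recursed through with the demanded bits rescaled, and any other user
// conservatively demands all elements.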
17717 static APInt getExtractedDemandedElts(SDNode *N) {
17718 MVT VT = N->getSimpleValueType(0);
17719 unsigned NumElts = VT.getVectorNumElements();
17720 APInt DemandedElts = APInt::getZero(NumElts);
17721 for (SDNode *User : N->uses()) {
17722 switch (User->getOpcode()) {
17723 case X86ISD::PEXTRB:
17724 case X86ISD::PEXTRW:
17725 case ISD::EXTRACT_VECTOR_ELT:
17726 if (!isa<ConstantSDNode>(User->getOperand(1))) {
17727 DemandedElts.setAllBits();
17728 return DemandedElts;
17730 DemandedElts.setBit(User->getConstantOperandVal(1));
17731 break;
17732 case ISD::BITCAST: {
17733 if (!User->getValueType(0).isSimple() ||
17734 !User->getValueType(0).isVector()) {
17735 DemandedElts.setAllBits();
17736 return DemandedElts;
17738 APInt DemandedSrcElts = getExtractedDemandedElts(User);
17739 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
17740 break;
17742 default:
17743 DemandedElts.setAllBits();
17744 return DemandedElts;
17747 return DemandedElts;
17750 SDValue
17751 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
17752 SelectionDAG &DAG) const {
17753 SDLoc dl(Op);
17754 SDValue Vec = Op.getOperand(0);
17755 MVT VecVT = Vec.getSimpleValueType();
17756 SDValue Idx = Op.getOperand(1);
17757 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
17759 if (VecVT.getVectorElementType() == MVT::i1)
17760 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
17762 if (!IdxC) {
17763 // It's more profitable to go through memory (1 cycle throughput)
17764 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
17765 // The IACA tool was used to get a performance estimate
17766 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
17768 // example : extractelement <16 x i8> %a, i32 %i
17770 // Block Throughput: 3.00 Cycles
17771 // Throughput Bottleneck: Port5
17773 // | Num Of | Ports pressure in cycles | |
17774 // | Uops | 0 - DV | 5 | 6 | 7 | |
17775 // ---------------------------------------------
17776 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
17777 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
17778 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
17779 // Total Num Of Uops: 4
17782 // Block Throughput: 1.00 Cycles
17783 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
17785 // | | Ports pressure in cycles | |
17786 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
17787 // ---------------------------------------------------------
17788 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
17789 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
17790 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
17791 // Total Num Of Uops: 4
17793 return SDValue();
17796 unsigned IdxVal = IdxC->getZExtValue();
17798 // If this is a 256-bit vector result, first extract the 128-bit vector and
17799 // then extract the element from the 128-bit vector.
17800 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
17801 // Get the 128-bit vector.
17802 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
17803 MVT EltVT = VecVT.getVectorElementType();
17805 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
17806 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
17808 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
17809 // this can be done with a mask.
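// E.g. element 9 of a v16i32 lives in the third 128-bit chunk; once that
// chunk is extracted, the index becomes 9 & 3 == 1.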
17810 IdxVal &= ElemsPerChunk - 1;
17811 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
17812 DAG.getIntPtrConstant(IdxVal, dl));
17815 assert(VecVT.is128BitVector() && "Unexpected vector length");
17817 MVT VT = Op.getSimpleValueType();
17819 if (VT == MVT::i16) {
17820 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
17821 // we're going to zero extend the register or fold the store (SSE41 only).
17822 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
17823 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
17824 if (Subtarget.hasFP16())
17825 return Op;
17827 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
17828 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17829 DAG.getBitcast(MVT::v4i32, Vec), Idx));
17832 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
17833 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17834 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17837 if (Subtarget.hasSSE41())
17838 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
17839 return Res;
17841 // Only extract a single element from a v16i8 source - determine the common
17842 // DWORD/WORD that all extractions share, and extract the sub-byte.
17843 // TODO: Add QWORD MOVQ extraction?
17844 if (VT == MVT::i8) {
17845 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
17846 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
17848 // Extract either the lowest i32 or any i16, and extract the sub-byte.
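// E.g. if only bytes 4 and 5 of a v16i8 are ever extracted, byte 5 is taken
// from i16 element 2 and shifted right by 8.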
17849 int DWordIdx = IdxVal / 4;
17850 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
17851 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17852 DAG.getBitcast(MVT::v4i32, Vec),
17853 DAG.getIntPtrConstant(DWordIdx, dl));
17854 int ShiftVal = (IdxVal % 4) * 8;
17855 if (ShiftVal != 0)
17856 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
17857 DAG.getConstant(ShiftVal, dl, MVT::i8));
17858 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
17861 int WordIdx = IdxVal / 2;
17862 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
17863 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
17864 DAG.getBitcast(MVT::v8i16, Vec),
17865 DAG.getIntPtrConstant(WordIdx, dl));
17866 int ShiftVal = (IdxVal % 2) * 8;
17867 if (ShiftVal != 0)
17868 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
17869 DAG.getConstant(ShiftVal, dl, MVT::i8));
17870 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
17874 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
17875 if (IdxVal == 0)
17876 return Op;
17878 // Shuffle the element to the lowest element, then movss or movsh.
17879 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
17880 Mask[0] = static_cast<int>(IdxVal);
17881 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
17882 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
17883 DAG.getIntPtrConstant(0, dl));
17886 if (VT.getSizeInBits() == 64) {
17887 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
17888 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
17889 // to match extract_elt for f64.
17890 if (IdxVal == 0)
17891 return Op;
17893 // UNPCKHPD the element to the lowest double word, then movsd.
17894 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
17895 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
17896 int Mask[2] = { 1, -1 };
17897 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
17898 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
17899 DAG.getIntPtrConstant(0, dl));
17902 return SDValue();
17905 /// Insert one bit to mask vector, like v16i1 or v8i1.
17906 /// AVX-512 feature.
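// A constant index is handled by building a v1i1 from the scalar and using
// INSERT_SUBVECTOR; a variable index sign-extends to an integer vector,
// inserts there, and truncates back.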
17907 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
17908 const X86Subtarget &Subtarget) {
17909 SDLoc dl(Op);
17910 SDValue Vec = Op.getOperand(0);
17911 SDValue Elt = Op.getOperand(1);
17912 SDValue Idx = Op.getOperand(2);
17913 MVT VecVT = Vec.getSimpleValueType();
17915 if (!isa<ConstantSDNode>(Idx)) {
17916 // Non-constant index. Extend the source and destination,
17917 // insert element and then truncate the result.
17918 unsigned NumElts = VecVT.getVectorNumElements();
17919 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
17920 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
17921 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
17922 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
17923 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
17924 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
17927 // Copy into a k-register, extract to v1i1 and insert_subvector.
17928 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
17929 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
17932 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
17933 SelectionDAG &DAG) const {
17934 MVT VT = Op.getSimpleValueType();
17935 MVT EltVT = VT.getVectorElementType();
17936 unsigned NumElts = VT.getVectorNumElements();
17937 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
17939 if (EltVT == MVT::i1)
17940 return InsertBitToMaskVector(Op, DAG, Subtarget);
17942 SDLoc dl(Op);
17943 SDValue N0 = Op.getOperand(0);
17944 SDValue N1 = Op.getOperand(1);
17945 SDValue N2 = Op.getOperand(2);
17946 auto *N2C = dyn_cast<ConstantSDNode>(N2);
17948 if (EltVT == MVT::bf16) {
17949 MVT IVT = VT.changeVectorElementTypeToInteger();
17950 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
17951 DAG.getBitcast(IVT, N0),
17952 DAG.getBitcast(MVT::i16, N1), N2);
17953 return DAG.getBitcast(VT, Res);
17956 if (!N2C) {
17957 // With variable insertion indices we're usually better off spilling to the stack,
17958 // but AVX512 can use a variable compare+select by comparing against all
17959 // possible vector indices, and FP insertion has less gpr->simd traffic.
17960 if (!(Subtarget.hasBWI() ||
17961 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
17962 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
17963 return SDValue();
17965 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
17966 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
17967 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
17968 return SDValue();
17970 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
17971 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
17972 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
17974 SmallVector<SDValue, 16> RawIndices;
17975 for (unsigned I = 0; I != NumElts; ++I)
17976 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
17977 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
17979 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
17980 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
17981 ISD::CondCode::SETEQ);
17984 if (N2C->getAPIntValue().uge(NumElts))
17985 return SDValue();
17986 uint64_t IdxVal = N2C->getZExtValue();
17988 bool IsZeroElt = X86::isZeroNode(N1);
17989 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
17991 if (IsZeroElt || IsAllOnesElt) {
17992 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
17993 // We don't deal with i8 0 since it appears to be handled elsewhere.
17994 if (IsAllOnesElt &&
17995 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
17996 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
17997 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
17998 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
17999 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18000 CstVectorElts[IdxVal] = OnesCst;
18001 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18002 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18004 // See if we can do this more efficiently with a blend shuffle with a
18005 // rematerializable vector.
18006 if (Subtarget.hasSSE41() &&
18007 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18008 SmallVector<int, 8> BlendMask;
18009 for (unsigned i = 0; i != NumElts; ++i)
18010 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18011 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18012 : getOnesVector(VT, DAG, dl);
18013 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18017 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18018 // into that, and then insert the subvector back into the result.
18019 if (VT.is256BitVector() || VT.is512BitVector()) {
18020 // With a 256-bit vector, we can insert into the zero element efficiently
18021 // using a blend if we have AVX or AVX2 and the right data type.
18022 if (VT.is256BitVector() && IdxVal == 0) {
18023 // TODO: It is worthwhile to cast integer to floating point and back
18024 // and incur a domain crossing penalty if that's what we'll end up
18025 // doing anyway after extracting to a 128-bit vector.
18026 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18027 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18028 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18029 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18030 DAG.getTargetConstant(1, dl, MVT::i8));
18034 unsigned NumEltsIn128 = 128 / EltSizeInBits;
18035 assert(isPowerOf2_32(NumEltsIn128) &&
18036 "Vectors will always have power-of-two number of elements.");
18038 // If we are not inserting into the low 128-bit vector chunk,
18039 // then prefer the broadcast+blend sequence.
18040 // FIXME: relax the profitability check iff all N1 uses are insertions.
18041 if (IdxVal >= NumEltsIn128 &&
18042 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
18043 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
18044 X86::mayFoldLoad(N1, Subtarget)))) {
18045 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
18046 SmallVector<int, 8> BlendMask;
18047 for (unsigned i = 0; i != NumElts; ++i)
18048 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18049 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
18052 // Get the desired 128-bit vector chunk.
18053 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18055 // Insert the element into the desired chunk.
18056 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18057 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18059 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18060 DAG.getIntPtrConstant(IdxIn128, dl));
18062 // Insert the changed part back into the bigger vector
18063 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18065 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18067 // This will be just movw/movd/movq/movsh/movss/movsd.
18068 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18069 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18070 EltVT == MVT::f16 || EltVT == MVT::i64) {
18071 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18072 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18075 // We can't directly insert an i8 or i16 into a vector, so zero extend
18076 // it to i32 first.
18077 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18078 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18079 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
18080 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18081 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18082 return DAG.getBitcast(VT, N1);
18086 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
18087 // argument. SSE41 required for pinsrb.
18088 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18089 unsigned Opc;
18090 if (VT == MVT::v8i16) {
18091 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18092 Opc = X86ISD::PINSRW;
18093 } else {
18094 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18095 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18096 Opc = X86ISD::PINSRB;
18099 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18100 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18101 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
18102 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18105 if (Subtarget.hasSSE41()) {
18106 if (EltVT == MVT::f32) {
18107 // Bits [7:6] of the constant are the source select. This will always be
18108 // zero here. The DAG Combiner may combine an extract_elt index into
18109 // these bits. For example (insert (extract, 3), 2) could be matched by
18110 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18111 // Bits [5:4] of the constant are the destination select. This is the
18112 // value of the incoming immediate.
18113 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18114 // combine either bitwise AND or insert of float 0.0 to set these bits.
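// E.g. inserting into element 2 uses the immediate (2 << 4) == 0x20: a
// destination select of 2 with an empty zero mask.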
18116 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18117 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
18118 // If this is an insertion of 32-bits into the low 32-bits of
18119 // a vector, we prefer to generate a blend with immediate rather
18120 // than an insertps. Blends are simpler operations in hardware and so
18121 // will always have equal or better performance than insertps.
18122 // But if optimizing for size and there's a load folding opportunity,
18123 // generate insertps because blendps does not have a 32-bit memory
18124 // operand form.
18125 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18126 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18127 DAG.getTargetConstant(1, dl, MVT::i8));
18129 // Create this as a scalar to vector.
18130 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18131 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18132 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18135 // PINSR* works with constant index.
18136 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18137 return Op;
18140 return SDValue();
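// Lower SCALAR_TO_VECTOR: a zero scalar becomes a zero vector, wide results
// are built in 128 bits and inserted into an undef vector, and small integer
// elements are any-extended to i32 so the v4i32 pattern can match.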
18143 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
18144 SelectionDAG &DAG) {
18145 SDLoc dl(Op);
18146 MVT OpVT = Op.getSimpleValueType();
18148 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
18149 // further combines.
18150 if (X86::isZeroNode(Op.getOperand(0)))
18151 return getZeroVector(OpVT, Subtarget, DAG, dl);
18153 // If this is a 256-bit vector result, first insert into a 128-bit
18154 // vector and then insert into the 256-bit vector.
18155 if (!OpVT.is128BitVector()) {
18156 // Insert into a 128-bit vector.
18157 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18158 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
18159 OpVT.getVectorNumElements() / SizeFactor);
18161 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18163 // Insert the 128-bit vector.
18164 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18166 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18167 "Expected an SSE type!");
18169 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
18170 // tblgen.
18171 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
18172 return Op;
18174 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18175 return DAG.getBitcast(
18176 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18179 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18180 // simple superregister reference or explicit instructions to insert
18181 // the upper bits of a vector.
18182 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18183 SelectionDAG &DAG) {
18184 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18186 return insert1BitVector(Op, DAG, Subtarget);
18189 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18190 SelectionDAG &DAG) {
18191 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18192 "Only vXi1 extract_subvectors need custom lowering");
18194 SDLoc dl(Op);
18195 SDValue Vec = Op.getOperand(0);
18196 uint64_t IdxVal = Op.getConstantOperandVal(1);
18198 if (IdxVal == 0) // the operation is legal
18199 return Op;
18201 // Extend to natively supported kshift.
18202 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18204 // Shift to the LSB.
18205 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18206 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18208 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18209 DAG.getIntPtrConstant(0, dl));
18212 // Returns the appropriate wrapper opcode for a global reference.
18213 unsigned X86TargetLowering::getGlobalWrapperKind(
18214 const GlobalValue *GV, const unsigned char OpFlags) const {
18215 // References to absolute symbols are never PC-relative.
18216 if (GV && GV->isAbsoluteSymbolRef())
18217 return X86ISD::Wrapper;
18219 // The following OpFlags under RIP-rel PIC use RIP.
18220 if (Subtarget.isPICStyleRIPRel() &&
18221 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
18222 OpFlags == X86II::MO_DLLIMPORT))
18223 return X86ISD::WrapperRIP;
18225 // In the medium model, functions can always be referenced RIP-relatively,
18226 // since they must be within 2GiB. This is also possible in non-PIC mode, and
18227 // shorter than the 64-bit absolute immediate that would otherwise be emitted.
18228 if (getTargetMachine().getCodeModel() == CodeModel::Medium &&
18229 isa_and_nonnull<Function>(GV))
18230 return X86ISD::WrapperRIP;
18232 // GOTPCREL references must always use RIP.
18233 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
18234 return X86ISD::WrapperRIP;
18236 return X86ISD::Wrapper;
18239 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18240 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
18241 // one of the above mentioned nodes. It has to be wrapped because otherwise
18242 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18243 // be used to form addressing mode. These wrapped nodes will be selected
18244 // into MOV32ri.
18245 SDValue
18246 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18247 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18249 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18250 // global base reg.
18251 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18253 auto PtrVT = getPointerTy(DAG.getDataLayout());
18254 SDValue Result = DAG.getTargetConstantPool(
18255 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18256 SDLoc DL(CP);
18257 Result =
18258 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18259 // With PIC, the address is actually $g + Offset.
18260 if (OpFlag) {
18261 Result =
18262 DAG.getNode(ISD::ADD, DL, PtrVT,
18263 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18266 return Result;
18269 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18270 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18272 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18273 // global base reg.
18274 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18276 auto PtrVT = getPointerTy(DAG.getDataLayout());
18277 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18278 SDLoc DL(JT);
18279 Result =
18280 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18282 // With PIC, the address is actually $g + Offset.
18283 if (OpFlag)
18284 Result =
18285 DAG.getNode(ISD::ADD, DL, PtrVT,
18286 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18288 return Result;
18291 SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
18292 SelectionDAG &DAG) const {
18293 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18296 SDValue
18297 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
18298 // Create the TargetBlockAddress node.
18299 unsigned char OpFlags =
18300 Subtarget.classifyBlockAddressReference();
18301 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18302 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18303 SDLoc dl(Op);
18304 auto PtrVT = getPointerTy(DAG.getDataLayout());
18305 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
18306 Result =
18307 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
18309 // With PIC, the address is actually $g + Offset.
18310 if (isGlobalRelativeToPICBase(OpFlags)) {
18311 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18312 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18315 return Result;
18318 /// Creates target global address or external symbol nodes for calls or
18319 /// other uses.
18320 SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
18321 bool ForCall) const {
18322 // Unpack the global address or external symbol.
18323 const SDLoc &dl = SDLoc(Op);
18324 const GlobalValue *GV = nullptr;
18325 int64_t Offset = 0;
18326 const char *ExternalSym = nullptr;
18327 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18328 GV = G->getGlobal();
18329 Offset = G->getOffset();
18330 } else {
18331 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18332 ExternalSym = ES->getSymbol();
18335 // Calculate some flags for address lowering.
18336 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
18337 unsigned char OpFlags;
18338 if (ForCall)
18339 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
18340 else
18341 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
18342 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
18343 bool NeedsLoad = isGlobalStubReference(OpFlags);
18345 CodeModel::Model M = DAG.getTarget().getCodeModel();
18346 auto PtrVT = getPointerTy(DAG.getDataLayout());
18347 SDValue Result;
18349 if (GV) {
18350 // Create a target global address if this is a global. If possible, fold the
18351 // offset into the global address reference. Otherwise, ADD it on later.
18352 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
18353 // allowed because if the address of foo is 0, the ELF R_X86_64_32
18354 // relocation will compute to a negative value, which is invalid.
18355 int64_t GlobalOffset = 0;
18356 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
18357 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
18358 std::swap(GlobalOffset, Offset);
18360 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
18361 } else {
18362 // If this is not a global address, this must be an external symbol.
18363 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
18366 // If this is a direct call, avoid the wrapper if we don't need to do any
18367 // loads or adds. This allows SDAG ISel to match direct calls.
18368 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
18369 return Result;
18371 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
18373 // With PIC, the address is actually $g + Offset.
18374 if (HasPICReg) {
18375 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18376 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18379 // For globals that require a load from a stub to get the address, emit the
18380 // load.
18381 if (NeedsLoad)
18382 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
18383 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18385 // If there was a non-zero offset that we didn't fold, create an explicit
18386 // addition for it.
18387 if (Offset != 0)
18388 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
18389 DAG.getConstant(Offset, dl, PtrVT));
18391 return Result;
18394 SDValue
18395 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
18396 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
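// Emit an X86ISD::TLSADDR (or TLSBASEADDR for local-dynamic) node for GA and
// return the value that the TLS runtime call leaves in ReturnReg.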
18399 static SDValue
18400 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
18401 SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
18402 unsigned char OperandFlags, bool LocalDynamic = false) {
18403 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18404 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18405 SDLoc dl(GA);
18406 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18407 GA->getValueType(0),
18408 GA->getOffset(),
18409 OperandFlags);
18411 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
18412 : X86ISD::TLSADDR;
18414 if (InGlue) {
18415 SDValue Ops[] = { Chain, TGA, *InGlue };
18416 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18417 } else {
18418 SDValue Ops[] = { Chain, TGA };
18419 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18422 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
18423 MFI.setAdjustsStack(true);
18424 MFI.setHasCalls(true);
18426 SDValue Glue = Chain.getValue(1);
18427 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
18430 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
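// The TLSADDR node built above is eventually expanded into the standard
// __tls_get_addr call sequence. Roughly, for the 32-bit general-dynamic case
// (illustrative only; the exact encoding is chosen during MC lowering):
//   leal x@tlsgd(,%ebx,1), %eax
//   call ___tls_get_addr@PLT      // address of x is returned in %eax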
18431 static SDValue
18432 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18433 const EVT PtrVT) {
18434 SDValue InGlue;
18435 SDLoc dl(GA); // ? function entry point might be better
18436 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18437 DAG.getNode(X86ISD::GlobalBaseReg,
18438 SDLoc(), PtrVT), InGlue);
18439 InGlue = Chain.getValue(1);
18441 return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
18444 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
18445 static SDValue
18446 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18447 const EVT PtrVT) {
18448 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18449 X86::RAX, X86II::MO_TLSGD);
18452 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
18453 static SDValue
18454 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18455 const EVT PtrVT) {
18456 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18457 X86::EAX, X86II::MO_TLSGD);
18460 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
18461 SelectionDAG &DAG, const EVT PtrVT,
18462 bool Is64Bit, bool Is64BitLP64) {
18463 SDLoc dl(GA);
18465 // Get the start address of the TLS block for this module.
18466 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
18467 .getInfo<X86MachineFunctionInfo>();
18468 MFI->incNumLocalDynamicTLSAccesses();
18470 SDValue Base;
18471 if (Is64Bit) {
18472 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
18473 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
18474 X86II::MO_TLSLD, /*LocalDynamic=*/true);
18475 } else {
18476 SDValue InGlue;
18477 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18478 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
18479 InGlue = Chain.getValue(1);
18480 Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
18481 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
18484 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
18485 // of Base.
18487 // Build x@dtpoff.
18488 unsigned char OperandFlags = X86II::MO_DTPOFF;
18489 unsigned WrapperKind = X86ISD::Wrapper;
18490 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18491 GA->getValueType(0),
18492 GA->getOffset(), OperandFlags);
18493 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18495 // Add x@dtpoff with the base.
18496 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
18499 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
18500 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18501 const EVT PtrVT, TLSModel::Model model,
18502 bool is64Bit, bool isPIC) {
18503 SDLoc dl(GA);
18505 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
18506 Value *Ptr = Constant::getNullValue(
18507 PointerType::get(*DAG.getContext(), is64Bit ? 257 : 256));
18509 SDValue ThreadPointer =
18510 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18511 MachinePointerInfo(Ptr));
18513 unsigned char OperandFlags = 0;
18514 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
18515 // initial exec.
18516 unsigned WrapperKind = X86ISD::Wrapper;
18517 if (model == TLSModel::LocalExec) {
18518 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
18519 } else if (model == TLSModel::InitialExec) {
18520 if (is64Bit) {
18521 OperandFlags = X86II::MO_GOTTPOFF;
18522 WrapperKind = X86ISD::WrapperRIP;
18523 } else {
18524 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
18526 } else {
18527 llvm_unreachable("Unexpected model");
18530 // emit "addl x@ntpoff,%eax" (local exec)
18531 // or "addl x@indntpoff,%eax" (initial exec)
18532 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
18533 SDValue TGA =
18534 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18535 GA->getOffset(), OperandFlags);
18536 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18538 if (model == TLSModel::InitialExec) {
18539 if (isPIC && !is64Bit) {
18540 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
18541 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18542 Offset);
18545 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
18546 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18549 // The address of the thread local variable is the add of the thread
18550 // pointer with the offset of the variable.
18551 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
18554 SDValue
18555 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
18557 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
18559 if (DAG.getTarget().useEmulatedTLS())
18560 return LowerToTLSEmulatedModel(GA, DAG);
18562 const GlobalValue *GV = GA->getGlobal();
18563 auto PtrVT = getPointerTy(DAG.getDataLayout());
18564 bool PositionIndependent = isPositionIndependent();
18566 if (Subtarget.isTargetELF()) {
18567 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
18568 switch (model) {
18569 case TLSModel::GeneralDynamic:
18570 if (Subtarget.is64Bit()) {
18571 if (Subtarget.isTarget64BitLP64())
18572 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
18573 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
18575 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
18576 case TLSModel::LocalDynamic:
18577 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
18578 Subtarget.isTarget64BitLP64());
18579 case TLSModel::InitialExec:
18580 case TLSModel::LocalExec:
18581 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
18582 PositionIndependent);
18584 llvm_unreachable("Unknown TLS model.");
18587 if (Subtarget.isTargetDarwin()) {
18588 // Darwin only has one model of TLS. Lower to that.
18589 unsigned char OpFlag = 0;
18590 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
18591 X86ISD::WrapperRIP : X86ISD::Wrapper;
18593 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18594 // global base reg.
18595 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
18596 if (PIC32)
18597 OpFlag = X86II::MO_TLVP_PIC_BASE;
18598 else
18599 OpFlag = X86II::MO_TLVP;
18600 SDLoc DL(Op);
18601 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
18602 GA->getValueType(0),
18603 GA->getOffset(), OpFlag);
18604 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
18606 // With PIC32, the address is actually $g + Offset.
18607 if (PIC32)
18608 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
18609 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18610 Offset);
18612 // Lowering the machine isd will make sure everything is in the right
18613 // location.
18614 SDValue Chain = DAG.getEntryNode();
18615 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18616 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
18617 SDValue Args[] = { Chain, Offset };
18618 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
18619 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
18621 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
18622 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18623 MFI.setAdjustsStack(true);
18625 // And our return value (tls address) is in the standard call return value
18626 // location.
18627 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
18628 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
18631 if (Subtarget.isOSWindows()) {
18632 // Just use the implicit TLS architecture
18633 // Need to generate something similar to:
18634 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
18635 // ; from TEB
18636 // mov ecx, dword [rel _tls_index] ; Load index (from C runtime)
18637 // mov rcx, qword [rdx+rcx*8]
18638 // mov eax, .tls$:tlsvar
18639 // [rax+rcx] contains the address
18640 // Windows 64bit: gs:0x58
18641 // Windows 32bit: fs:__tls_array
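// In C-like terms, the sequence above computes (illustrative sketch only,
// not the actual lowering):
//   char **TlsArray = (char **)__readgsqword(0x58); // fs:[__tls_array] on x86
//   char *TlsBase   = TlsArray[_tls_index];         // this module's TLS block
//   result          = TlsBase + SECREL32(tlsvar);   // section-relative offset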
18643 SDLoc dl(GA);
18644 SDValue Chain = DAG.getEntryNode();
18646 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
18647 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
18648 // use its literal value of 0x2C.
18649 Value *Ptr = Constant::getNullValue(
18650 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), 256)
18651 : PointerType::get(*DAG.getContext(), 257));
18653 SDValue TlsArray = Subtarget.is64Bit()
18654 ? DAG.getIntPtrConstant(0x58, dl)
18655 : (Subtarget.isTargetWindowsGNU()
18656 ? DAG.getIntPtrConstant(0x2C, dl)
18657 : DAG.getExternalSymbol("_tls_array", PtrVT));
18659 SDValue ThreadPointer =
18660 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
18662 SDValue res;
18663 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
18664 res = ThreadPointer;
18665 } else {
18666 // Load the _tls_index variable
18667 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
18668 if (Subtarget.is64Bit())
18669 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
18670 MachinePointerInfo(), MVT::i32);
18671 else
18672 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
18674 const DataLayout &DL = DAG.getDataLayout();
18675 SDValue Scale =
18676 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
18677 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
18679 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
18682 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
18684 // Get the offset of start of .tls section
18685 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18686 GA->getValueType(0),
18687 GA->getOffset(), X86II::MO_SECREL);
18688 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
18690 // The address of the thread local variable is the add of the thread
18691 // pointer with the offset of the variable.
18692 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
18695 llvm_unreachable("TLS not implemented for this target.");
18698 /// Lower SRA_PARTS and friends, which return two i32 values
18699 /// and take a 2 x i32 value to shift plus a shift amount.
18700 /// TODO: Can this be moved to general expansion code?
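/// Conceptually, for SRA_PARTS of a 64-bit value (Hi:Lo) by Amt (illustrative
/// sketch only; expandShiftParts emits a branchless, select-based form):
///   if (Amt & 32) { Lo = (int32_t)Hi >> (Amt & 31); Hi = (int32_t)Hi >> 31; }
///   else if (Amt) { Lo = (Lo >> Amt) | (Hi << (32 - Amt)); Hi = (int32_t)Hi >> Amt; }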
18701 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
18702 SDValue Lo, Hi;
18703 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
18704 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
18707 // Try to use a packed vector operation to handle i64 on 32-bit targets when
18708 // AVX512DQ is enabled.
18709 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
18710 const X86Subtarget &Subtarget) {
18711 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
18712 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
18713 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
18714 Op.getOpcode() == ISD::UINT_TO_FP) &&
18715 "Unexpected opcode!");
18716 bool IsStrict = Op->isStrictFPOpcode();
18717 unsigned OpNo = IsStrict ? 1 : 0;
18718 SDValue Src = Op.getOperand(OpNo);
18719 MVT SrcVT = Src.getSimpleValueType();
18720 MVT VT = Op.getSimpleValueType();
18722 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
18723 (VT != MVT::f32 && VT != MVT::f64))
18724 return SDValue();
18726 // Pack the i64 into a vector, do the operation and extract.
18728 // Using 256-bit to ensure result is 128-bits for f32 case.
18729 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
18730 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
18731 MVT VecVT = MVT::getVectorVT(VT, NumElts);
18733 SDLoc dl(Op);
18734 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
18735 if (IsStrict) {
18736 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
18737 {Op.getOperand(0), InVec});
18738 SDValue Chain = CvtVec.getValue(1);
18739 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
18740 DAG.getIntPtrConstant(0, dl));
18741 return DAG.getMergeValues({Value, Chain}, dl);
18744 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
18746 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
18747 DAG.getIntPtrConstant(0, dl));
18750 // Try to use a packed vector operation to handle i64 on 32-bit targets.
18751 static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
18752 const X86Subtarget &Subtarget) {
18753 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
18754 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
18755 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
18756 Op.getOpcode() == ISD::UINT_TO_FP) &&
18757 "Unexpected opcode!");
18758 bool IsStrict = Op->isStrictFPOpcode();
18759 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
18760 MVT SrcVT = Src.getSimpleValueType();
18761 MVT VT = Op.getSimpleValueType();
18763 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
18764 return SDValue();
18766 // Pack the i64 into a vector, do the operation and extract.
18768 assert(Subtarget.hasFP16() && "Expected FP16");
18770 SDLoc dl(Op);
18771 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
18772 if (IsStrict) {
18773 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
18774 {Op.getOperand(0), InVec});
18775 SDValue Chain = CvtVec.getValue(1);
18776 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
18777 DAG.getIntPtrConstant(0, dl));
18778 return DAG.getMergeValues({Value, Chain}, dl);
18781 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
18783 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
18784 DAG.getIntPtrConstant(0, dl));
18787 static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
18788 const X86Subtarget &Subtarget) {
18789 switch (Opcode) {
18790 case ISD::SINT_TO_FP:
18791 // TODO: Handle wider types with AVX/AVX512.
18792 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
18793 return false;
18794 // CVTDQ2PS or (V)CVTDQ2PD
18795 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
18797 case ISD::UINT_TO_FP:
18798 // TODO: Handle wider types and i64 elements.
18799 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
18800 return false;
18801 // VCVTUDQ2PS or VCVTUDQ2PD
18802 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
18804 default:
18805 return false;
18809 /// Given a scalar cast operation that is extracted from a vector, try to
18810 /// vectorize the cast op followed by extraction. This will avoid an expensive
18811 /// round-trip between XMM and GPR.
18812 static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
18813 const X86Subtarget &Subtarget) {
18814 // TODO: This could be enhanced to handle smaller integer types by peeking
18815 // through an extend.
18816 SDValue Extract = Cast.getOperand(0);
18817 MVT DestVT = Cast.getSimpleValueType();
18818 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
18819 !isa<ConstantSDNode>(Extract.getOperand(1)))
18820 return SDValue();
18822 // See if we have a 128-bit vector cast op for this type of cast.
18823 SDValue VecOp = Extract.getOperand(0);
18824 MVT FromVT = VecOp.getSimpleValueType();
18825 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
18826 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
18827 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
18828 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
18829 return SDValue();
18831 // If we are extracting from a non-zero element, first shuffle the source
18832 // vector to allow extracting from element zero.
18833 SDLoc DL(Cast);
18834 if (!isNullConstant(Extract.getOperand(1))) {
18835 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
18836 Mask[0] = Extract.getConstantOperandVal(1);
18837 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
18839 // If the source vector is wider than 128 bits, extract the low part. Do not
18840 // create an unnecessarily wide vector cast op.
18841 if (FromVT != Vec128VT)
18842 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
18844 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
18845 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
18846 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
18847 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
18848 DAG.getIntPtrConstant(0, DL));
18851 /// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
18852 /// try to vectorize the cast ops. This will avoid an expensive round-trip
18853 /// between XMM and GPR.
18854 static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
18855 const X86Subtarget &Subtarget) {
18856 // TODO: Allow FP_TO_UINT.
18857 SDValue CastToInt = CastToFP.getOperand(0);
18858 MVT VT = CastToFP.getSimpleValueType();
18859 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
18860 return SDValue();
18862 MVT IntVT = CastToInt.getSimpleValueType();
18863 SDValue X = CastToInt.getOperand(0);
18864 MVT SrcVT = X.getSimpleValueType();
18865 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
18866 return SDValue();
18868 // See if we have 128-bit vector cast instructions for this type of cast.
18869 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
18870 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
18871 IntVT != MVT::i32)
18872 return SDValue();
18874 unsigned SrcSize = SrcVT.getSizeInBits();
18875 unsigned IntSize = IntVT.getSizeInBits();
18876 unsigned VTSize = VT.getSizeInBits();
18877 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
18878 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
18879 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
18881 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
18882 unsigned ToIntOpcode =
18883 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
18884 unsigned ToFPOpcode =
18885 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
18887 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
18889 // We are not defining the high elements (for example, zeroing them) because
18890 // that could nullify any performance advantage that we hoped to gain from
18891 // this vector op hack. We do not expect any adverse effects (like denorm
18892 // penalties) with cast ops.
18893 SDLoc DL(CastToFP);
18894 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
18895 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
18896 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
18897 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
18898 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
18901 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
18902 const X86Subtarget &Subtarget) {
18903 SDLoc DL(Op);
18904 bool IsStrict = Op->isStrictFPOpcode();
18905 MVT VT = Op->getSimpleValueType(0);
18906 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
18908 if (Subtarget.hasDQI()) {
18909 assert(!Subtarget.hasVLX() && "Unexpected features");
18911 assert((Src.getSimpleValueType() == MVT::v2i64 ||
18912 Src.getSimpleValueType() == MVT::v4i64) &&
18913 "Unsupported custom type");
18915 // With AVX512DQ but not VLX, we need to widen to get a 512-bit result type.
18916 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
18917 "Unexpected VT!");
18918 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
18920 // Need to concat with zero vector for strict fp to avoid spurious
18921 // exceptions.
18922 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
18923 : DAG.getUNDEF(MVT::v8i64);
18924 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
18925 DAG.getIntPtrConstant(0, DL));
18926 SDValue Res, Chain;
18927 if (IsStrict) {
18928 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
18929 {Op->getOperand(0), Src});
18930 Chain = Res.getValue(1);
18931 } else {
18932 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
18935 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18936 DAG.getIntPtrConstant(0, DL));
18938 if (IsStrict)
18939 return DAG.getMergeValues({Res, Chain}, DL);
18940 return Res;
18943 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
18944 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
18945 if (VT != MVT::v4f32 || IsSigned)
18946 return SDValue();
18948 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
18949 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
18950 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
18951 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
18952 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
18953 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
18954 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
18955 SmallVector<SDValue, 4> SignCvts(4);
18956 SmallVector<SDValue, 4> Chains(4);
18957 for (int i = 0; i != 4; ++i) {
18958 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
18959 DAG.getIntPtrConstant(i, DL));
18960 if (IsStrict) {
18961 SignCvts[i] =
18962 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
18963 {Op.getOperand(0), Elt});
18964 Chains[i] = SignCvts[i].getValue(1);
18965 } else {
18966 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
18969 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
18971 SDValue Slow, Chain;
18972 if (IsStrict) {
18973 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18974 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
18975 {Chain, SignCvt, SignCvt});
18976 Chain = Slow.getValue(1);
18977 } else {
18978 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
18981 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
18982 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
18984 if (IsStrict)
18985 return DAG.getMergeValues({Cvt, Chain}, DL);
18987 return Cvt;
18990 static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
18991 bool IsStrict = Op->isStrictFPOpcode();
18992 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
18993 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
18994 MVT VT = Op.getSimpleValueType();
18995 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
18996 SDLoc dl(Op);
18998 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
18999 if (IsStrict)
19000 return DAG.getNode(
19001 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
19002 {Chain,
19003 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
19004 Rnd});
19005 return DAG.getNode(ISD::FP_ROUND, dl, VT,
19006 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
19009 static bool isLegalConversion(MVT VT, bool IsSigned,
19010 const X86Subtarget &Subtarget) {
19011 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
19012 return true;
19013 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
19014 return true;
19015 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
19016 return true;
19017 if (Subtarget.useAVX512Regs()) {
19018 if (VT == MVT::v16i32)
19019 return true;
19020 if (VT == MVT::v8i64 && Subtarget.hasDQI())
19021 return true;
19023 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
19024 (VT == MVT::v2i64 || VT == MVT::v4i64))
19025 return true;
19026 return false;
19029 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19030 SelectionDAG &DAG) const {
19031 bool IsStrict = Op->isStrictFPOpcode();
19032 unsigned OpNo = IsStrict ? 1 : 0;
19033 SDValue Src = Op.getOperand(OpNo);
19034 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19035 MVT SrcVT = Src.getSimpleValueType();
19036 MVT VT = Op.getSimpleValueType();
19037 SDLoc dl(Op);
19039 if (isSoftF16(VT, Subtarget))
19040 return promoteXINT_TO_FP(Op, DAG);
19041 else if (isLegalConversion(SrcVT, true, Subtarget))
19042 return Op;
19044 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19045 return LowerWin64_INT128_TO_FP(Op, DAG);
19047 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
19048 return Extract;
19050 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
19051 return R;
19053 if (SrcVT.isVector()) {
19054 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19055 // Note: Since v2f64 is a legal type, we don't need to zero extend the
19056 // source for strict FP.
19057 if (IsStrict)
19058 return DAG.getNode(
19059 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19060 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19061 DAG.getUNDEF(SrcVT))});
19062 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19063 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19064 DAG.getUNDEF(SrcVT)));
19066 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19067 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
19069 return SDValue();
19072 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19073 "Unknown SINT_TO_FP to lower!");
19075 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19077 // These are really Legal; return the operand so the caller accepts it as
19078 // Legal.
19079 if (SrcVT == MVT::i32 && UseSSEReg)
19080 return Op;
19081 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19082 return Op;
19084 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
19085 return V;
19086 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
19087 return V;
19089 // SSE doesn't have an i16 conversion so we need to promote.
19090 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19091 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19092 if (IsStrict)
19093 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19094 {Chain, Ext});
19096 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19099 if (VT == MVT::f128 || !Subtarget.hasX87())
19100 return SDValue();
19102 SDValue ValueToStore = Src;
19103 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19104 // Bitcasting to f64 here allows us to do a single 64-bit store from
19105 // an SSE register, avoiding the store forwarding penalty that would come
19106 // with two 32-bit stores.
19107 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19109 unsigned Size = SrcVT.getStoreSize();
19110 Align Alignment(Size);
19111 MachineFunction &MF = DAG.getMachineFunction();
19112 auto PtrVT = getPointerTy(MF.getDataLayout());
19113 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
19114 MachinePointerInfo MPI =
19115 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19116 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19117 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
19118 std::pair<SDValue, SDValue> Tmp =
19119 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
19121 if (IsStrict)
19122 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19124 return Tmp.first;
19127 std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
19128 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
19129 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
19130 // Build the FILD
19131 SDVTList Tys;
19132 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
19133 if (useSSE)
19134 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19135 else
19136 Tys = DAG.getVTList(DstVT, MVT::Other);
19138 SDValue FILDOps[] = {Chain, Pointer};
19139 SDValue Result =
19140 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
19141 Alignment, MachineMemOperand::MOLoad);
19142 Chain = Result.getValue(1);
19144 if (useSSE) {
19145 MachineFunction &MF = DAG.getMachineFunction();
19146 unsigned SSFISize = DstVT.getStoreSize();
19147 int SSFI =
19148 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
19149 auto PtrVT = getPointerTy(MF.getDataLayout());
19150 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19151 Tys = DAG.getVTList(MVT::Other);
19152 SDValue FSTOps[] = {Chain, Result, StackSlot};
19153 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
19154 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
19155 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
19157 Chain =
19158 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
19159 Result = DAG.getLoad(
19160 DstVT, DL, Chain, StackSlot,
19161 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
19162 Chain = Result.getValue(1);
19165 return { Result, Chain };
19168 /// Horizontal vector math instructions may be slower than normal math with
19169 /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19170 /// implementation, and likely shuffle complexity of the alternate sequence.
19171 static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19172 const X86Subtarget &Subtarget) {
19173 bool IsOptimizingSize = DAG.shouldOptForSize();
19174 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19175 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19178 /// 64-bit unsigned integer to double expansion.
19179 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
19180 const X86Subtarget &Subtarget) {
19181 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
19182 // when converting 0 while rounding toward negative infinity. The caller will
19183 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
19184 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19185 // This algorithm is not obvious. Here is what we're trying to output:
19186 /*
19187 movq %rax, %xmm0
19188 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19189 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19190 #ifdef __SSE3__
19191 haddpd %xmm0, %xmm0
19192 #else
19193 pshufd $0x4e, %xmm0, %xmm1
19194 addpd %xmm1, %xmm0
19195 #endif
19196 */
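// Illustrative arithmetic behind the constants (not part of the lowering):
// 0x43300000 and 0x45300000 are the high words of the doubles 2^52 and 2^84,
// so after the unpack the two lanes hold the doubles 2^52 + lo32(x) and
// 2^84 + hi32(x)*2^32. Subtracting c1 = { 2^52, 2^84 } leaves lo32(x) and
// hi32(x)*2^32 exactly, and the horizontal add recombines them, e.g.
// x = 0x100000002 -> 2.0 + 4294967296.0 = 4294967298.0.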
19198 SDLoc dl(Op);
19199 LLVMContext *Context = DAG.getContext();
19201 // Build some magic constants.
19202 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19203 Constant *C0 = ConstantDataVector::get(*Context, CV0);
19204 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19205 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
19207 SmallVector<Constant*,2> CV1;
19208 CV1.push_back(
19209 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19210 APInt(64, 0x4330000000000000ULL))));
19211 CV1.push_back(
19212 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19213 APInt(64, 0x4530000000000000ULL))));
19214 Constant *C1 = ConstantVector::get(CV1);
19215 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
19217 // Load the 64-bit value into an XMM register.
19218 SDValue XR1 =
19219 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
19220 SDValue CLod0 = DAG.getLoad(
19221 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
19222 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19223 SDValue Unpck1 =
19224 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19226 SDValue CLod1 = DAG.getLoad(
19227 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
19228 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19229 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19230 // TODO: Are there any fast-math-flags to propagate here?
19231 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19232 SDValue Result;
19234 if (Subtarget.hasSSE3() &&
19235 shouldUseHorizontalOp(true, DAG, Subtarget)) {
19236 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19237 } else {
19238 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19239 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19241 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19242 DAG.getIntPtrConstant(0, dl));
19243 return Result;
19246 /// 32-bit unsigned integer to float expansion.
19247 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
19248 const X86Subtarget &Subtarget) {
19249 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19250 SDLoc dl(Op);
19251 // FP constant to bias correct the final result.
19252 SDValue Bias = DAG.getConstantFP(
19253 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
19255 // Load the 32-bit value into an XMM register.
19256 SDValue Load =
19257 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
19259 // Zero out the upper parts of the register.
19260 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
19262 // Or the load with the bias.
19263 SDValue Or = DAG.getNode(
19264 ISD::OR, dl, MVT::v2i64,
19265 DAG.getBitcast(MVT::v2i64, Load),
19266 DAG.getBitcast(MVT::v2i64,
19267 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
19268 Or =
19269 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
19270 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
19272 if (Op.getNode()->isStrictFPOpcode()) {
19273 // Subtract the bias.
19274 // TODO: Are there any fast-math-flags to propagate here?
19275 SDValue Chain = Op.getOperand(0);
19276 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
19277 {Chain, Or, Bias});
19279 if (Op.getValueType() == Sub.getValueType())
19280 return Sub;
19282 // Handle final rounding.
19283 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
19284 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
19286 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
19289 // Subtract the bias.
19290 // TODO: Are there any fast-math-flags to propagate here?
19291 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
19293 // Handle final rounding.
19294 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
19297 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
19298 const X86Subtarget &Subtarget,
19299 const SDLoc &DL) {
19300 if (Op.getSimpleValueType() != MVT::v2f64)
19301 return SDValue();
19303 bool IsStrict = Op->isStrictFPOpcode();
19305 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
19306 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
19308 if (Subtarget.hasAVX512()) {
19309 if (!Subtarget.hasVLX()) {
19310 // Let generic type legalization widen this.
19311 if (!IsStrict)
19312 return SDValue();
19313 // Otherwise pad the integer input with 0s and widen the operation.
19314 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19315 DAG.getConstant(0, DL, MVT::v2i32));
19316 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
19317 {Op.getOperand(0), N0});
19318 SDValue Chain = Res.getValue(1);
19319 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
19320 DAG.getIntPtrConstant(0, DL));
19321 return DAG.getMergeValues({Res, Chain}, DL);
19324 // Legalize to v4i32 type.
19325 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19326 DAG.getUNDEF(MVT::v2i32));
19327 if (IsStrict)
19328 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
19329 {Op.getOperand(0), N0});
19330 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
19333 // Zero extend to 2i64, OR with the floating point representation of 2^52.
19334 // This gives us the floating point equivalent of 2^52 + the i32 integer
19335 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
19336 // point leaving just our i32 integers in double format.
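// For example (illustrative only): with an input lane of 7, the OR produces
// the bit pattern 0x4330000000000007, i.e. the double 2^52 + 7 =
// 4503599627370503.0, and subtracting the 2^52 bias leaves exactly 7.0.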
19337 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
19338 SDValue VBias = DAG.getConstantFP(
19339 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
19340 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
19341 DAG.getBitcast(MVT::v2i64, VBias));
19342 Or = DAG.getBitcast(MVT::v2f64, Or);
19344 if (IsStrict)
19345 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
19346 {Op.getOperand(0), Or, VBias});
19347 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
19350 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
19351 const X86Subtarget &Subtarget) {
19352 SDLoc DL(Op);
19353 bool IsStrict = Op->isStrictFPOpcode();
19354 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
19355 MVT VecIntVT = V.getSimpleValueType();
19356 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
19357 "Unsupported custom type");
19359 if (Subtarget.hasAVX512()) {
19360 // With AVX512 but not VLX, we need to widen to get a 512-bit result type.
19361 assert(!Subtarget.hasVLX() && "Unexpected features");
19362 MVT VT = Op->getSimpleValueType(0);
19364 // v8i32->v8f64 is legal with AVX512 so just return it.
19365 if (VT == MVT::v8f64)
19366 return Op;
19368 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
19369 "Unexpected VT!");
19370 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
19371 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
19372 // Need to concat with zero vector for strict fp to avoid spurious
19373 // exceptions.
19374 SDValue Tmp =
19375 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
19376 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
19377 DAG.getIntPtrConstant(0, DL));
19378 SDValue Res, Chain;
19379 if (IsStrict) {
19380 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
19381 {Op->getOperand(0), V});
19382 Chain = Res.getValue(1);
19383 } else {
19384 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
19387 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19388 DAG.getIntPtrConstant(0, DL));
19390 if (IsStrict)
19391 return DAG.getMergeValues({Res, Chain}, DL);
19392 return Res;
19395 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
19396 Op->getSimpleValueType(0) == MVT::v4f64) {
19397 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
19398 Constant *Bias = ConstantFP::get(
19399 *DAG.getContext(),
19400 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
19401 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19402 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
19403 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
19404 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
19405 SDValue VBias = DAG.getMemIntrinsicNode(
19406 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
19407 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
19408 MachineMemOperand::MOLoad);
19410 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
19411 DAG.getBitcast(MVT::v4i64, VBias));
19412 Or = DAG.getBitcast(MVT::v4f64, Or);
19414 if (IsStrict)
19415 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
19416 {Op.getOperand(0), Or, VBias});
19417 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
19420 // The algorithm is the following:
19421 // #ifdef __SSE4_1__
19422 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19423 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19424 // (uint4) 0x53000000, 0xaa);
19425 // #else
19426 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19427 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19428 // #endif
19429 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19430 // return (float4) lo + fhi;
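// Illustrative reading of the constants (not part of the lowering):
// 0x4b000000 is the float 2^23 and 0x53000000 is the float 2^39, so after the
// blend/or steps lo == 2^23 + (v & 0xffff) and hi == 2^39 + (v >> 16) * 2^16
// bit-for-bit. Then fhi == (v >> 16) * 2^16 - 2^23 and lo + fhi == v, with only
// the final add possibly rounding (0x53000080 below encodes 2^39 + 2^23).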
19432 bool Is128 = VecIntVT == MVT::v4i32;
19433 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
19434 // If we convert to something else than the supported type, e.g., to v4f64,
19435 // abort early.
19436 if (VecFloatVT != Op->getSimpleValueType(0))
19437 return SDValue();
19439 // In the #ifdef/#else code, we have in common:
19440 // - The vector of constants:
19441 // -- 0x4b000000
19442 // -- 0x53000000
19443 // - A shift:
19444 // -- v >> 16
19446 // Create the splat vector for 0x4b000000.
19447 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
19448 // Create the splat vector for 0x53000000.
19449 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
19451 // Create the right shift.
19452 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
19453 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
19455 SDValue Low, High;
19456 if (Subtarget.hasSSE41()) {
19457 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
19458 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19459 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
19460 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
19461 // Low will be bitcasted right away, so do not bother bitcasting back to its
19462 // original type.
19463 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
19464 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19465 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19466 // (uint4) 0x53000000, 0xaa);
19467 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
19468 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
19469 // High will be bitcasted right away, so do not bother bitcasting back to
19470 // its original type.
19471 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
19472 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19473 } else {
19474 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
19475 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19476 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
19477 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
19479 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19480 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
19483 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
19484 SDValue VecCstFSub = DAG.getConstantFP(
19485 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
19487 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19488 // NOTE: By using fsub of a positive constant instead of fadd of a negative
19489 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
19490 // enabled. See PR24512.
19491 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
19492 // TODO: Are there any fast-math-flags to propagate here?
19493 // (float4) lo;
19494 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
19495 // return (float4) lo + fhi;
19496 if (IsStrict) {
19497 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
19498 {Op.getOperand(0), HighBitcast, VecCstFSub});
19499 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
19500 {FHigh.getValue(1), LowBitcast, FHigh});
19503 SDValue FHigh =
19504 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
19505 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
19508 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
19509 const X86Subtarget &Subtarget) {
19510 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19511 SDValue N0 = Op.getOperand(OpNo);
19512 MVT SrcVT = N0.getSimpleValueType();
19513 SDLoc dl(Op);
19515 switch (SrcVT.SimpleTy) {
19516 default:
19517 llvm_unreachable("Custom UINT_TO_FP is not supported!");
19518 case MVT::v2i32:
19519 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
19520 case MVT::v4i32:
19521 case MVT::v8i32:
19522 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
19523 case MVT::v2i64:
19524 case MVT::v4i64:
19525 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
19529 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
19530 SelectionDAG &DAG) const {
19531 bool IsStrict = Op->isStrictFPOpcode();
19532 unsigned OpNo = IsStrict ? 1 : 0;
19533 SDValue Src = Op.getOperand(OpNo);
19534 SDLoc dl(Op);
19535 auto PtrVT = getPointerTy(DAG.getDataLayout());
19536 MVT SrcVT = Src.getSimpleValueType();
19537 MVT DstVT = Op->getSimpleValueType(0);
19538 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
19540 // Bail out when we don't have native conversion instructions.
19541 if (DstVT == MVT::f128)
19542 return SDValue();
19544 if (isSoftF16(DstVT, Subtarget))
19545 return promoteXINT_TO_FP(Op, DAG);
19546 else if (isLegalConversion(SrcVT, false, Subtarget))
19547 return Op;
19549 if (DstVT.isVector())
19550 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
19552 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19553 return LowerWin64_INT128_TO_FP(Op, DAG);
19555 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
19556 return Extract;
19558 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
19559 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
19560 // Conversions from unsigned i32 to f32/f64 are legal,
19561 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
19562 return Op;
19565 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
19566 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
19567 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
19568 if (IsStrict)
19569 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
19570 {Chain, Src});
19571 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
19574 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
19575 return V;
19576 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
19577 return V;
19579 // The transform for i64->f64 isn't correct for 0 when rounding to negative
19580 // infinity. It produces -0.0, so disable under strictfp.
19581 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
19582 !IsStrict)
19583 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
19584 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
19585 // negative infinity, so disable it under strictfp and use FILD instead.
19586 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
19587 !IsStrict)
19588 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
19589 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
19590 (DstVT == MVT::f32 || DstVT == MVT::f64))
19591 return SDValue();
19593 // Make a 64-bit buffer, and use it to build an FILD.
19594 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
19595 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
19596 Align SlotAlign(8);
19597 MachinePointerInfo MPI =
19598 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19599 if (SrcVT == MVT::i32) {
19600 SDValue OffsetSlot =
19601 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
19602 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
19603 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
19604 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
19605 std::pair<SDValue, SDValue> Tmp =
19606 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
19607 if (IsStrict)
19608 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19610 return Tmp.first;
19613 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
19614 SDValue ValueToStore = Src;
19615 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
19616 // Bitcasting to f64 here allows us to do a single 64-bit store from
19617 // an SSE register, avoiding the store forwarding penalty that would come
19618 // with two 32-bit stores.
19619 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19621 SDValue Store =
19622 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
19623 // For i64 source, we need to add the appropriate power of 2 if the input
19624 // was negative. We must be careful to do the computation in x87 extended
19625 // precision, not in SSE.
19626 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
19627 SDValue Ops[] = { Store, StackSlot };
19628 SDValue Fild =
19629 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
19630 SlotAlign, MachineMemOperand::MOLoad);
19631 Chain = Fild.getValue(1);
19634 // Check whether the sign bit is set.
19635 SDValue SignSet = DAG.getSetCC(
19636 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
19637 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
19639 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
19640 APInt FF(64, 0x5F80000000000000ULL);
19641 SDValue FudgePtr = DAG.getConstantPool(
19642 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
19643 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
19645 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
19646 SDValue Zero = DAG.getIntPtrConstant(0, dl);
19647 SDValue Four = DAG.getIntPtrConstant(4, dl);
19648 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
19649 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
19651 // Load the value out, extending it from f32 to f80.
19652 SDValue Fudge = DAG.getExtLoad(
19653 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
19654 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
19655 CPAlignment);
19656 Chain = Fudge.getValue(1);
19657 // Extend everything to 80 bits to force it to be done on x87.
19658 // TODO: Are there any fast-math-flags to propagate here?
19659 if (IsStrict) {
19660 unsigned Opc = ISD::STRICT_FADD;
19661 // Windows needs the precision control changed to 80 bits around this add.
19662 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
19663 Opc = X86ISD::STRICT_FP80_ADD;
19665 SDValue Add =
19666 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
19667 // STRICT_FP_ROUND can't handle equal types.
19668 if (DstVT == MVT::f80)
19669 return Add;
19670 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
19671 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
19673 unsigned Opc = ISD::FADD;
19674 // Windows needs the precision control changed to 80 bits around this add.
19675 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
19676 Opc = X86ISD::FP80_ADD;
19678 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
19679 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
19680 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
19683 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
19684 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
19685 // just return an SDValue().
19686 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
19687 // to i16, i32 or i64, and we lower it to a legal sequence and return the
19688 // result.
19689 SDValue
19690 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
19691 bool IsSigned, SDValue &Chain) const {
19692 bool IsStrict = Op->isStrictFPOpcode();
19693 SDLoc DL(Op);
19695 EVT DstTy = Op.getValueType();
19696 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
19697 EVT TheVT = Value.getValueType();
19698 auto PtrVT = getPointerTy(DAG.getDataLayout());
19700 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
19701 // f16 must be promoted before using the lowering in this routine.
19702 // fp128 does not use this lowering.
19703 return SDValue();
19706 // If using FIST to compute an unsigned i64, we'll need some fixup
19707 // to handle values above the maximum signed i64. A FIST is always
19708 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
19709 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
19711 // FIXME: This does not generate an invalid exception if the input does not
19712 // fit in i32. PR44019
19713 if (!IsSigned && DstTy != MVT::i64) {
19714 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
19715 // The low 32 bits of the fist result will have the correct uint32 result.
19716 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
19717 DstTy = MVT::i64;
19720 assert(DstTy.getSimpleVT() <= MVT::i64 &&
19721 DstTy.getSimpleVT() >= MVT::i16 &&
19722 "Unknown FP_TO_INT to lower!");
19724 // We lower FP->int64 into FISTP64 followed by a load from a temporary
19725 // stack slot.
19726 MachineFunction &MF = DAG.getMachineFunction();
19727 unsigned MemSize = DstTy.getStoreSize();
19728 int SSFI =
19729 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
19730 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19732 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
19734 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
19736 if (UnsignedFixup) {
19738 // Conversion to unsigned i64 is implemented with a select,
19739 // depending on whether the source value fits in the range
19740 // of a signed i64. Let Thresh be the FP equivalent of
19741 // 0x8000000000000000ULL.
19743 // Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0;
19744 // FltOfs = (Value >= Thresh) ? Thresh : 0;
19745 // FistSrc = (Value - FltOfs);
19746 // Fist-to-mem64 FistSrc
19747 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
19748 // to XOR'ing the 64-bit result with Adjust.
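// For example (illustrative only): Value = 3 * 2^62 compares >= Thresh, so
// FltOfs = Thresh = 2^63 and FistSrc = 2^62; the signed FIST stores
// 0x4000000000000000, and XOR'ing with Adjust = 0x8000000000000000 yields
// 0xC000000000000000, i.e. the unsigned result 3 * 2^62.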
19750 // Being a power of 2, Thresh is exactly representable in all FP formats.
19751 // For X87 we'd like to use the smallest FP type for this constant, but
19752 // for DAG type consistency we have to match the FP operand type.
19754 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
19755 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
19756 bool LosesInfo = false;
19757 if (TheVT == MVT::f64)
19758 // The rounding mode is irrelevant as the conversion should be exact.
19759 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
19760 &LosesInfo);
19761 else if (TheVT == MVT::f80)
19762 Status = Thresh.convert(APFloat::x87DoubleExtended(),
19763 APFloat::rmNearestTiesToEven, &LosesInfo);
19765 assert(Status == APFloat::opOK && !LosesInfo &&
19766 "FP conversion should have been exact");
19768 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
19770 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
19771 *DAG.getContext(), TheVT);
19772 SDValue Cmp;
19773 if (IsStrict) {
19774 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
19775 /*IsSignaling*/ true);
19776 Chain = Cmp.getValue(1);
19777 } else {
19778 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
19781 // Our preferred lowering of
19783 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
19785 // is
19787 // (Value >= Thresh) << 63
19789 // but since we can get here after LegalOperations, DAGCombine might do the
19790 // wrong thing if we create a select. So, directly create the preferred
19791 // version.
19792 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
19793 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
19794 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
19796 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
19797 DAG.getConstantFP(0.0, DL, TheVT));
19799 if (IsStrict) {
19800 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
19801 { Chain, Value, FltOfs });
19802 Chain = Value.getValue(1);
19803 } else
19804 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
19807 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
19809 // FIXME: This causes a redundant load/store if the SSE-class value is already
19810 // in memory, such as if it is on the call stack.
19811 if (isScalarFPTypeInSSEReg(TheVT)) {
19812 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
19813 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
19814 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
19815 SDValue Ops[] = { Chain, StackSlot };
19817 unsigned FLDSize = TheVT.getStoreSize();
19818 assert(FLDSize <= MemSize && "Stack slot not big enough");
19819 MachineMemOperand *MMO = MF.getMachineMemOperand(
19820 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
19821 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
19822 Chain = Value.getValue(1);
19825 // Build the FP_TO_INT*_IN_MEM
19826 MachineMemOperand *MMO = MF.getMachineMemOperand(
19827 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
19828 SDValue Ops[] = { Chain, Value, StackSlot };
19829 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
19830 DAG.getVTList(MVT::Other),
19831 Ops, DstTy, MMO);
19833 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
19834 Chain = Res.getValue(1);
19836 // If we need an unsigned fixup, XOR the result with adjust.
19837 if (UnsignedFixup)
19838 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
19840 return Res;
19843 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
19844 const X86Subtarget &Subtarget) {
19845 MVT VT = Op.getSimpleValueType();
19846 SDValue In = Op.getOperand(0);
19847 MVT InVT = In.getSimpleValueType();
19848 SDLoc dl(Op);
19849 unsigned Opc = Op.getOpcode();
19851 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
19852 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
19853 "Unexpected extension opcode");
19854 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
19855 "Expected same number of elements");
19856 assert((VT.getVectorElementType() == MVT::i16 ||
19857 VT.getVectorElementType() == MVT::i32 ||
19858 VT.getVectorElementType() == MVT::i64) &&
19859 "Unexpected element type");
19860 assert((InVT.getVectorElementType() == MVT::i8 ||
19861 InVT.getVectorElementType() == MVT::i16 ||
19862 InVT.getVectorElementType() == MVT::i32) &&
19863 "Unexpected element type");
19865 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
19867 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
19868 assert(InVT == MVT::v32i8 && "Unexpected VT!");
19869 return splitVectorIntUnary(Op, DAG);
19872 if (Subtarget.hasInt256())
19873 return Op;
19875 // Optimize vectors in AVX mode:
19877 // v8i16 -> v8i32
19878 // Use vpmovzxwd for the 4 lower elements: v8i16 -> v4i32.
19879 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
19880 // Concat upper and lower parts.
19882 // v4i32 -> v4i64
19883 // Use vpmovzxdq for the 2 lower elements: v4i32 -> v2i64.
19884 // Use vpunpckhdq for the 2 upper elements: v4i32 -> v2i64.
19885 // Concat upper and lower parts.
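// Illustrative sketch with symbolic lanes: zero-extending v8i16
// <a,b,c,d,e,f,g,h> to v8i32 produces <a,b,c,d> as the low v4i32 via the
// in-reg extend, while unpacking the high half against zero yields the
// v8i16 <e,0,f,0,g,0,h,0>, which bitcasts to the v4i32 <e,f,g,h>; the two
// halves are then concatenated.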
19887 MVT HalfVT = VT.getHalfNumVectorElementsVT();
19888 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
19890 // Short-circuit if we can determine that each 128-bit half is the same value.
19891 // Otherwise, this is difficult to match and optimize.
19892 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
19893 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
19894 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
19896 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
19897 SDValue Undef = DAG.getUNDEF(InVT);
19898 bool NeedZero = Opc == ISD::ZERO_EXTEND;
19899 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
19900 OpHi = DAG.getBitcast(HalfVT, OpHi);
19902 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
19905 // Helper to split and extend a v16i1 mask to v16i8 or v16i16.
19906 static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
19907 const SDLoc &dl, SelectionDAG &DAG) {
19908 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
19909 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
19910 DAG.getIntPtrConstant(0, dl));
19911 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
19912 DAG.getIntPtrConstant(8, dl));
19913 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
19914 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
19915 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
19916 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19919 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
19920 const X86Subtarget &Subtarget,
19921 SelectionDAG &DAG) {
19922 MVT VT = Op->getSimpleValueType(0);
19923 SDValue In = Op->getOperand(0);
19924 MVT InVT = In.getSimpleValueType();
19925 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
19926 SDLoc DL(Op);
19927 unsigned NumElts = VT.getVectorNumElements();
19929 // For all vectors except vXi8, we can just emit a sign_extend and a shift.
19930 // This avoids a constant pool load.
19931 if (VT.getVectorElementType() != MVT::i8) {
19932 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
19933 return DAG.getNode(ISD::SRL, DL, VT, Extend,
19934 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
19937 // Extend VT if BWI is not supported.
19938 MVT ExtVT = VT;
19939 if (!Subtarget.hasBWI()) {
19940 // If v16i32 is to be avoided, we'll need to split and concatenate.
19941 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
19942 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
19944 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
19947 // Widen to 512-bits if VLX is not supported.
19948 MVT WideVT = ExtVT;
19949 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
19950 NumElts *= 512 / ExtVT.getSizeInBits();
19951 InVT = MVT::getVectorVT(MVT::i1, NumElts);
19952 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
19953 In, DAG.getIntPtrConstant(0, DL));
19954 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
19955 NumElts);
19958 SDValue One = DAG.getConstant(1, DL, WideVT);
19959 SDValue Zero = DAG.getConstant(0, DL, WideVT);
19961 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
19963 // Truncate if we had to extend above.
19964 if (VT != ExtVT) {
19965 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
19966 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
19969 // Extract back to 128/256-bit if we widened.
19970 if (WideVT != VT)
19971 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
19972 DAG.getIntPtrConstant(0, DL));
19974 return SelectedVal;
19977 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19978 SelectionDAG &DAG) {
19979 SDValue In = Op.getOperand(0);
19980 MVT SVT = In.getSimpleValueType();
19982 if (SVT.getVectorElementType() == MVT::i1)
19983 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
19985 assert(Subtarget.hasAVX() && "Expected AVX support");
19986 return LowerAVXExtend(Op, DAG, Subtarget);
19989 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
19990 /// It makes use of the fact that vectors with enough leading sign/zero bits
19991 /// prevent the PACKSS/PACKUS from saturating the results.
19992 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
19993 /// within each 128-bit lane.
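/// A rough example of one case: truncating a v8i32 whose elements are
/// already sign-extended from i16 down to v8i16 splits the source into two
/// v4i32 halves and emits a single PACKSSDW, since the known sign bits
/// guarantee the pack cannot saturate.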
19994 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
19995 const SDLoc &DL, SelectionDAG &DAG,
19996 const X86Subtarget &Subtarget) {
19997 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
19998 "Unexpected PACK opcode");
19999 assert(DstVT.isVector() && "VT not a vector?");
20001 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20002 if (!Subtarget.hasSSE2())
20003 return SDValue();
20005 EVT SrcVT = In.getValueType();
20007 // No truncation required; we might get here due to recursive calls.
20008 if (SrcVT == DstVT)
20009 return In;
20011 unsigned NumElems = SrcVT.getVectorNumElements();
20012 if (NumElems < 2 || !isPowerOf2_32(NumElems))
20013 return SDValue();
20015 unsigned DstSizeInBits = DstVT.getSizeInBits();
20016 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20017 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20018 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20020 LLVMContext &Ctx = *DAG.getContext();
20021 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20022 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20024 // Pack to the largest type possible:
20025 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20026 EVT InVT = MVT::i16, OutVT = MVT::i8;
20027 if (SrcVT.getScalarSizeInBits() > 16 &&
20028 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20029 InVT = MVT::i32;
20030 OutVT = MVT::i16;
20033 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20034 // On pre-AVX512, pack the src in both halves to help value tracking.
20035 if (SrcSizeInBits <= 128) {
20036 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20037 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20038 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
20039 SDValue LHS = DAG.getBitcast(InVT, In);
20040 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
20041 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
20042 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
20043 Res = DAG.getBitcast(PackedVT, Res);
20044 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20047 // Split lower/upper subvectors.
20048 SDValue Lo, Hi;
20049 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
20051 // If Hi is undef, then don't bother packing it and widen the result instead.
20052 if (Hi.isUndef()) {
20053 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
20054 if (SDValue Res =
20055 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
20056 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
20059 unsigned SubSizeInBits = SrcSizeInBits / 2;
20060 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20061 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20063 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20064 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20065 Lo = DAG.getBitcast(InVT, Lo);
20066 Hi = DAG.getBitcast(InVT, Hi);
20067 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20068 return DAG.getBitcast(DstVT, Res);
20071 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20072 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20073 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20074 Lo = DAG.getBitcast(InVT, Lo);
20075 Hi = DAG.getBitcast(InVT, Hi);
20076 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20078 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20079 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20080 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
20081 SmallVector<int, 64> Mask;
20082 int Scale = 64 / OutVT.getScalarSizeInBits();
20083 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
20084 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20086 if (DstVT.is256BitVector())
20087 return DAG.getBitcast(DstVT, Res);
20089 // If this is 512-bit -> 128-bit, truncate another stage.
20090 Res = DAG.getBitcast(PackedVT, Res);
20091 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20094 // Recursively pack lower/upper subvectors, concat result and pack again.
20095 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20097 if (PackedVT.is128BitVector()) {
20098 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
20099 // type legalization.
20100 SDValue Res =
20101 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
20102 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20105 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
20106 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
20107 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
20108 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20109 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20112 /// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
20113 /// e.g. trunc <8 x i32> X to <8 x i16> -->
20114 /// MaskX = X & 0xffff (clear high bits to prevent saturation)
20115 /// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
20116 static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
20117 const X86Subtarget &Subtarget,
20118 SelectionDAG &DAG) {
20119 In = DAG.getZeroExtendInReg(In, DL, DstVT);
20120 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
20123 /// Truncate using inreg sign extension and X86ISD::PACKSS.
20124 static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
20125 const X86Subtarget &Subtarget,
20126 SelectionDAG &DAG) {
20127 EVT SrcVT = In.getValueType();
20128 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
20129 DAG.getValueType(DstVT));
20130 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
20133 /// Helper to determine if \p In truncated to \p DstVT has the necessary
20134 /// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
20135 /// possibly by converting a SRL node to SRA for sign extension.
20136 static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
20137 SDValue In, const SDLoc &DL,
20138 SelectionDAG &DAG,
20139 const X86Subtarget &Subtarget) {
20140 // Requires SSE2.
20141 if (!Subtarget.hasSSE2())
20142 return SDValue();
20144 EVT SrcVT = In.getValueType();
20145 EVT DstSVT = DstVT.getVectorElementType();
20146 EVT SrcSVT = SrcVT.getVectorElementType();
20148 // Check we have a truncation suited for PACKSS/PACKUS.
20149 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20150 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20151 return SDValue();
20153 assert(SrcSVT.getSizeInBits() > DstSVT.getSizeInBits() && "Bad truncation");
20154 unsigned NumStages = Log2_32(SrcSVT.getSizeInBits() / DstSVT.getSizeInBits());
20156 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
20157 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
20158 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
20159 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
20160 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
20161 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
20162 return SDValue();
20164 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
20165 // split this for packing.
20166 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
20167 !isFreeToSplitVector(In.getNode(), DAG) &&
20168 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
20169 return SDValue();
20171 // Don't truncate on AVX512 targets with multiple stages of PACK nodes.
20172 if (Subtarget.hasAVX512() && NumStages > 1)
20173 return SDValue();
20175 unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
20176 unsigned NumPackedSignBits = std::min<unsigned>(DstSVT.getSizeInBits(), 16);
20177 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20179 // Truncate with PACKUS if we are truncating a vector with leading zero
20180 // bits that extend all the way to the packed/truncated value.
20181 // e.g. Masks, zext_in_reg, etc.
20182 // Pre-SSE41 we can only use PACKUSWB.
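// For instance (illustrative): a v8i32 source that was masked with 0xFF has
// at least 24 known leading zero bits per element, which satisfies the
// (NumSrcEltBits - NumPackedZeroBits) requirement for packing down to i8.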
20183 KnownBits Known = DAG.computeKnownBits(In);
20184 if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20185 PackOpcode = X86ISD::PACKUS;
20186 return In;
20189 // Truncate with PACKSS if we are truncating a vector with sign-bits
20190 // that extend all the way to the packed/truncated value.
20191 // e.g. Comparison result, sext_in_reg, etc.
20192 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
20194 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
20195 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
20196 // see through BITCASTs later on and combines/simplifications can't then use
20197 // it.
20198 if (DstSVT == MVT::i32 && NumSignBits != SrcSVT.getSizeInBits() &&
20199 !Subtarget.hasAVX512())
20200 return SDValue();
20202 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20203 if (MinSignBits < NumSignBits) {
20204 PackOpcode = X86ISD::PACKSS;
20205 return In;
20208 // If we have a srl that only generates signbits that we will discard in
20209 // the truncation then we can use PACKSS by converting the srl to a sra.
20210 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
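// Rough example: for a v4i32 -> v4i16 truncate of (srl X, 16), the 16 zero
// bits the srl shifts in are exactly the bits the truncate discards, so
// rewriting it as (sra X, 16) keeps the low 16 bits identical while leaving
// a value sign-extended from 16 bits, which PACKSSDW packs without
// saturating.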
20211 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20212 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
20213 In, APInt::getAllOnes(SrcVT.getVectorNumElements()))) {
20214 if (*ShAmt == MinSignBits) {
20215 PackOpcode = X86ISD::PACKSS;
20216 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
20220 return SDValue();
20223 /// This function lowers a vector truncation of 'extended sign-bits' or
20224 /// 'extended zero-bits' values.
20225 /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
20226 static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
20227 const SDLoc &DL,
20228 const X86Subtarget &Subtarget,
20229 SelectionDAG &DAG) {
20230 MVT SrcVT = In.getSimpleValueType();
20231 MVT DstSVT = DstVT.getVectorElementType();
20232 MVT SrcSVT = SrcVT.getVectorElementType();
20233 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20234 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20235 return SDValue();
20237 // If the upper half of the source is undef, then attempt to split and
20238 // only truncate the lower half.
20239 if (DstVT.getSizeInBits() >= 128) {
20240 SmallVector<SDValue> LowerOps;
20241 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20242 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20243 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
20244 Subtarget, DAG))
20245 return widenSubVector(Res, false, Subtarget, DAG, DL,
20246 DstVT.getSizeInBits());
20250 unsigned PackOpcode;
20251 if (SDValue Src =
20252 matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, Subtarget))
20253 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
20255 return SDValue();
20258 /// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
20259 /// X86ISD::PACKUS/X86ISD::PACKSS operations.
20260 static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
20261 const X86Subtarget &Subtarget,
20262 SelectionDAG &DAG) {
20263 MVT SrcVT = In.getSimpleValueType();
20264 MVT DstSVT = DstVT.getVectorElementType();
20265 MVT SrcSVT = SrcVT.getVectorElementType();
20266 unsigned NumElems = DstVT.getVectorNumElements();
20267 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20268 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
20269 NumElems >= 8))
20270 return SDValue();
20272 // SSSE3's pshufb results in fewer instructions in the cases below.
20273 if (Subtarget.hasSSSE3() && NumElems == 8) {
20274 if (SrcSVT == MVT::i16)
20275 return SDValue();
20276 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
20277 return SDValue();
20280 // If the upper half of the source is undef, then attempt to split and
20281 // only truncate the lower half.
20282 if (DstVT.getSizeInBits() >= 128) {
20283 SmallVector<SDValue> LowerOps;
20284 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20285 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20286 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
20287 return widenSubVector(Res, false, Subtarget, DAG, DL,
20288 DstVT.getSizeInBits());
20292 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
20293 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
20294 // truncate 2 x v4i32 to v8i16.
20295 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
20296 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
20298 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
20299 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
20301 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
20302 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
20303 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
20304 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
20305 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
20308 return SDValue();
20311 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
20312 const X86Subtarget &Subtarget) {
20314 SDLoc DL(Op);
20315 MVT VT = Op.getSimpleValueType();
20316 SDValue In = Op.getOperand(0);
20317 MVT InVT = In.getSimpleValueType();
20319 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
20321 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
20322 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
20323 if (InVT.getScalarSizeInBits() <= 16) {
20324 if (Subtarget.hasBWI()) {
20325 // legal, will go to VPMOVB2M, VPMOVW2M
20326 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20327 // We need to shift to get the lsb into the sign position.
20328 // Shifting packed bytes is not supported natively, so bitcast to words.
20329 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
20330 In = DAG.getNode(ISD::SHL, DL, ExtVT,
20331 DAG.getBitcast(ExtVT, In),
20332 DAG.getConstant(ShiftInx, DL, ExtVT));
20333 In = DAG.getBitcast(InVT, In);
20335 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
20336 In, ISD::SETGT);
20338 // Use TESTD/Q after extending the vector to packed dword/qword.
20339 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
20340 "Unexpected vector type.");
20341 unsigned NumElts = InVT.getVectorNumElements();
20342 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
20343 // We need to change to a wider element type that we have support for.
20344 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
20345 // For 16 element vectors we extend to v16i32 unless we are explicitly
20346 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
20347 // we need to split into two 8 element vectors which we can extend to v8i32,
20348 // truncate and concat the results. There's an additional complication if
20349 // the original type is v16i8. In that case we can't split the v16i8
20350 // directly, so we need to shuffle high elements to low and use
20351 // sign_extend_vector_inreg.
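// Sketch of the v16i8 case without 512-bit DQ: the low 8 bytes go through
// sign_extend_vector_inreg to v8i32, the high 8 bytes are shuffled down and
// extended the same way, each half is truncated to v8i1, and the two v8i1
// results are concatenated into the final v16i1.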
20352 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
20353 SDValue Lo, Hi;
20354 if (InVT == MVT::v16i8) {
20355 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
20356 Hi = DAG.getVectorShuffle(
20357 InVT, DL, In, In,
20358 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
20359 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
20360 } else {
20361 assert(InVT == MVT::v16i16 && "Unexpected VT!");
20362 Lo = extract128BitVector(In, 0, DAG, DL);
20363 Hi = extract128BitVector(In, 8, DAG, DL);
20365 // We're split now; just emit two truncates and a concat. The two
20366 // truncates will trigger legalization to come back to this function.
20367 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
20368 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
20369 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20371 // We either have 8 elements or we're allowed to use 512-bit vectors.
20372 // If we have VLX, we want to use the narrowest vector that can get the
20373 // job done so we use vXi32.
20374 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
20375 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
20376 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
20377 InVT = ExtVT;
20378 ShiftInx = InVT.getScalarSizeInBits() - 1;
20381 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20382 // We need to shift to get the lsb into sign position.
20383 In = DAG.getNode(ISD::SHL, DL, InVT, In,
20384 DAG.getConstant(ShiftInx, DL, InVT));
20386 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
20387 if (Subtarget.hasDQI())
20388 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
20389 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
20392 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
20393 SDLoc DL(Op);
20394 MVT VT = Op.getSimpleValueType();
20395 SDValue In = Op.getOperand(0);
20396 MVT InVT = In.getSimpleValueType();
20397 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20398 "Invalid TRUNCATE operation");
20400 // If we're called by the type legalizer, handle a few cases.
20401 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20402 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
20403 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
20404 VT.is128BitVector() && Subtarget.hasAVX512()) {
20405 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
20406 "Unexpected subtarget!");
20407 // The default behavior is to truncate one step, concatenate, and then
20408 // truncate the remainder. We'd rather produce two 64-bit results and
20409 // concatenate those.
20410 SDValue Lo, Hi;
20411 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
20413 EVT LoVT, HiVT;
20414 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
20416 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
20417 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
20418 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20421 // Pre-AVX512 (or prefer-256bit), see if we can make use of PACKSS/PACKUS.
20422 if (!Subtarget.hasAVX512() ||
20423 (InVT.is512BitVector() && VT.is256BitVector()))
20424 if (SDValue SignPack =
20425 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20426 return SignPack;
20428 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
20429 if (!Subtarget.hasAVX512())
20430 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
20432 // Otherwise let default legalization handle it.
20433 return SDValue();
20436 if (VT.getVectorElementType() == MVT::i1)
20437 return LowerTruncateVecI1(Op, DAG, Subtarget);
20439 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
20440 // concat from subvectors to use VPTRUNC etc.
20441 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
20442 if (SDValue SignPack =
20443 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20444 return SignPack;
20446 // vpmovqb/w/d, vpmovdb/w, vpmovwb
20447 if (Subtarget.hasAVX512()) {
20448 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
20449 assert(VT == MVT::v32i8 && "Unexpected VT!");
20450 return splitVectorIntUnary(Op, DAG);
20453 // Word to byte is only legal under BWI. Otherwise we have to promote to v16i32
20454 // and then truncate that. But we should only do that if we haven't been
20455 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
20456 // handled by isel patterns.
20457 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
20458 Subtarget.canExtendTo512DQ())
20459 return Op;
20462 // Handle truncation of V256 to V128 using shuffles.
20463 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
20465 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
20466 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
20467 if (Subtarget.hasInt256()) {
20468 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
20469 In = DAG.getBitcast(MVT::v8i32, In);
20470 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
20471 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
20472 DAG.getIntPtrConstant(0, DL));
20475 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20476 DAG.getIntPtrConstant(0, DL));
20477 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20478 DAG.getIntPtrConstant(2, DL));
20479 static const int ShufMask[] = {0, 2, 4, 6};
20480 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
20481 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
20484 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
20485 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
20486 if (Subtarget.hasInt256()) {
20487 // The PSHUFB mask:
20488 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
20489 -1, -1, -1, -1, -1, -1, -1, -1,
20490 16, 17, 20, 21, 24, 25, 28, 29,
20491 -1, -1, -1, -1, -1, -1, -1, -1 };
20492 In = DAG.getBitcast(MVT::v32i8, In);
20493 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
20494 In = DAG.getBitcast(MVT::v4i64, In);
20496 static const int ShufMask2[] = {0, 2, -1, -1};
20497 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
20498 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20499 DAG.getIntPtrConstant(0, DL));
20500 return DAG.getBitcast(MVT::v8i16, In);
20503 return Subtarget.hasSSE41()
20504 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
20505 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
20508 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
20509 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
20511 llvm_unreachable("All 256->128 cases should have been handled above!");
20514 // We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
20515 // behaves on out of range inputs to generate optimized conversions.
20516 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
20517 SelectionDAG &DAG,
20518 const X86Subtarget &Subtarget) {
20519 MVT SrcVT = Src.getSimpleValueType();
20520 unsigned DstBits = VT.getScalarSizeInBits();
20521 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
20523 // Calculate the converted result for values in the range 0 to
20524 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
20525 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
20526 SDValue Big =
20527 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
20528 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
20529 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
20531 // The "CVTTP2SI" instruction conveniently sets the sign bit if
20532 // and only if the value was out of range. So we can use that
20533 // as our indicator to use "Big" rather than "Small".
20535 // Use "Small" if "IsOverflown" has all bits cleared
20536 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
20538 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
20539 // use the slightly slower blendv select instead.
20540 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
20541 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
20542 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
20545 SDValue IsOverflown =
20546 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
20547 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
20548 return DAG.getNode(ISD::OR, dl, VT, Small,
20549 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
20552 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
20553 bool IsStrict = Op->isStrictFPOpcode();
20554 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
20555 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
20556 MVT VT = Op->getSimpleValueType(0);
20557 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20558 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
20559 MVT SrcVT = Src.getSimpleValueType();
20560 SDLoc dl(Op);
20562 SDValue Res;
20563 if (isSoftF16(SrcVT, Subtarget)) {
20564 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20565 if (IsStrict)
20566 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
20567 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
20568 {NVT, MVT::Other}, {Chain, Src})});
20569 return DAG.getNode(Op.getOpcode(), dl, VT,
20570 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
20571 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
20572 return Op;
20575 if (VT.isVector()) {
20576 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
20577 MVT ResVT = MVT::v4i32;
20578 MVT TruncVT = MVT::v4i1;
20579 unsigned Opc;
20580 if (IsStrict)
20581 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
20582 else
20583 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
20585 if (!IsSigned && !Subtarget.hasVLX()) {
20586 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
20587 // Widen to 512-bits.
20588 ResVT = MVT::v8i32;
20589 TruncVT = MVT::v8i1;
20590 Opc = Op.getOpcode();
20591 // Need to concat with zero vector for strict fp to avoid spurious
20592 // exceptions.
20593 // TODO: Should we just do this for non-strict as well?
20594 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
20595 : DAG.getUNDEF(MVT::v8f64);
20596 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
20597 DAG.getIntPtrConstant(0, dl));
20599 if (IsStrict) {
20600 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
20601 Chain = Res.getValue(1);
20602 } else {
20603 Res = DAG.getNode(Opc, dl, ResVT, Src);
20606 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
20607 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
20608 DAG.getIntPtrConstant(0, dl));
20609 if (IsStrict)
20610 return DAG.getMergeValues({Res, Chain}, dl);
20611 return Res;
20614 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
20615 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
20616 return Op;
20618 MVT ResVT = VT;
20619 MVT EleVT = VT.getVectorElementType();
20620 if (EleVT != MVT::i64)
20621 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
20623 if (SrcVT != MVT::v8f16) {
20624 SDValue Tmp =
20625 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
20626 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
20627 Ops[0] = Src;
20628 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
20631 if (IsStrict) {
20632 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
20633 : X86ISD::STRICT_CVTTP2UI,
20634 dl, {ResVT, MVT::Other}, {Chain, Src});
20635 Chain = Res.getValue(1);
20636 } else {
20637 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
20638 ResVT, Src);
20641 // TODO: Need to add exception check code for strict FP.
20642 if (EleVT.getSizeInBits() < 16) {
20643 ResVT = MVT::getVectorVT(EleVT, 8);
20644 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
20647 if (ResVT != VT)
20648 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20649 DAG.getIntPtrConstant(0, dl));
20651 if (IsStrict)
20652 return DAG.getMergeValues({Res, Chain}, dl);
20653 return Res;
20656 // v8f32/v16f32/v8f64->v8i16/v16i16 need to be widened first.
20657 if (VT.getVectorElementType() == MVT::i16) {
20658 assert((SrcVT.getVectorElementType() == MVT::f32 ||
20659 SrcVT.getVectorElementType() == MVT::f64) &&
20660 "Expected f32/f64 vector!");
20661 MVT NVT = VT.changeVectorElementType(MVT::i32);
20662 if (IsStrict) {
20663 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
20664 : ISD::STRICT_FP_TO_UINT,
20665 dl, {NVT, MVT::Other}, {Chain, Src});
20666 Chain = Res.getValue(1);
20667 } else {
20668 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
20669 NVT, Src);
20672 // TODO: Need to add exception check code for strict FP.
20673 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20675 if (IsStrict)
20676 return DAG.getMergeValues({Res, Chain}, dl);
20677 return Res;
20680 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
20681 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
20682 assert(!IsSigned && "Expected unsigned conversion!");
20683 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
20684 return Op;
20687 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
20688 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
20689 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
20690 Subtarget.useAVX512Regs()) {
20691 assert(!IsSigned && "Expected unsigned conversion!");
20692 assert(!Subtarget.hasVLX() && "Unexpected features!");
20693 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20694 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20695 // Need to concat with zero vector for strict fp to avoid spurious
20696 // exceptions.
20697 // TODO: Should we just do this for non-strict as well?
20698 SDValue Tmp =
20699 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20700 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20701 DAG.getIntPtrConstant(0, dl));
20703 if (IsStrict) {
20704 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
20705 {Chain, Src});
20706 Chain = Res.getValue(1);
20707 } else {
20708 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
20711 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20712 DAG.getIntPtrConstant(0, dl));
20714 if (IsStrict)
20715 return DAG.getMergeValues({Res, Chain}, dl);
20716 return Res;
20719 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
20720 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
20721 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
20722 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
20723 assert(!Subtarget.hasVLX() && "Unexpected features!");
20724 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20725 // Need to concat with zero vector for strict fp to avoid spurious
20726 // exceptions.
20727 // TODO: Should we just do this for non-strict as well?
20728 SDValue Tmp =
20729 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20730 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20731 DAG.getIntPtrConstant(0, dl));
20733 if (IsStrict) {
20734 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
20735 {Chain, Src});
20736 Chain = Res.getValue(1);
20737 } else {
20738 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
20741 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20742 DAG.getIntPtrConstant(0, dl));
20744 if (IsStrict)
20745 return DAG.getMergeValues({Res, Chain}, dl);
20746 return Res;
20749 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
20750 if (!Subtarget.hasVLX()) {
20751 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
20752 // legalizer and then widened again by vector op legalization.
20753 if (!IsStrict)
20754 return SDValue();
20756 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
20757 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
20758 {Src, Zero, Zero, Zero});
20759 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
20760 {Chain, Tmp});
20761 SDValue Chain = Tmp.getValue(1);
20762 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
20763 DAG.getIntPtrConstant(0, dl));
20764 return DAG.getMergeValues({Tmp, Chain}, dl);
20767 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
20768 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
20769 DAG.getUNDEF(MVT::v2f32));
20770 if (IsStrict) {
20771 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
20772 : X86ISD::STRICT_CVTTP2UI;
20773 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
20775 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
20776 return DAG.getNode(Opc, dl, VT, Tmp);
20779 // Generate optimized instructions for pre AVX512 unsigned conversions from
20780 // vXf32 to vXi32.
20781 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
20782 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
20783 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
20784 assert(!IsSigned && "Expected unsigned conversion!");
20785 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
20788 return SDValue();
20791 assert(!VT.isVector());
20793 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
20795 if (!IsSigned && UseSSEReg) {
20796 // Conversions from f32/f64 with AVX512 should be legal.
20797 if (Subtarget.hasAVX512())
20798 return Op;
20800 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
20801 // behaves on out of range inputs to generate optimized conversions.
20802 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
20803 (VT == MVT::i64 && Subtarget.is64Bit()))) {
20804 unsigned DstBits = VT.getScalarSizeInBits();
20805 APInt UIntLimit = APInt::getSignMask(DstBits);
20806 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
20807 DAG.getConstant(UIntLimit, dl, VT));
20808 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
20810 // Calculate the converted result for values in the range:
20811 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
20812 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
20813 SDValue Small =
20814 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
20815 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
20816 SDValue Big = DAG.getNode(
20817 X86ISD::CVTTS2SI, dl, VT,
20818 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
20819 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
20821 // The "CVTTS2SI" instruction conveniently sets the sign bit if
20822 // and only if the value was out of range. So we can use that
20823 // as our indicator to use "Big" rather than "Small".
20825 // Use "Small" if "IsOverflown" has all bits cleared
20826 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
20827 SDValue IsOverflown = DAG.getNode(
20828 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
20829 return DAG.getNode(ISD::OR, dl, VT, Small,
20830 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
20833 // Use default expansion for i64.
20834 if (VT == MVT::i64)
20835 return SDValue();
20837 assert(VT == MVT::i32 && "Unexpected VT!");
20839 // Promote i32 to i64 and use a signed operation on 64-bit targets.
20840 // FIXME: This does not generate an invalid exception if the input does not
20841 // fit in i32. PR44019
20842 if (Subtarget.is64Bit()) {
20843 if (IsStrict) {
20844 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
20845 {Chain, Src});
20846 Chain = Res.getValue(1);
20847 } else
20848 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
20850 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20851 if (IsStrict)
20852 return DAG.getMergeValues({Res, Chain}, dl);
20853 return Res;
20856 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
20857 // use fisttp which will be handled later.
20858 if (!Subtarget.hasSSE3())
20859 return SDValue();
20862 // Promote i16 to i32 if we can use an SSE operation or the type is f128.
20863 // FIXME: This does not generate an invalid exception if the input does not
20864 // fit in i16. PR44019
20865 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
20866 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
20867 if (IsStrict) {
20868 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
20869 {Chain, Src});
20870 Chain = Res.getValue(1);
20871 } else
20872 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
20874 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20875 if (IsStrict)
20876 return DAG.getMergeValues({Res, Chain}, dl);
20877 return Res;
20880 // If this is an FP_TO_SINT using SSEReg, we're done.
20881 if (UseSSEReg && IsSigned)
20882 return Op;
20884 // fp128 needs to use a libcall.
20885 if (SrcVT == MVT::f128) {
20886 RTLIB::Libcall LC;
20887 if (IsSigned)
20888 LC = RTLIB::getFPTOSINT(SrcVT, VT);
20889 else
20890 LC = RTLIB::getFPTOUINT(SrcVT, VT);
20892 MakeLibCallOptions CallOptions;
20893 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
20894 SDLoc(Op), Chain);
20896 if (IsStrict)
20897 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
20899 return Tmp.first;
20902 // Fall back to X87.
20903 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
20904 if (IsStrict)
20905 return DAG.getMergeValues({V, Chain}, dl);
20906 return V;
20909 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
20912 SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
20913 SelectionDAG &DAG) const {
20914 SDValue Src = Op.getOperand(0);
20915 MVT SrcVT = Src.getSimpleValueType();
20917 if (SrcVT == MVT::f16)
20918 return SDValue();
20920 // If the source is in an SSE register, the node is Legal.
20921 if (isScalarFPTypeInSSEReg(SrcVT))
20922 return Op;
20924 return LRINT_LLRINTHelper(Op.getNode(), DAG);
20927 SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
20928 SelectionDAG &DAG) const {
20929 EVT DstVT = N->getValueType(0);
20930 SDValue Src = N->getOperand(0);
20931 EVT SrcVT = Src.getValueType();
20933 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
20934 // f16 must be promoted before using the lowering in this routine.
20935 // fp128 does not use this lowering.
20936 return SDValue();
20939 SDLoc DL(N);
20940 SDValue Chain = DAG.getEntryNode();
20942 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
20944 // If we're converting from SSE, the stack slot needs to hold both types.
20945 // Otherwise it only needs to hold the DstVT.
20946 EVT OtherVT = UseSSE ? SrcVT : DstVT;
20947 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
20948 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
20949 MachinePointerInfo MPI =
20950 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
20952 if (UseSSE) {
20953 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
20954 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
20955 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20956 SDValue Ops[] = { Chain, StackPtr };
20958 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
20959 /*Align*/ std::nullopt,
20960 MachineMemOperand::MOLoad);
20961 Chain = Src.getValue(1);
20964 SDValue StoreOps[] = { Chain, Src, StackPtr };
20965 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
20966 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
20967 MachineMemOperand::MOStore);
20969 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
20972 SDValue
20973 X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
20974 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
20975 // but making use of X86 specifics to produce better instruction sequences.
20976 SDNode *Node = Op.getNode();
20977 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
20978 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
20979 SDLoc dl(SDValue(Node, 0));
20980 SDValue Src = Node->getOperand(0);
20982 // There are three types involved here: SrcVT is the source floating point
20983 // type, DstVT is the type of the result, and TmpVT is the result of the
20984 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
20985 // DstVT).
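// As an illustration: for an f32 fptosi.sat with an i8 result, DstVT is i8,
// TmpVT is promoted to i32 below, and because the saturation width (8) is
// smaller than TmpVT's width a plain signed cvttss2si can be used before
// the clamped value is truncated back down to i8.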
20986 EVT SrcVT = Src.getValueType();
20987 EVT DstVT = Node->getValueType(0);
20988 EVT TmpVT = DstVT;
20990 // This code is only for floats and doubles. Fall back to generic code for
20991 // anything else.
20992 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
20993 return SDValue();
20995 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
20996 unsigned SatWidth = SatVT.getScalarSizeInBits();
20997 unsigned DstWidth = DstVT.getScalarSizeInBits();
20998 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
20999 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21000 "Expected saturation width smaller than result width");
21002 // Promote result of FP_TO_*INT to at least 32 bits.
21003 if (TmpWidth < 32) {
21004 TmpVT = MVT::i32;
21005 TmpWidth = 32;
21008 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21009 // us to use a native signed conversion instead.
21010 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21011 TmpVT = MVT::i64;
21012 TmpWidth = 64;
21015 // If the saturation width is smaller than the size of the temporary result,
21016 // we can always use signed conversion, which is native.
21017 if (SatWidth < TmpWidth)
21018 FpToIntOpcode = ISD::FP_TO_SINT;
21020 // Determine minimum and maximum integer values and their corresponding
21021 // floating-point values.
21022 APInt MinInt, MaxInt;
21023 if (IsSigned) {
21024 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
21025 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
21026 } else {
21027 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
21028 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
21031 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21032 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21034 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21035 MinInt, IsSigned, APFloat::rmTowardZero);
21036 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21037 MaxInt, IsSigned, APFloat::rmTowardZero);
21038 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21039 && !(MaxStatus & APFloat::opStatus::opInexact);
21041 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21042 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21044 // If the integer bounds are exactly representable as floats, emit a
21045 // min+max+fptoi sequence. Otherwise use comparisons and selects.
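// Sketch of the exact-bounds case: saturating f64 to i32 has bounds -2^31
// and 2^31 - 1, both exactly representable as doubles, so the lowering is a
// clamp with FMAX/FMIN followed by cvttsd2si, plus a single NaN check for
// the signed case, rather than a chain of compare+select nodes.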
21046 if (AreExactFloatBounds) {
21047 if (DstVT != TmpVT) {
21048 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21049 SDValue MinClamped = DAG.getNode(
21050 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21051 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21052 SDValue BothClamped = DAG.getNode(
21053 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21054 // Convert clamped value to integer.
21055 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21057 // NaN will become INDVAL, with the top bit set and the rest zero.
21058 // Truncation will discard the top bit, resulting in zero.
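// (Illustration: with TmpVT == i32, INDVAL is 0x80000000, and truncating
// that to i8 keeps only the low 8 bits, which are all zero.)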
21059 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21062 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21063 SDValue MinClamped = DAG.getNode(
21064 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21065 // Clamp by MaxFloat from above. NaN cannot occur.
21066 SDValue BothClamped = DAG.getNode(
21067 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21068 // Convert clamped value to integer.
21069 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21071 if (!IsSigned) {
21072 // In the unsigned case we're done, because we mapped NaN to MinFloat,
21073 // which is zero.
21074 return FpToInt;
21077 // Otherwise, select zero if Src is NaN.
21078 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21079 return DAG.getSelectCC(
21080 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21083 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21084 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21086 // Result of direct conversion, which may be selected away.
21087 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21089 if (DstVT != TmpVT) {
21090 // NaN will become INDVAL, with the top bit set and the rest zero.
21091 // Truncation will discard the top bit, resulting in zero.
21092 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21095 SDValue Select = FpToInt;
21096 // For signed conversions where we saturate to the same size as the
21097 // result type of the fptoi instructions, INDVAL coincides with integer
21098 // minimum, so we don't need to explicitly check it.
21099 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21100 // If Src ULT MinFloat, select MinInt. In particular, this also selects
21101 // MinInt if Src is NaN.
21102 Select = DAG.getSelectCC(
21103 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21106 // If Src OGT MaxFloat, select MaxInt.
21107 Select = DAG.getSelectCC(
21108 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21110 // In the unsigned case we are done, because we mapped NaN to MinInt, which
21111 // is already zero. The promoted case was already handled above.
21112 if (!IsSigned || DstVT != TmpVT) {
21113 return Select;
21116 // Otherwise, select 0 if Src is NaN.
21117 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21118 return DAG.getSelectCC(
21119 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21122 SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21123 bool IsStrict = Op->isStrictFPOpcode();
21125 SDLoc DL(Op);
21126 MVT VT = Op.getSimpleValueType();
21127 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21128 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21129 MVT SVT = In.getSimpleValueType();
21131 // Let f16->f80 get lowered to a libcall, except for Darwin, where we should
21132 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available).
21133 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
21134 !Subtarget.getTargetTriple().isOSDarwin()))
21135 return SDValue();
21137 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
21138 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
21139 return Op;
21141 if (SVT == MVT::f16) {
21142 if (Subtarget.hasFP16())
21143 return Op;
21145 if (VT != MVT::f32) {
21146 if (IsStrict)
21147 return DAG.getNode(
21148 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
21149 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
21150 {MVT::f32, MVT::Other}, {Chain, In})});
21152 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
21153 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
21156 if (!Subtarget.hasF16C()) {
21157 if (!Subtarget.getTargetTriple().isOSDarwin())
21158 return SDValue();
21160 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
21162 // Need a libcall, but the ABI for f16 is soft-float on macOS.
21163 TargetLowering::CallLoweringInfo CLI(DAG);
21164 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21166 In = DAG.getBitcast(MVT::i16, In);
21167 TargetLowering::ArgListTy Args;
21168 TargetLowering::ArgListEntry Entry;
21169 Entry.Node = In;
21170 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
21171 Entry.IsSExt = false;
21172 Entry.IsZExt = true;
21173 Args.push_back(Entry);
21175 SDValue Callee = DAG.getExternalSymbol(
21176 getLibcallName(RTLIB::FPEXT_F16_F32),
21177 getPointerTy(DAG.getDataLayout()));
21178 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21179 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
21180 std::move(Args));
21182 SDValue Res;
21183 std::tie(Res,Chain) = LowerCallTo(CLI);
21184 if (IsStrict)
21185 Res = DAG.getMergeValues({Res, Chain}, DL);
21187 return Res;
21190 In = DAG.getBitcast(MVT::i16, In);
21191 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
21192 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
21193 DAG.getIntPtrConstant(0, DL));
21194 SDValue Res;
21195 if (IsStrict) {
21196 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
21197 {Chain, In});
21198 Chain = Res.getValue(1);
21199 } else {
21200 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
21201 DAG.getTargetConstant(4, DL, MVT::i32));
21203 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
21204 DAG.getIntPtrConstant(0, DL));
21205 if (IsStrict)
21206 return DAG.getMergeValues({Res, Chain}, DL);
21207 return Res;
21210 if (!SVT.isVector())
21211 return Op;
21213 if (SVT.getVectorElementType() == MVT::bf16) {
21214 // FIXME: Do we need to support strict FP?
21215 assert(!IsStrict && "Strict FP doesn't support BF16");
21216 if (VT.getVectorElementType() == MVT::f64) {
21217 MVT TmpVT = VT.changeVectorElementType(MVT::f32);
21218 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
21219 DAG.getNode(ISD::FP_EXTEND, DL, TmpVT, In));
21221 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
21222 MVT NVT = SVT.changeVectorElementType(MVT::i32);
21223 In = DAG.getBitcast(SVT.changeTypeToInteger(), In);
21224 In = DAG.getNode(ISD::ZERO_EXTEND, DL, NVT, In);
21225 In = DAG.getNode(ISD::SHL, DL, NVT, In, DAG.getConstant(16, DL, NVT));
21226 return DAG.getBitcast(VT, In);
21229 if (SVT.getVectorElementType() == MVT::f16) {
21230 if (Subtarget.hasFP16() && isTypeLegal(SVT))
21231 return Op;
21232 assert(Subtarget.hasF16C() && "Unexpected features!");
21233 if (SVT == MVT::v2f16)
21234 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
21235 DAG.getUNDEF(MVT::v2f16));
21236 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
21237 DAG.getUNDEF(MVT::v4f16));
21238 if (IsStrict)
21239 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21240 {Op->getOperand(0), Res});
21241 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21242 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
21243 return Op;
21246 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21248 SDValue Res =
21249 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21250 if (IsStrict)
21251 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21252 {Op->getOperand(0), Res});
21253 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21256 SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21257 bool IsStrict = Op->isStrictFPOpcode();
21259 SDLoc DL(Op);
21260 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21261 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21262 MVT VT = Op.getSimpleValueType();
21263 MVT SVT = In.getSimpleValueType();
21265 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
21266 return SDValue();
21268 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
21269 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
21270 if (!Subtarget.getTargetTriple().isOSDarwin())
21271 return SDValue();
21273 // We need a libcall, but the ABI for f16 libcalls on macOS is soft-float.
21274 TargetLowering::CallLoweringInfo CLI(DAG);
21275 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21277 TargetLowering::ArgListTy Args;
21278 TargetLowering::ArgListEntry Entry;
21279 Entry.Node = In;
21280 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
21281 Entry.IsSExt = false;
21282 Entry.IsZExt = true;
21283 Args.push_back(Entry);
21285 SDValue Callee = DAG.getExternalSymbol(
21286 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
21287 : RTLIB::FPROUND_F32_F16),
21288 getPointerTy(DAG.getDataLayout()));
21289 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21290 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
21291 std::move(Args));
21293 SDValue Res;
21294 std::tie(Res, Chain) = LowerCallTo(CLI);
21296 Res = DAG.getBitcast(MVT::f16, Res);
21298 if (IsStrict)
21299 Res = DAG.getMergeValues({Res, Chain}, DL);
21301 return Res;
21304 if (VT.getScalarType() == MVT::bf16) {
21305 if (SVT.getScalarType() == MVT::f32 && isTypeLegal(VT))
21306 return Op;
21307 return SDValue();
21310 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
21311 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
21312 return SDValue();
21314 if (VT.isVector())
21315 return Op;
21317 SDValue Res;
21318 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
21319 MVT::i32);
21320 if (IsStrict) {
21321 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
21322 DAG.getConstantFP(0, DL, MVT::v4f32), In,
21323 DAG.getIntPtrConstant(0, DL));
21324 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
21325 {Chain, Res, Rnd});
21326 Chain = Res.getValue(1);
21327 } else {
21328 // FIXME: Should we use zeros for upper elements for non-strict?
21329 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
21330 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
21333 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
21334 DAG.getIntPtrConstant(0, DL));
21335 Res = DAG.getBitcast(MVT::f16, Res);
21337 if (IsStrict)
21338 return DAG.getMergeValues({Res, Chain}, DL);
21340 return Res;
21343 return Op;
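// A rough sketch of the F16C scalar path above (illustrative, not the exact
// selected sequence): a scalar f32 -> f16 round is widened so the vector
// CVTPS2PH instruction can be reused:
//   (f16 (fp_round f32 %x))
//     -> (v4f32 insert/scalar_to_vector %x)
//     -> (v8i16 X86ISD::CVTPS2PH ..., CUR_DIRECTION)
//     -> (i16 extract_vector_elt ..., 0) -> (f16 bitcast)
// i.e. one VCVTPS2PH plus a scalar extract instead of a libcall.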
21346 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
21347 bool IsStrict = Op->isStrictFPOpcode();
21348 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21349 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
21350 "Unexpected VT!");
21352 SDLoc dl(Op);
21353 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21354 DAG.getConstant(0, dl, MVT::v8i16), Src,
21355 DAG.getIntPtrConstant(0, dl));
21357 SDValue Chain;
21358 if (IsStrict) {
21359 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21360 {Op.getOperand(0), Res});
21361 Chain = Res.getValue(1);
21362 } else {
21363 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21366 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21367 DAG.getIntPtrConstant(0, dl));
21369 if (IsStrict)
21370 return DAG.getMergeValues({Res, Chain}, dl);
21372 return Res;
21375 static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
21376 bool IsStrict = Op->isStrictFPOpcode();
21377 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21378 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
21379 "Unexpected VT!");
21381 SDLoc dl(Op);
21382 SDValue Res, Chain;
21383 if (IsStrict) {
21384 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21385 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21386 DAG.getIntPtrConstant(0, dl));
21387 Res = DAG.getNode(
21388 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21389 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21390 Chain = Res.getValue(1);
21391 } else {
21392 // FIXME: Should we use zeros for upper elements for non-strict?
21393 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21394 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21395 DAG.getTargetConstant(4, dl, MVT::i32));
21398 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21399 DAG.getIntPtrConstant(0, dl));
21401 if (IsStrict)
21402 return DAG.getMergeValues({Res, Chain}, dl);
21404 return Res;
21407 SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
21408 SelectionDAG &DAG) const {
21409 SDLoc DL(Op);
21410 MakeLibCallOptions CallOptions;
21411 RTLIB::Libcall LC =
21412 RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
21413 SDValue Res =
21414 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
21415 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
21416 DAG.getBitcast(MVT::i32, Res));
21419 /// Depending on uarch and/or optimizing for size, we might prefer to use a
21420 /// vector operation in place of the typical scalar operation.
21421 static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
21422 const X86Subtarget &Subtarget) {
21423 // If both operands have other uses, this is probably not profitable.
21424 SDValue LHS = Op.getOperand(0);
21425 SDValue RHS = Op.getOperand(1);
21426 if (!LHS.hasOneUse() && !RHS.hasOneUse())
21427 return Op;
21429   // FP horizontal add/sub were added with SSE3; the integer forms with SSSE3.
21430 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
21431 if (IsFP && !Subtarget.hasSSE3())
21432 return Op;
21433 if (!IsFP && !Subtarget.hasSSSE3())
21434 return Op;
21436 // Extract from a common vector.
21437 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21438 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21439 LHS.getOperand(0) != RHS.getOperand(0) ||
21440 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
21441 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
21442 !shouldUseHorizontalOp(true, DAG, Subtarget))
21443 return Op;
21445 // Allow commuted 'hadd' ops.
21446 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
21447 unsigned HOpcode;
21448 switch (Op.getOpcode()) {
21449 case ISD::ADD: HOpcode = X86ISD::HADD; break;
21450 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
21451 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
21452 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
21453 default:
21454 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
21456 unsigned LExtIndex = LHS.getConstantOperandVal(1);
21457 unsigned RExtIndex = RHS.getConstantOperandVal(1);
21458 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
21459 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
21460 std::swap(LExtIndex, RExtIndex);
21462 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
21463 return Op;
21465 SDValue X = LHS.getOperand(0);
21466 EVT VecVT = X.getValueType();
21467 unsigned BitWidth = VecVT.getSizeInBits();
21468 unsigned NumLanes = BitWidth / 128;
21469 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
21470 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
21471 "Not expecting illegal vector widths here");
21473 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21474 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21475 SDLoc DL(Op);
21476 if (BitWidth == 256 || BitWidth == 512) {
21477 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
21478 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
21479 LExtIndex %= NumEltsPerLane;
21482 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21483 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21484 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21485 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
21486 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
21487 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
21488 DAG.getIntPtrConstant(LExtIndex / 2, DL));
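// As a concrete sketch, assuming an SSE3 target and the one-use/extract
// constraints checked above, a source-level pairwise sum such as
//   float sum01(__m128 v) { return v[0] + v[1]; }
// is matched here: both operands extract from the same vector at indices 0
// and 1, so the scalar FADD becomes
//   (f32 extract_vector_elt (X86ISD::FHADD v, v), 0)
// i.e. a single HADDPS feeding the scalar result.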
21491 /// Depending on uarch and/or optimizing for size, we might prefer to use a
21492 /// vector operation in place of the typical scalar operation.
21493 SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21494 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
21495 "Only expecting float/double");
21496 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
21499 /// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21500 /// This mode isn't supported in hardware on X86. But as long as we aren't
21501 /// compiling with trapping math, we can emulate this with
21502 /// trunc(X + copysign(nextafter(0.5, 0.0), X)).
21503 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
21504 SDValue N0 = Op.getOperand(0);
21505 SDLoc dl(Op);
21506 MVT VT = Op.getSimpleValueType();
21508 // N0 += copysign(nextafter(0.5, 0.0), N0)
21509 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21510 bool Ignored;
21511 APFloat Point5Pred = APFloat(0.5f);
21512 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
21513 Point5Pred.next(/*nextDown*/true);
21515 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
21516 DAG.getConstantFP(Point5Pred, dl, VT), N0);
21517 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
21519 // Truncate the result to remove fraction.
21520 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
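// Why nextafter(0.5, 0.0) rather than plain 0.5: adding exactly 0.5 can round
// up inside the FADD itself. A minimal f32 example under default rounding:
//   x = 0x1.fffffep-2          (largest float below 0.5)
//   x + 0.5f                   rounds to 1.0f, so trunc would give 1.0 (wrong)
//   x + 0x1.fffffep-2          gives 0x1.fffffep-1, trunc gives 0.0 (correct)
// so using the predecessor of 0.5 keeps round-half-away-from-zero semantics
// for values just below the halfway point.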
21523 /// The only differences between FABS and FNEG are the mask and the logic op.
21524 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
21525 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
21526 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
21527 "Wrong opcode for lowering FABS or FNEG.");
21529 bool IsFABS = (Op.getOpcode() == ISD::FABS);
21531 // If this is a FABS and it has an FNEG user, bail out to fold the combination
21532 // into an FNABS. We'll lower the FABS after that if it is still in use.
21533 if (IsFABS)
21534 for (SDNode *User : Op->uses())
21535 if (User->getOpcode() == ISD::FNEG)
21536 return Op;
21538 SDLoc dl(Op);
21539 MVT VT = Op.getSimpleValueType();
21541 bool IsF128 = (VT == MVT::f128);
21542 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
21543 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21544 "Unexpected type in LowerFABSorFNEG");
21546 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
21547 // decide if we should generate a 16-byte constant mask when we only need 4 or
21548 // 8 bytes for the scalar case.
21550 // There are no scalar bitwise logical SSE/AVX instructions, so we
21551 // generate a 16-byte vector constant and logic op even for the scalar case.
21552 // Using a 16-byte mask allows folding the load of the mask with
21553   // the logic op, so it can save ~4 bytes of code size.
21554 bool IsFakeVector = !VT.isVector() && !IsF128;
21555 MVT LogicVT = VT;
21556 if (IsFakeVector)
21557 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21558 : (VT == MVT::f32) ? MVT::v4f32
21559 : MVT::v8f16;
21561 unsigned EltBits = VT.getScalarSizeInBits();
21562 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
21563 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
21564 APInt::getSignMask(EltBits);
21565 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21566 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
21568 SDValue Op0 = Op.getOperand(0);
21569 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
21570 unsigned LogicOp = IsFABS ? X86ISD::FAND :
21571 IsFNABS ? X86ISD::FOR :
21572 X86ISD::FXOR;
21573 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
21575 if (VT.isVector() || IsF128)
21576 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21578 // For the scalar case extend to a 128-bit vector, perform the logic op,
21579 // and extract the scalar result back out.
21580 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
21581 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21582 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
21583 DAG.getIntPtrConstant(0, dl));
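// Concretely, the masks built above for an f32 element are (splatted across
// the 128-bit logic type in the fake-vector case):
//   FABS : FAND with 0x7FFFFFFF   (clear the sign bit)
//   FNEG : FXOR with 0x80000000   (flip the sign bit)
//   FNABS: FOR  with 0x80000000   (set the sign bit; folds FNEG(FABS(x)))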
21586 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
21587 SDValue Mag = Op.getOperand(0);
21588 SDValue Sign = Op.getOperand(1);
21589 SDLoc dl(Op);
21591 // If the sign operand is smaller, extend it first.
21592 MVT VT = Op.getSimpleValueType();
21593 if (Sign.getSimpleValueType().bitsLT(VT))
21594 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
21596 // And if it is bigger, shrink it first.
21597 if (Sign.getSimpleValueType().bitsGT(VT))
21598 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
21599 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
21601 // At this point the operands and the result should have the same
21602 // type, and that won't be f80 since that is not custom lowered.
21603 bool IsF128 = (VT == MVT::f128);
21604 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
21605 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21606 "Unexpected type in LowerFCOPYSIGN");
21608 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21610 // Perform all scalar logic operations as 16-byte vectors because there are no
21611 // scalar FP logic instructions in SSE.
21612 // TODO: This isn't necessary. If we used scalar types, we might avoid some
21613 // unnecessary splats, but we might miss load folding opportunities. Should
21614 // this decision be based on OptimizeForSize?
21615 bool IsFakeVector = !VT.isVector() && !IsF128;
21616 MVT LogicVT = VT;
21617 if (IsFakeVector)
21618 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21619 : (VT == MVT::f32) ? MVT::v4f32
21620 : MVT::v8f16;
21622 // The mask constants are automatically splatted for vector types.
21623 unsigned EltSizeInBits = VT.getScalarSizeInBits();
21624 SDValue SignMask = DAG.getConstantFP(
21625 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
21626 SDValue MagMask = DAG.getConstantFP(
21627 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
21629 // First, clear all bits but the sign bit from the second operand (sign).
21630 if (IsFakeVector)
21631 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
21632 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
21634 // Next, clear the sign bit from the first operand (magnitude).
21635 // TODO: If we had general constant folding for FP logic ops, this check
21636 // wouldn't be necessary.
21637 SDValue MagBits;
21638 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
21639 APFloat APF = Op0CN->getValueAPF();
21640 APF.clearSign();
21641 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
21642 } else {
21643 // If the magnitude operand wasn't a constant, we need to AND out the sign.
21644 if (IsFakeVector)
21645 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
21646 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
21649 // OR the magnitude value with the sign bit.
21650 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
21651 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
21652 DAG.getIntPtrConstant(0, dl));
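// In bit terms, for a scalar f32 the sequence above computes
//   result = (Mag & 0x7FFFFFFF) | (Sign & 0x80000000)
// e.g. copysign(3.0f, -0.0f):
//   0x40400000 & 0x7FFFFFFF = 0x40400000
//   0x80000000 & 0x80000000 = 0x80000000
//   OR                      = 0xC0400000 = -3.0f
// with the constant-magnitude case folded up front via APFloat::clearSign().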
21655 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
21656 SDValue N0 = Op.getOperand(0);
21657 SDLoc dl(Op);
21658 MVT VT = Op.getSimpleValueType();
21660 MVT OpVT = N0.getSimpleValueType();
21661 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
21662 "Unexpected type for FGETSIGN");
21664 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
21665 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
21666 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
21667 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
21668 Res = DAG.getZExtOrTrunc(Res, dl, VT);
21669 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
21670 return Res;
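// Resulting pattern for f32 (f64 is analogous with v2f64):
//   (and (X86ISD::MOVMSK (v4f32 scalar_to_vector %x)), 1)
// MOVMSK packs the sign bit of each lane into the low bits of a GPR, so
// masking with 1 leaves just the sign of lane 0, e.g. -1.5f -> 1, +1.5f -> 0.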
21673 /// Helper for attempting to create a X86ISD::BT node.
21674 static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
21675 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
21676 // instruction. Since the shift amount is in-range-or-undefined, we know
21677 // that doing a bittest on the i32 value is ok. We extend to i32 because
21678 // the encoding for the i16 version is larger than the i32 version.
21679   // Also promote i16 to i32 for performance / code size reasons.
21680 if (Src.getValueType().getScalarSizeInBits() < 32)
21681 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
21683 // No legal type found, give up.
21684 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
21685 return SDValue();
21687 // See if we can use the 32-bit instruction instead of the 64-bit one for a
21688   // shorter encoding. Since the former takes BitNo modulo 32 and the latter
21689   // takes it modulo 64, this is only valid if the 5th bit of BitNo is known
21690   // to be zero.
21691 if (Src.getValueType() == MVT::i64 &&
21692 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
21693 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
21695 // If the operand types disagree, extend the shift amount to match. Since
21696 // BT ignores high bits (like shifts) we can use anyextend.
21697 if (Src.getValueType() != BitNo.getValueType()) {
21698 // Peek through a mask/modulo operation.
21699 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
21700 // we probably need a better IsDesirableToPromoteOp to handle this as well.
21701 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
21702 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
21703 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
21704 BitNo.getOperand(0)),
21705 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
21706 BitNo.getOperand(1)));
21707 else
21708 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
21711 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
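// For reference, BT with a register bit index reduces the index modulo the
// operand width, so the node built above behaves like:
//   CF = (Src >> (BitNo % 32)) & 1     for the 32-bit form
//   CF = (Src >> (BitNo % 64)) & 1     for the 64-bit form
// which is why the i64 -> i32 shrink requires bit 5 of BitNo to be known
// zero, and why any-extending the bit index is safe.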
21714 /// Helper for creating a X86ISD::SETCC node.
21715 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
21716 SelectionDAG &DAG) {
21717 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
21718 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
21721 /// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
21722 /// recognizable memcmp expansion.
21723 static bool isOrXorXorTree(SDValue X, bool Root = true) {
21724 if (X.getOpcode() == ISD::OR)
21725 return isOrXorXorTree(X.getOperand(0), false) &&
21726 isOrXorXorTree(X.getOperand(1), false);
21727 if (Root)
21728 return false;
21729 return X.getOpcode() == ISD::XOR;
21732 /// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
21733 /// expansion.
21734 template <typename F>
21735 static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
21736 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
21737 SDValue Op0 = X.getOperand(0);
21738 SDValue Op1 = X.getOperand(1);
21739 if (X.getOpcode() == ISD::OR) {
21740 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
21741 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
21742 if (VecVT != CmpVT)
21743 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
21744 if (HasPT)
21745 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
21746 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
21748 if (X.getOpcode() == ISD::XOR) {
21749 SDValue A = SToV(Op0);
21750 SDValue B = SToV(Op1);
21751 if (VecVT != CmpVT)
21752 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
21753 if (HasPT)
21754 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
21755 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
21757 llvm_unreachable("Impossible");
21760 /// Try to map a 128-bit or larger integer comparison to vector instructions
21761 /// before type legalization splits it up into chunks.
21762 static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
21763 ISD::CondCode CC,
21764 const SDLoc &DL,
21765 SelectionDAG &DAG,
21766 const X86Subtarget &Subtarget) {
21767 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
21769 // We're looking for an oversized integer equality comparison.
21770 EVT OpVT = X.getValueType();
21771 unsigned OpSize = OpVT.getSizeInBits();
21772 if (!OpVT.isScalarInteger() || OpSize < 128)
21773 return SDValue();
21775 // Ignore a comparison with zero because that gets special treatment in
21776 // EmitTest(). But make an exception for the special case of a pair of
21777 // logically-combined vector-sized operands compared to zero. This pattern may
21778 // be generated by the memcmp expansion pass with oversized integer compares
21779 // (see PR33325).
21780 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
21781 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
21782 return SDValue();
21784 // Don't perform this combine if constructing the vector will be expensive.
21785 auto IsVectorBitCastCheap = [](SDValue X) {
21786 X = peekThroughBitcasts(X);
21787 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
21788 X.getOpcode() == ISD::LOAD;
21790 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
21791 !IsOrXorXorTreeCCZero)
21792 return SDValue();
21794 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
21795 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
21796 // Otherwise use PCMPEQ (plus AND) and mask testing.
21797 bool NoImplicitFloatOps =
21798 DAG.getMachineFunction().getFunction().hasFnAttribute(
21799 Attribute::NoImplicitFloat);
21800 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
21801 ((OpSize == 128 && Subtarget.hasSSE2()) ||
21802 (OpSize == 256 && Subtarget.hasAVX()) ||
21803 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
21804 bool HasPT = Subtarget.hasSSE41();
21806     // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and widened
21807 // vector registers are essentially free. (Technically, widening registers
21808 // prevents load folding, but the tradeoff is worth it.)
21809 bool PreferKOT = Subtarget.preferMaskRegisters();
21810 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
21812 EVT VecVT = MVT::v16i8;
21813 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
21814 if (OpSize == 256) {
21815 VecVT = MVT::v32i8;
21816 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
21818 EVT CastVT = VecVT;
21819 bool NeedsAVX512FCast = false;
21820 if (OpSize == 512 || NeedZExt) {
21821 if (Subtarget.hasBWI()) {
21822 VecVT = MVT::v64i8;
21823 CmpVT = MVT::v64i1;
21824 if (OpSize == 512)
21825 CastVT = VecVT;
21826 } else {
21827 VecVT = MVT::v16i32;
21828 CmpVT = MVT::v16i1;
21829 CastVT = OpSize == 512 ? VecVT
21830 : OpSize == 256 ? MVT::v8i32
21831 : MVT::v4i32;
21832 NeedsAVX512FCast = true;
21836 auto ScalarToVector = [&](SDValue X) -> SDValue {
21837 bool TmpZext = false;
21838 EVT TmpCastVT = CastVT;
21839 if (X.getOpcode() == ISD::ZERO_EXTEND) {
21840 SDValue OrigX = X.getOperand(0);
21841 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
21842 if (OrigSize < OpSize) {
21843 if (OrigSize == 128) {
21844 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
21845 X = OrigX;
21846 TmpZext = true;
21847 } else if (OrigSize == 256) {
21848 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
21849 X = OrigX;
21850 TmpZext = true;
21854 X = DAG.getBitcast(TmpCastVT, X);
21855 if (!NeedZExt && !TmpZext)
21856 return X;
21857 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
21858 DAG.getConstant(0, DL, VecVT), X,
21859 DAG.getVectorIdxConstant(0, DL));
21862 SDValue Cmp;
21863 if (IsOrXorXorTreeCCZero) {
21864 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
21865 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
21866 // Use 2 vector equality compares and 'and' the results before doing a
21867 // MOVMSK.
21868 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
21869 } else {
21870 SDValue VecX = ScalarToVector(X);
21871 SDValue VecY = ScalarToVector(Y);
21872 if (VecVT != CmpVT) {
21873 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
21874 } else if (HasPT) {
21875 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
21876 } else {
21877 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
21880 // AVX512 should emit a setcc that will lower to kortest.
21881 if (VecVT != CmpVT) {
21882 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
21883 : CmpVT == MVT::v32i1 ? MVT::i32
21884 : MVT::i16;
21885 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
21886 DAG.getConstant(0, DL, KRegVT), CC);
21888 if (HasPT) {
21889 SDValue BCCmp =
21890 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
21891 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
21892 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
21893 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
21894 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
21896 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
21897 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
21898 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
21899 assert(Cmp.getValueType() == MVT::v16i8 &&
21900 "Non 128-bit vector on pre-SSE41 target");
21901 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
21902 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
21903 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
21906 return SDValue();
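// Rough end-to-end example of what this combine targets, assuming SSE4.1 and
// a memcmp already expanded to an oversized integer compare:
//   %a = load i128, ptr %p
//   %b = load i128, ptr %q
//   %c = icmp eq i128 %a, %b
// becomes a 128-bit XOR of the two loaded values followed by PTEST and a
// SETE/SETNE of the flags, rather than a pair of 64-bit scalar compares.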
21909 /// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
21910 /// style scalarized (associative) reduction patterns. Partial reductions
21911 /// are supported when the pointer SrcMask is non-null.
21912 /// TODO - move this to SelectionDAG?
21913 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
21914 SmallVectorImpl<SDValue> &SrcOps,
21915 SmallVectorImpl<APInt> *SrcMask = nullptr) {
21916 SmallVector<SDValue, 8> Opnds;
21917 DenseMap<SDValue, APInt> SrcOpMap;
21918 EVT VT = MVT::Other;
21920   // Recognize a special case where a vector is cast into a wide integer to
21921   // test all 0s.
21922 assert(Op.getOpcode() == unsigned(BinOp) &&
21923 "Unexpected bit reduction opcode");
21924 Opnds.push_back(Op.getOperand(0));
21925 Opnds.push_back(Op.getOperand(1));
21927 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
21928 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
21929 // BFS traverse all BinOp operands.
21930 if (I->getOpcode() == unsigned(BinOp)) {
21931 Opnds.push_back(I->getOperand(0));
21932 Opnds.push_back(I->getOperand(1));
21933 // Re-evaluate the number of nodes to be traversed.
21934 e += 2; // 2 more nodes (LHS and RHS) are pushed.
21935 continue;
21938     // Quit if this is not an EXTRACT_VECTOR_ELT.
21939 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
21940 return false;
21942 // Quit if without a constant index.
21943 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
21944 if (!Idx)
21945 return false;
21947 SDValue Src = I->getOperand(0);
21948 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
21949 if (M == SrcOpMap.end()) {
21950 VT = Src.getValueType();
21951 // Quit if not the same type.
21952 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
21953 return false;
21954 unsigned NumElts = VT.getVectorNumElements();
21955 APInt EltCount = APInt::getZero(NumElts);
21956 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
21957 SrcOps.push_back(Src);
21960 // Quit if element already used.
21961 unsigned CIdx = Idx->getZExtValue();
21962 if (M->second[CIdx])
21963 return false;
21964 M->second.setBit(CIdx);
21967 if (SrcMask) {
21968 // Collect the source partial masks.
21969 for (SDValue &SrcOp : SrcOps)
21970 SrcMask->push_back(SrcOpMap[SrcOp]);
21971 } else {
21972 // Quit if not all elements are used.
21973 for (const auto &I : SrcOpMap)
21974 if (!I.second.isAllOnes())
21975 return false;
21978 return true;
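// Example of a tree this matches (BinOp = ISD::OR, single source X):
//   or (extractelt X, 0),
//      (or (extractelt X, 1), (or (extractelt X, 2), (extractelt X, 3)))
// SrcOps ends up as {X}. With a null SrcMask this only succeeds when every
// element of X is used exactly once; otherwise the per-source element masks
// are returned so callers can handle partial reductions.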
21981 // Helper function for comparing all bits of two vectors.
21982 static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
21983 ISD::CondCode CC, const APInt &OriginalMask,
21984 const X86Subtarget &Subtarget,
21985 SelectionDAG &DAG, X86::CondCode &X86CC) {
21986 EVT VT = LHS.getValueType();
21987 unsigned ScalarSize = VT.getScalarSizeInBits();
21988 if (OriginalMask.getBitWidth() != ScalarSize) {
21989 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
21990 return SDValue();
21993   // Quit if not convertible to a legal scalar or 128/256-bit vector.
21994 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
21995 return SDValue();
21997 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
21998 if (VT.isFloatingPoint())
21999 return SDValue();
22001 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22002 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22004 APInt Mask = OriginalMask;
22006 auto MaskBits = [&](SDValue Src) {
22007 if (Mask.isAllOnes())
22008 return Src;
22009 EVT SrcVT = Src.getValueType();
22010 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22011 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22014 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22015 if (VT.getSizeInBits() < 128) {
22016 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22017 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
22018 if (IntVT != MVT::i64)
22019 return SDValue();
22020 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
22021 MVT::i32, MVT::i32);
22022 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
22023 MVT::i32, MVT::i32);
22024 SDValue Lo =
22025 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
22026 SDValue Hi =
22027 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
22028 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22029 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
22030 DAG.getConstant(0, DL, MVT::i32));
22032 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22033 DAG.getBitcast(IntVT, MaskBits(LHS)),
22034 DAG.getBitcast(IntVT, MaskBits(RHS)));
22037 // Without PTEST, a masked v2i64 or-reduction is not faster than
22038 // scalarization.
22039 bool UseKORTEST = Subtarget.useAVX512Regs();
22040 bool UsePTEST = Subtarget.hasSSE41();
22041 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
22042 return SDValue();
22044 // Split down to 128/256/512-bit vector.
22045 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
22047 // If the input vector has vector elements wider than the target test size,
22048 // then cast to <X x i64> so it will safely split.
22049 if (ScalarSize > TestSize) {
22050 if (!Mask.isAllOnes())
22051 return SDValue();
22052 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
22053 LHS = DAG.getBitcast(VT, LHS);
22054 RHS = DAG.getBitcast(VT, RHS);
22055 Mask = APInt::getAllOnes(64);
22058 if (VT.getSizeInBits() > TestSize) {
22059 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
22060 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
22061       // If this is ICMP(AND(LHS,MASK),MASK), reduce using AND splits.
22062 while (VT.getSizeInBits() > TestSize) {
22063 auto Split = DAG.SplitVector(LHS, DL);
22064 VT = Split.first.getValueType();
22065 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22067 RHS = DAG.getAllOnesConstant(DL, VT);
22068 } else if (!UsePTEST && !KnownRHS.isZero()) {
22069 // MOVMSK Special Case:
22070 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
22071 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
22072 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
22073 LHS = DAG.getBitcast(VT, MaskBits(LHS));
22074 RHS = DAG.getBitcast(VT, MaskBits(RHS));
22075 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
22076 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
22077 V = DAG.getSExtOrTrunc(V, DL, VT);
22078 while (VT.getSizeInBits() > TestSize) {
22079 auto Split = DAG.SplitVector(V, DL);
22080 VT = Split.first.getValueType();
22081 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22083 V = DAG.getNOT(DL, V, VT);
22084 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22085 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22086 DAG.getConstant(0, DL, MVT::i32));
22087 } else {
22088 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
22089 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
22090 while (VT.getSizeInBits() > TestSize) {
22091 auto Split = DAG.SplitVector(V, DL);
22092 VT = Split.first.getValueType();
22093 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22095 LHS = V;
22096 RHS = DAG.getConstant(0, DL, VT);
22100 if (UseKORTEST && VT.is512BitVector()) {
22101 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
22102 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
22103 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22104 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22105 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
22106 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
22109 if (UsePTEST) {
22110 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
22111 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22112 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22113 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
22114 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22117 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
22118 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
22119 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
22120 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
22121 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
22122 V = DAG.getNOT(DL, V, MaskVT);
22123 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22124 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22125 DAG.getConstant(0, DL, MVT::i32));
22128 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
22129 // to CMP(MOVMSK(PCMPEQB(X,Y))).
22130 static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
22131 ISD::CondCode CC, const SDLoc &DL,
22132 const X86Subtarget &Subtarget,
22133 SelectionDAG &DAG,
22134 X86::CondCode &X86CC) {
22135 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22137 bool CmpNull = isNullConstant(RHS);
22138 bool CmpAllOnes = isAllOnesConstant(RHS);
22139 if (!CmpNull && !CmpAllOnes)
22140 return SDValue();
22142 SDValue Op = LHS;
22143 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22144 return SDValue();
22146 // Check whether we're masking/truncating an OR-reduction result, in which
22147 // case track the masked bits.
22148 // TODO: Add CmpAllOnes support.
22149 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
22150 if (CmpNull) {
22151 switch (Op.getOpcode()) {
22152 case ISD::TRUNCATE: {
22153 SDValue Src = Op.getOperand(0);
22154 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22155 Op.getScalarValueSizeInBits());
22156 Op = Src;
22157 break;
22159 case ISD::AND: {
22160 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22161 Mask = Cst->getAPIntValue();
22162 Op = Op.getOperand(0);
22164 break;
22169 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
22171 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
22172 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
22173 SmallVector<SDValue, 8> VecIns;
22174 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
22175 EVT VT = VecIns[0].getValueType();
22176 assert(llvm::all_of(VecIns,
22177 [VT](SDValue V) { return VT == V.getValueType(); }) &&
22178 "Reduction source vector mismatch");
22180 // Quit if not splittable to scalar/128/256/512-bit vector.
22181 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22182 return SDValue();
22184 // If more than one full vector is evaluated, AND/OR them first before
22185 // PTEST.
22186 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22187 Slot += 2, e += 1) {
22188 // Each iteration will AND/OR 2 nodes and append the result until there is
22189 // only 1 node left, i.e. the final value of all vectors.
22190 SDValue LHS = VecIns[Slot];
22191 SDValue RHS = VecIns[Slot + 1];
22192 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
22195 return LowerVectorAllEqual(DL, VecIns.back(),
22196 CmpNull ? DAG.getConstant(0, DL, VT)
22197 : DAG.getAllOnesConstant(DL, VT),
22198 CC, Mask, Subtarget, DAG, X86CC);
22201 // Match icmp(reduce_or(X),0) anyof reduction patterns.
22202 // Match icmp(reduce_and(X),-1) allof reduction patterns.
22203 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22204 ISD::NodeType BinOp;
22205 if (SDValue Match =
22206 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
22207 EVT MatchVT = Match.getValueType();
22208 return LowerVectorAllEqual(DL, Match,
22209 CmpNull ? DAG.getConstant(0, DL, MatchVT)
22210 : DAG.getAllOnesConstant(DL, MatchVT),
22211 CC, Mask, Subtarget, DAG, X86CC);
22215 if (Mask.isAllOnes()) {
22216 assert(!Op.getValueType().isVector() &&
22217 "Illegal vector type for reduction pattern");
22218 SDValue Src = peekThroughBitcasts(Op);
22219 if (Src.getValueType().isFixedLengthVector() &&
22220 Src.getValueType().getScalarType() == MVT::i1) {
22221 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
22222 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
22223 if (Src.getOpcode() == ISD::SETCC) {
22224 SDValue LHS = Src.getOperand(0);
22225 SDValue RHS = Src.getOperand(1);
22226 EVT LHSVT = LHS.getValueType();
22227 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
22228 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
22229 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
22230 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
22231 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
22232 X86CC);
22235 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
22236 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
22237 // Peek through truncation, mask the LSB and compare against zero/LSB.
22238 if (Src.getOpcode() == ISD::TRUNCATE) {
22239 SDValue Inner = Src.getOperand(0);
22240 EVT InnerVT = Inner.getValueType();
22241 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
22242 unsigned BW = InnerVT.getScalarSizeInBits();
22243 APInt SrcMask = APInt(BW, 1);
22244 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
22245 return LowerVectorAllEqual(DL, Inner,
22246 DAG.getConstant(Cmp, DL, InnerVT), CC,
22247 SrcMask, Subtarget, DAG, X86CC);
22253 return SDValue();
22256 /// Return true if \c Op has a use that doesn't just read flags.
22257 static bool hasNonFlagsUse(SDValue Op) {
22258 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22259 ++UI) {
22260 SDNode *User = *UI;
22261 unsigned UOpNo = UI.getOperandNo();
22262 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22263       // Look past the truncate.
22264 UOpNo = User->use_begin().getOperandNo();
22265 User = *User->use_begin();
22268 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22269 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22270 return true;
22272 return false;
22275 // Transform to an x86-specific ALU node with flags if there is a chance of
22276 // using an RMW op or only the flags are used. Otherwise, leave
22277 // the node alone and emit a 'cmp' or 'test' instruction.
22278 static bool isProfitableToUseFlagOp(SDValue Op) {
22279 for (SDNode *U : Op->uses())
22280 if (U->getOpcode() != ISD::CopyToReg &&
22281 U->getOpcode() != ISD::SETCC &&
22282 U->getOpcode() != ISD::STORE)
22283 return false;
22285 return true;
22288 /// Emit nodes that will be selected as "test Op0,Op0", or something
22289 /// equivalent.
22290 static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22291 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22292 // CF and OF aren't always set the way we want. Determine which
22293 // of these we need.
22294 bool NeedCF = false;
22295 bool NeedOF = false;
22296 switch (X86CC) {
22297 default: break;
22298 case X86::COND_A: case X86::COND_AE:
22299 case X86::COND_B: case X86::COND_BE:
22300 NeedCF = true;
22301 break;
22302 case X86::COND_G: case X86::COND_GE:
22303 case X86::COND_L: case X86::COND_LE:
22304 case X86::COND_O: case X86::COND_NO: {
22305     // Check if we really need to set the Overflow flag.
22306     // If NoSignedWrap is present, setting it is not
22307     // actually needed.
22308 switch (Op->getOpcode()) {
22309 case ISD::ADD:
22310 case ISD::SUB:
22311 case ISD::MUL:
22312 case ISD::SHL:
22313 if (Op.getNode()->getFlags().hasNoSignedWrap())
22314 break;
22315 [[fallthrough]];
22316 default:
22317 NeedOF = true;
22318 break;
22320 break;
22323 // See if we can use the EFLAGS value from the operand instead of
22324 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22325 // we prove that the arithmetic won't overflow, we can't use OF or CF.
22326 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22327 // Emit a CMP with 0, which is the TEST pattern.
22328 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22329 DAG.getConstant(0, dl, Op.getValueType()));
22331 unsigned Opcode = 0;
22332 unsigned NumOperands = 0;
22334 SDValue ArithOp = Op;
22336   // NOTE: In the code below we use ArithOp to hold the arithmetic operation,
22337   // which may be the result of a cast. We use the variable 'Op', the
22338   // non-cast value, when we check for possible users.
22339 switch (ArithOp.getOpcode()) {
22340 case ISD::AND:
22341 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22342 // because a TEST instruction will be better.
22343 if (!hasNonFlagsUse(Op))
22344 break;
22346 [[fallthrough]];
22347 case ISD::ADD:
22348 case ISD::SUB:
22349 case ISD::OR:
22350 case ISD::XOR:
22351 if (!isProfitableToUseFlagOp(Op))
22352 break;
22354 // Otherwise use a regular EFLAGS-setting instruction.
22355 switch (ArithOp.getOpcode()) {
22356 default: llvm_unreachable("unexpected operator!");
22357 case ISD::ADD: Opcode = X86ISD::ADD; break;
22358 case ISD::SUB: Opcode = X86ISD::SUB; break;
22359 case ISD::XOR: Opcode = X86ISD::XOR; break;
22360 case ISD::AND: Opcode = X86ISD::AND; break;
22361 case ISD::OR: Opcode = X86ISD::OR; break;
22364 NumOperands = 2;
22365 break;
22366 case X86ISD::ADD:
22367 case X86ISD::SUB:
22368 case X86ISD::OR:
22369 case X86ISD::XOR:
22370 case X86ISD::AND:
22371 return SDValue(Op.getNode(), 1);
22372 case ISD::SSUBO:
22373 case ISD::USUBO: {
22374     // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
22375 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22376 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22377 Op->getOperand(1)).getValue(1);
22379 default:
22380 break;
22383 if (Opcode == 0) {
22384 // Emit a CMP with 0, which is the TEST pattern.
22385 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22386 DAG.getConstant(0, dl, Op.getValueType()));
22388 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22389 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22391 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22392 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22393 return SDValue(New.getNode(), 1);
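// Minimal sketch of the payoff: for
//   %s = sub i32 %a, %b
//   %c = icmp eq i32 %s, 0
// the compare reaches here as a CMP of %s against 0; converting the SUB into
// X86ISD::SUB and returning its flag result lets the SETCC/BRCOND read ZF
// directly from the subtraction, so no separate TEST/CMP is emitted.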
22396 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
22397 /// equivalent.
22398 static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22399 const SDLoc &dl, SelectionDAG &DAG,
22400 const X86Subtarget &Subtarget) {
22401 if (isNullConstant(Op1))
22402 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22404 EVT CmpVT = Op0.getValueType();
22406 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
22407 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
22409   // Only promote the compare up to i32 if it is a 16-bit operation
22410   // with an immediate; 16-bit immediates are to be avoided.
22411 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
22412 !DAG.getMachineFunction().getFunction().hasMinSize()) {
22413 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
22414 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
22415     // Don't do this if the immediate can fit in 8 bits.
22416 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22417 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22418 unsigned ExtendOp =
22419 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22420 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
22421 // For equality comparisons try to use SIGN_EXTEND if the input was
22422       // truncated from something with enough sign bits.
22423 if (Op0.getOpcode() == ISD::TRUNCATE) {
22424 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
22425 ExtendOp = ISD::SIGN_EXTEND;
22426 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
22427 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
22428 ExtendOp = ISD::SIGN_EXTEND;
22432 CmpVT = MVT::i32;
22433 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
22434 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
22438 // Try to shrink i64 compares if the input has enough zero bits.
22439 // FIXME: Do this for non-constant compares for constant on LHS?
22440 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
22441 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
22442 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
22443 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
22444 CmpVT = MVT::i32;
22445 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
22446 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
22449 // 0-x == y --> x+y == 0
22450 // 0-x != y --> x+y != 0
22451 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
22452 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22453 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22454 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
22455 return Add.getValue(1);
22458 // x == 0-y --> x+y == 0
22459 // x != 0-y --> x+y != 0
22460 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
22461 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22462 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22463 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
22464 return Add.getValue(1);
22467 // Use SUB instead of CMP to enable CSE between SUB and CMP.
22468 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22469 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
22470 return Sub.getValue(1);
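// Note on the i16 promotion above: a compare with a 16-bit immediate needs
// the 0x66 operand-size prefix, and that length-changing prefix can stall
// the pre-decoder on a number of Intel cores. Roughly:
//   cmp $0x1234, %ax       ; 66 3D 34 12        (LCP penalty)
//   cmp $0x1234, %eax      ; 3D 34 12 00 00     (no penalty, after widening)
// so the operands are widened to i32 unless the immediate already fits in a
// signed 8-bit field, the target is Atom, or we are optimizing for size.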
22473 bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
22474 EVT VT) const {
22475 return !VT.isVector() || Cond != ISD::CondCode::SETEQ;
22478 bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
22479 SDNode *N, SDValue, SDValue IntPow2) const {
22480 if (N->getOpcode() == ISD::FDIV)
22481 return true;
22483 EVT FPVT = N->getValueType(0);
22484 EVT IntVT = IntPow2.getValueType();
22486 // This indicates a non-free bitcast.
22487 // TODO: This is probably overly conservative as we will need to scale the
22488 // integer vector anyways for the int->fp cast.
22489 if (FPVT.isVector() &&
22490 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
22491 return false;
22493 return true;
22496 /// Check if replacement of SQRT with RSQRT should be disabled.
22497 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22498 EVT VT = Op.getValueType();
22500 // We don't need to replace SQRT with RSQRT for half type.
22501 if (VT.getScalarType() == MVT::f16)
22502 return true;
22504 // We never want to use both SQRT and RSQRT instructions for the same input.
22505 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22506 return false;
22508 if (VT.isVector())
22509 return Subtarget.hasFastVectorFSQRT();
22510 return Subtarget.hasFastScalarFSQRT();
22513 /// The minimum architected relative accuracy is 2^-12. We need one
22514 /// Newton-Raphson step to have a good float result (24 bits of precision).
22515 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22516 SelectionDAG &DAG, int Enabled,
22517 int &RefinementSteps,
22518 bool &UseOneConstNR,
22519 bool Reciprocal) const {
22520 SDLoc DL(Op);
22521 EVT VT = Op.getValueType();
22523 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22524 // It is likely not profitable to do this for f64 because a double-precision
22525 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22526 // instructions: convert to single, rsqrtss, convert back to double, refine
22527 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22528 // along with FMA, this could be a throughput win.
22529 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22530 // after legalize types.
22531 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22532 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22533 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22534 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22535 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22536 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22537 RefinementSteps = 1;
22539 UseOneConstNR = false;
22540 // There is no FSQRT for 512-bits, but there is RSQRT14.
22541 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22542 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
22543 if (RefinementSteps == 0 && !Reciprocal)
22544 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
22545 return Estimate;
22548 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22549 Subtarget.hasFP16()) {
22550 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
22551 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22552 RefinementSteps = 0;
22554 if (VT == MVT::f16) {
22555 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22556 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22557 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22558 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
22559 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22562 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
22564 return SDValue();
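// For reference, the refinement applied to this estimate by the generic
// estimate-building code is one Newton-Raphson step on the reciprocal square
// root per requested iteration, roughly:
//   est' = est * (1.5 - 0.5 * x * est * est)
// which lifts the ~12-bit RSQRT/RSQRT14 estimate to near full f32 precision.
// The square root itself is then formed as x * (refined estimate), which is
// also why the RefinementSteps == 0 case above multiplies by Op directly.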
22567 /// The minimum architected relative accuracy is 2^-12. We need one
22568 /// Newton-Raphson step to have a good float result (24 bits of precision).
22569 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22570 int Enabled,
22571 int &RefinementSteps) const {
22572 SDLoc DL(Op);
22573 EVT VT = Op.getValueType();
22575 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22576 // It is likely not profitable to do this for f64 because a double-precision
22577 // reciprocal estimate with refinement on x86 prior to FMA requires
22578 // 15 instructions: convert to single, rcpss, convert back to double, refine
22579 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22580 // along with FMA, this could be a throughput win.
22582 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22583 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22584 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22585 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22586 // Enable estimate codegen with 1 refinement step for vector division.
22587 // Scalar division estimates are disabled because they break too much
22588 // real-world code. These defaults are intended to match GCC behavior.
22589 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22590 return SDValue();
22592 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22593 RefinementSteps = 1;
22595 // There is no FSQRT for 512-bits, but there is RCP14.
22596 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22597 return DAG.getNode(Opcode, DL, VT, Op);
22600 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22601 Subtarget.hasFP16()) {
22602 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22603 RefinementSteps = 0;
22605 if (VT == MVT::f16) {
22606 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22607 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22608 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22609 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
22610 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22613 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
22615 return SDValue();
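// Likewise, the generic refinement for a reciprocal estimate is one
// Newton-Raphson step per requested iteration, roughly:
//   est' = est + est * (1.0 - x * est)    (equivalently est * (2.0 - x * est))
// taking the ~12-bit RCP/RCP14 estimate to near full f32 precision.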
22618 /// If we have at least two divisions that use the same divisor, convert to
22619 /// multiplication by a reciprocal. This may need to be adjusted for a given
22620 /// CPU if a division's cost is not at least twice the cost of a multiplication.
22621 /// This is because we still need one division to calculate the reciprocal and
22622 /// then we need two multiplies by that reciprocal as replacements for the
22623 /// original divisions.
22624 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22625 return 2;
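// Worked example of this threshold (under the usual reciprocal fast-math
// conditions): given y = a / d and z = b / d, the combine rewrites them as
//   t = 1.0 / d;  y = a * t;  z = b * t;
// i.e. one division plus two multiplies replaces two divisions, which only
// pays off once at least two divisions share a divisor, hence returning 2.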
22628 SDValue
22629 X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
22630 SelectionDAG &DAG,
22631 SmallVectorImpl<SDNode *> &Created) const {
22632 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
22633 if (isIntDivCheap(N->getValueType(0), Attr))
22634 return SDValue(N,0); // Lower SDIV as SDIV
22636 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
22637 "Unexpected divisor!");
22639   // Only perform this transform if CMOV is supported; otherwise the select
22640   // below will become a branch.
22641 if (!Subtarget.canUseCMOV())
22642 return SDValue();
22644 // fold (sdiv X, pow2)
22645 EVT VT = N->getValueType(0);
22646 // FIXME: Support i8.
22647 if (VT != MVT::i16 && VT != MVT::i32 &&
22648 !(Subtarget.is64Bit() && VT == MVT::i64))
22649 return SDValue();
22651 // If the divisor is 2 or -2, the default expansion is better.
22652 if (Divisor == 2 ||
22653 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
22654 return SDValue();
22656 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
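// Sketch of the CMOV-based expansion requested above, for a positive
// power-of-2 divisor 2^k (e.g. k = 3, i.e. sdiv by 8):
//   t   = x + (2^k - 1)        ; the bias only matters when x is negative
//   sel = (x < 0) ? t : x      ; selected with a CMOV rather than a branch
//   res = sel >> k             ; arithmetic shift, rounds toward zero
// A negated power-of-2 divisor additionally negates the result. Divisors of
// +/-2 keep the cheaper default expansion, as checked above.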
22659 /// Result of 'and' is compared against zero. Change to a BT node if possible.
22660 /// Returns the BT node and the condition code needed to use it.
22661 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
22662 SelectionDAG &DAG, X86::CondCode &X86CC) {
22663 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
22664 SDValue Op0 = And.getOperand(0);
22665 SDValue Op1 = And.getOperand(1);
22666 if (Op0.getOpcode() == ISD::TRUNCATE)
22667 Op0 = Op0.getOperand(0);
22668 if (Op1.getOpcode() == ISD::TRUNCATE)
22669 Op1 = Op1.getOperand(0);
22671 SDValue Src, BitNo;
22672 if (Op1.getOpcode() == ISD::SHL)
22673 std::swap(Op0, Op1);
22674 if (Op0.getOpcode() == ISD::SHL) {
22675 if (isOneConstant(Op0.getOperand(0))) {
22676 // If we looked past a truncate, check that it's only truncating away
22677 // known zeros.
22678 unsigned BitWidth = Op0.getValueSizeInBits();
22679 unsigned AndBitWidth = And.getValueSizeInBits();
22680 if (BitWidth > AndBitWidth) {
22681 KnownBits Known = DAG.computeKnownBits(Op0);
22682 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
22683 return SDValue();
22685 Src = Op1;
22686 BitNo = Op0.getOperand(1);
22688 } else if (Op1.getOpcode() == ISD::Constant) {
22689 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
22690 uint64_t AndRHSVal = AndRHS->getZExtValue();
22691 SDValue AndLHS = Op0;
22693 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
22694 Src = AndLHS.getOperand(0);
22695 BitNo = AndLHS.getOperand(1);
22696 } else {
22697 // Use BT if the immediate can't be encoded in a TEST instruction or we
22698       // are optimizing for size and the immediate won't fit in a byte.
22699 bool OptForSize = DAG.shouldOptForSize();
22700 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
22701 isPowerOf2_64(AndRHSVal)) {
22702 Src = AndLHS;
22703 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
22704 Src.getValueType());
22709 // No patterns found, give up.
22710 if (!Src.getNode())
22711 return SDValue();
22713 // Remove any bit flip.
22714 if (isBitwiseNot(Src)) {
22715 Src = Src.getOperand(0);
22716 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
22719 // Attempt to create the X86ISD::BT node.
22720 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
22721 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
22722 return BT;
22725 return SDValue();
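// Typical patterns this helper recognizes:
//   ((x >> n) & 1) != 0     -> BT x, n with condition COND_B  (CF set)
//   (x & (1 << n)) == 0     -> BT x, n with condition COND_AE (CF clear)
//   (x & 0x100000000) != 0  -> BT x, 32 when the power-of-2 mask cannot be
//                              encoded by a 32-bit TEST immediate
// A bitwise-not of the source is stripped first and the condition inverted.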
22728 // Check if a pre-AVX condition code can be handled by a single FCMP op.
22729 static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
22730 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
22733 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
22734 /// CMPs.
22735 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
22736 SDValue &Op1, bool &IsAlwaysSignaling) {
22737 unsigned SSECC;
22738 bool Swap = false;
22740 // SSE Condition code mapping:
22741 // 0 - EQ
22742 // 1 - LT
22743 // 2 - LE
22744 // 3 - UNORD
22745 // 4 - NEQ
22746 // 5 - NLT
22747 // 6 - NLE
22748 // 7 - ORD
22749 switch (SetCCOpcode) {
22750 default: llvm_unreachable("Unexpected SETCC condition");
22751 case ISD::SETOEQ:
22752 case ISD::SETEQ: SSECC = 0; break;
22753 case ISD::SETOGT:
22754 case ISD::SETGT: Swap = true; [[fallthrough]];
22755 case ISD::SETLT:
22756 case ISD::SETOLT: SSECC = 1; break;
22757 case ISD::SETOGE:
22758 case ISD::SETGE: Swap = true; [[fallthrough]];
22759 case ISD::SETLE:
22760 case ISD::SETOLE: SSECC = 2; break;
22761 case ISD::SETUO: SSECC = 3; break;
22762 case ISD::SETUNE:
22763 case ISD::SETNE: SSECC = 4; break;
22764 case ISD::SETULE: Swap = true; [[fallthrough]];
22765 case ISD::SETUGE: SSECC = 5; break;
22766 case ISD::SETULT: Swap = true; [[fallthrough]];
22767 case ISD::SETUGT: SSECC = 6; break;
22768 case ISD::SETO: SSECC = 7; break;
22769 case ISD::SETUEQ: SSECC = 8; break;
22770 case ISD::SETONE: SSECC = 12; break;
22772 if (Swap)
22773 std::swap(Op0, Op1);
22775 switch (SetCCOpcode) {
22776 default:
22777 IsAlwaysSignaling = true;
22778 break;
22779 case ISD::SETEQ:
22780 case ISD::SETOEQ:
22781 case ISD::SETUEQ:
22782 case ISD::SETNE:
22783 case ISD::SETONE:
22784 case ISD::SETUNE:
22785 case ISD::SETO:
22786 case ISD::SETUO:
22787 IsAlwaysSignaling = false;
22788 break;
22791 return SSECC;
22794 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
22795 /// concatenate the result back.
22796 static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
22797 ISD::CondCode Cond, SelectionDAG &DAG,
22798 const SDLoc &dl) {
22799 assert(VT.isInteger() && VT == LHS.getValueType() &&
22800 VT == RHS.getValueType() && "Unsupported VTs!");
22802 SDValue CC = DAG.getCondCode(Cond);
22804 // Extract the LHS Lo/Hi vectors
22805 SDValue LHS1, LHS2;
22806 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
22808 // Extract the RHS Lo/Hi vectors
22809 SDValue RHS1, RHS2;
22810 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
22812 // Issue the operation on the smaller types and concatenate the result back
22813 EVT LoVT, HiVT;
22814 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22815 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22816 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
22817 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
22820 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
22822 SDValue Op0 = Op.getOperand(0);
22823 SDValue Op1 = Op.getOperand(1);
22824 SDValue CC = Op.getOperand(2);
22825 MVT VT = Op.getSimpleValueType();
22826 SDLoc dl(Op);
22828 assert(VT.getVectorElementType() == MVT::i1 &&
22829 "Cannot set masked compare for this operation");
22831 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
22833 // Prefer SETGT over SETLT.
22834 if (SetCCOpcode == ISD::SETLT) {
22835 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
22836 std::swap(Op0, Op1);
22839 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
22842 /// Given a buildvector constant, return a new vector constant with each element
22843 /// incremented or decremented. If incrementing or decrementing would result in
22844 /// unsigned overflow or underflow, or this is not a simple vector constant,
22845 /// return an empty value.
22846 static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
22847 bool NSW) {
22848 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
22849 if (!BV || !V.getValueType().isSimple())
22850 return SDValue();
22852 MVT VT = V.getSimpleValueType();
22853 MVT EltVT = VT.getVectorElementType();
22854 unsigned NumElts = VT.getVectorNumElements();
22855 SmallVector<SDValue, 8> NewVecC;
22856 SDLoc DL(V);
22857 for (unsigned i = 0; i < NumElts; ++i) {
22858 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
22859 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
22860 return SDValue();
22862 // Avoid overflow/underflow.
22863 const APInt &EltC = Elt->getAPIntValue();
22864 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
22865 return SDValue();
22866 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
22867 (!IsInc && EltC.isMinSignedValue())))
22868 return SDValue();
22870 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
22873 return DAG.getBuildVector(VT, DL, NewVecC);
22876 /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
22877 /// Op0 u<= Op1:
22878 /// t = psubus Op0, Op1
22879 /// pcmpeq t, <0..0>
22880 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
22881 ISD::CondCode Cond, const SDLoc &dl,
22882 const X86Subtarget &Subtarget,
22883 SelectionDAG &DAG) {
22884 if (!Subtarget.hasSSE2())
22885 return SDValue();
22887 MVT VET = VT.getVectorElementType();
22888 if (VET != MVT::i8 && VET != MVT::i16)
22889 return SDValue();
22891 switch (Cond) {
22892 default:
22893 return SDValue();
22894 case ISD::SETULT: {
22895     // If the comparison is against a constant, we can turn this into a
22896     // setule. With psubus, setule does not require a swap. This is
22897     // beneficial because the constant in the register is no longer
22898     // clobbered as the destination, so it can be hoisted out of a loop.
22899     // Only do this pre-AVX, since with AVX the vpcmp* forms are no longer destructive.
22900 if (Subtarget.hasAVX())
22901 return SDValue();
22902 SDValue ULEOp1 =
22903 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
22904 if (!ULEOp1)
22905 return SDValue();
22906 Op1 = ULEOp1;
22907 break;
22909 case ISD::SETUGT: {
22910 // If the comparison is against a constant, we can turn this into a setuge.
22911 // This is beneficial because materializing a constant 0 for the PCMPEQ is
22912 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
22913 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
22914 SDValue UGEOp1 =
22915 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
22916 if (!UGEOp1)
22917 return SDValue();
22918 Op1 = Op0;
22919 Op0 = UGEOp1;
22920 break;
22922 // Psubus is better than flip-sign because it requires no inversion.
22923 case ISD::SETUGE:
22924 std::swap(Op0, Op1);
22925 break;
22926 case ISD::SETULE:
22927 break;
22930 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
22931 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
22932 DAG.getConstant(0, dl, VT));
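// Scalar model of the PSUBUS rewrite above (illustrative only, under #if 0):
// unsigned saturating subtraction is zero exactly when the first operand is
// u<= the second, so one psubus plus one pcmpeq-with-zero implements the
// compare. The SETULT case relies on C != 0, which incDecVectorConstant
// guarantees by bailing out on underflow.
#if 0
#include <cstdint>
static uint8_t usubsat8(uint8_t A, uint8_t B) { return A > B ? uint8_t(A - B) : 0; }
static bool ule8(uint8_t A, uint8_t B) { return usubsat8(A, B) == 0; } // A u<= B
static bool ult8(uint8_t A, uint8_t C) {                               // A u< C, C != 0
  return usubsat8(A, uint8_t(C - 1)) == 0;                             // A u<= C-1
}
#endif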
22935 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
22936 SelectionDAG &DAG) {
22937 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
22938 Op.getOpcode() == ISD::STRICT_FSETCCS;
22939 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
22940 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
22941 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
22942 MVT VT = Op->getSimpleValueType(0);
22943 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
22944 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
22945 SDLoc dl(Op);
22947 if (isFP) {
22948 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
22949 assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
22950 if (isSoftF16(EltVT, Subtarget))
22951 return SDValue();
22953 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
22954 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22956 // If we have a strict compare with a vXi1 result and the input is 128/256
22957 // bits we can't use a masked compare unless we have VLX. If we use a wider
22958 // compare like we do for non-strict, we might trigger spurious exceptions
22959     // from the upper elements. Instead emit an AVX compare and convert it to a mask.
22960 unsigned Opc;
22961 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
22962 (!IsStrict || Subtarget.hasVLX() ||
22963 Op0.getSimpleValueType().is512BitVector())) {
22964 #ifndef NDEBUG
22965 unsigned Num = VT.getVectorNumElements();
22966 assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
22967 #endif
22968 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
22969 } else {
22970 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
22971 // The SSE/AVX packed FP comparison nodes are defined with a
22972 // floating-point vector result that matches the operand type. This allows
22973 // them to work with an SSE1 target (integer vector types are not legal).
22974 VT = Op0.getSimpleValueType();
22977 SDValue Cmp;
22978 bool IsAlwaysSignaling;
22979 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
22980 if (!Subtarget.hasAVX()) {
22981       // TODO: We could use the following steps to handle a quiet compare with
22982 // signaling encodings.
22983 // 1. Get ordered masks from a quiet ISD::SETO
22984 // 2. Use the masks to mask potential unordered elements in operand A, B
22985 // 3. Get the compare results of masked A, B
22986       // 4. Calculate the final result using the mask and the result from 3.
22987 // But currently, we just fall back to scalar operations.
22988 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
22989 return SDValue();
22991 // Insert an extra signaling instruction to raise exception.
22992 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
22993 SDValue SignalCmp = DAG.getNode(
22994 Opc, dl, {VT, MVT::Other},
22995 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
22996         // FIXME: It seems we need to update the flags of all new strict nodes.
22997         // Otherwise, mayRaiseFPException in MI will return false because
22998         // NoFPExcept defaults to false. However, other patches don't appear to
22999         // do this.
23000 SignalCmp->setFlags(Op->getFlags());
23001 Chain = SignalCmp.getValue(1);
23004 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23005 // emit two comparisons and a logic op to tie them together.
23006 if (!cheapX86FSETCC_SSE(Cond)) {
23007 // LLVM predicate is SETUEQ or SETONE.
23008 unsigned CC0, CC1;
23009 unsigned CombineOpc;
23010 if (Cond == ISD::SETUEQ) {
23011 CC0 = 3; // UNORD
23012 CC1 = 0; // EQ
23013 CombineOpc = X86ISD::FOR;
23014 } else {
23015 assert(Cond == ISD::SETONE);
23016 CC0 = 7; // ORD
23017 CC1 = 4; // NEQ
23018 CombineOpc = X86ISD::FAND;
23021 SDValue Cmp0, Cmp1;
23022 if (IsStrict) {
23023 Cmp0 = DAG.getNode(
23024 Opc, dl, {VT, MVT::Other},
23025 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23026 Cmp1 = DAG.getNode(
23027 Opc, dl, {VT, MVT::Other},
23028 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23029 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23030 Cmp1.getValue(1));
23031 } else {
23032 Cmp0 = DAG.getNode(
23033 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23034 Cmp1 = DAG.getNode(
23035 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23037 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23038 } else {
23039 if (IsStrict) {
23040 Cmp = DAG.getNode(
23041 Opc, dl, {VT, MVT::Other},
23042 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23043 Chain = Cmp.getValue(1);
23044 } else
23045 Cmp = DAG.getNode(
23046 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23048 } else {
23049 // Handle all other FP comparisons here.
23050 if (IsStrict) {
23051 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
23052 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23053 Cmp = DAG.getNode(
23054 Opc, dl, {VT, MVT::Other},
23055 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23056 Chain = Cmp.getValue(1);
23057 } else
23058 Cmp = DAG.getNode(
23059 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23062 if (VT.getFixedSizeInBits() >
23063 Op.getSimpleValueType().getFixedSizeInBits()) {
23064 // We emitted a compare with an XMM/YMM result. Finish converting to a
23065 // mask register using a vptestm.
23066 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23067 Cmp = DAG.getBitcast(CastVT, Cmp);
23068 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23069 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23070 } else {
23071 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23072 // the result type of SETCC. The bitcast is expected to be optimized
23073 // away during combining/isel.
23074 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23077 if (IsStrict)
23078 return DAG.getMergeValues({Cmp, Chain}, dl);
23080 return Cmp;
23083 assert(!IsStrict && "Strict SETCC only handles FP operands.");
23085 MVT VTOp0 = Op0.getSimpleValueType();
23086 (void)VTOp0;
23087 assert(VTOp0 == Op1.getSimpleValueType() &&
23088 "Expected operands with same type!");
23089 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23090 "Invalid number of packed elements for source and destination!");
23092 // The non-AVX512 code below works under the assumption that source and
23093 // destination types are the same.
23094 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23095 "Value types for source and destination must be the same!");
23097 // The result is boolean, but operands are int/float
23098 if (VT.getVectorElementType() == MVT::i1) {
23099     // In the AVX-512 architecture, setcc returns a mask with i1 elements,
23100     // but there is no compare instruction for i8 and i16 elements in KNL.
23101 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23102 "Unexpected operand type");
23103 return LowerIntVSETCC_AVX512(Op, DAG);
23106 // Lower using XOP integer comparisons.
23107 if (VT.is128BitVector() && Subtarget.hasXOP()) {
23108 // Translate compare code to XOP PCOM compare mode.
23109 unsigned CmpMode = 0;
23110 switch (Cond) {
23111 default: llvm_unreachable("Unexpected SETCC condition");
23112 case ISD::SETULT:
23113 case ISD::SETLT: CmpMode = 0x00; break;
23114 case ISD::SETULE:
23115 case ISD::SETLE: CmpMode = 0x01; break;
23116 case ISD::SETUGT:
23117 case ISD::SETGT: CmpMode = 0x02; break;
23118 case ISD::SETUGE:
23119 case ISD::SETGE: CmpMode = 0x03; break;
23120 case ISD::SETEQ: CmpMode = 0x04; break;
23121 case ISD::SETNE: CmpMode = 0x05; break;
23124 // Are we comparing unsigned or signed integers?
23125 unsigned Opc =
23126 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23128 return DAG.getNode(Opc, dl, VT, Op0, Op1,
23129 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23132 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23133 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23134 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23135 SDValue BC0 = peekThroughBitcasts(Op0);
23136 if (BC0.getOpcode() == ISD::AND) {
23137 APInt UndefElts;
23138 SmallVector<APInt, 64> EltBits;
23139 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
23140 VT.getScalarSizeInBits(), UndefElts,
23141 EltBits, false, false)) {
23142 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23143 Cond = ISD::SETEQ;
23144 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23150 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23151 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23152 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23153 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23154 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23155 unsigned BitWidth = VT.getScalarSizeInBits();
23156 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23158 SDValue Result = Op0.getOperand(0);
23159 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23160 DAG.getConstant(ShiftAmt, dl, VT));
23161 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23162 DAG.getConstant(BitWidth - 1, dl, VT));
23163 return Result;
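// Scalar model of the rewrite above (illustrative only, under #if 0): when
// C == 1 << BitIdx, (X & C) == C is just "is bit BitIdx set", and shifting
// that bit into the sign position then arithmetic-shifting it back produces
// the all-ones/all-zeros mask a vector compare would return.
#if 0
#include <cstdint>
static int32_t cmpMaskViaShifts(int32_t X, unsigned BitIdx) {
  const unsigned BW = 32;
  int32_t Shl = int32_t(uint32_t(X) << (BW - 1 - BitIdx)); // bit -> sign bit
  return Shl >> (BW - 1);                                  // -1 if set, else 0
}
#endif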
23167 // Break 256-bit integer vector compare into smaller ones.
23168 if (VT.is256BitVector() && !Subtarget.hasInt256())
23169 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23171 // Break 512-bit integer vector compare into smaller ones.
23172 // TODO: Try harder to use VPCMPx + VPMOV2x?
23173 if (VT.is512BitVector())
23174 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23176 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23177 // not-of-PCMPEQ:
23178 // X != INT_MIN --> X >s INT_MIN
23179 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23180 // +X != 0 --> +X >s 0
23181 APInt ConstValue;
23182 if (Cond == ISD::SETNE &&
23183 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23184 if (ConstValue.isMinSignedValue())
23185 Cond = ISD::SETGT;
23186 else if (ConstValue.isMaxSignedValue())
23187 Cond = ISD::SETLT;
23188 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
23189 Cond = ISD::SETGT;
23192 // If both operands are known non-negative, then an unsigned compare is the
23193 // same as a signed compare and there's no need to flip signbits.
23194 // TODO: We could check for more general simplifications here since we're
23195 // computing known bits.
23196 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23197 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23199 // Special case: Use min/max operations for unsigned compares.
23200 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23201 if (ISD::isUnsignedIntSetCC(Cond) &&
23202 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23203 TLI.isOperationLegal(ISD::UMIN, VT)) {
23204 // If we have a constant operand, increment/decrement it and change the
23205 // condition to avoid an invert.
23206 if (Cond == ISD::SETUGT) {
23207 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23208 if (SDValue UGTOp1 =
23209 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
23210 Op1 = UGTOp1;
23211 Cond = ISD::SETUGE;
23214 if (Cond == ISD::SETULT) {
23215 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23216 if (SDValue ULTOp1 =
23217 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
23218 Op1 = ULTOp1;
23219 Cond = ISD::SETULE;
23222 bool Invert = false;
23223 unsigned Opc;
23224 switch (Cond) {
23225 default: llvm_unreachable("Unexpected condition code");
23226 case ISD::SETUGT: Invert = true; [[fallthrough]];
23227 case ISD::SETULE: Opc = ISD::UMIN; break;
23228 case ISD::SETULT: Invert = true; [[fallthrough]];
23229 case ISD::SETUGE: Opc = ISD::UMAX; break;
23232 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23233 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23235 // If the logical-not of the result is required, perform that now.
23236 if (Invert)
23237 Result = DAG.getNOT(dl, Result, VT);
23239 return Result;
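// Scalar model of the min/max based unsigned compares above (illustrative
// only, under #if 0):
//   X u<= Y  <=>  X == min(X, Y)        X u>= Y  <=>  X == max(X, Y)
// and the strict forms are handled either by inverting the result or by
// nudging a constant operand (X u> C  <=>  X u>= C+1 when C+1 can't wrap,
// which incDecVectorConstant checks).
#if 0
#include <algorithm>
#include <cstdint>
static bool uleViaMin(uint32_t X, uint32_t Y) { return X == std::min(X, Y); }
static bool ugeViaMax(uint32_t X, uint32_t Y) { return X == std::max(X, Y); }
static bool ugtViaMin(uint32_t X, uint32_t Y) { return !uleViaMin(X, Y); }
#endif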
23242 // Try to use SUBUS and PCMPEQ.
23243 if (FlipSigns)
23244 if (SDValue V =
23245 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23246 return V;
23248 // We are handling one of the integer comparisons here. Since SSE only has
23249 // GT and EQ comparisons for integer, swapping operands and multiple
23250 // operations may be required for some comparisons.
23251 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23252 : X86ISD::PCMPGT;
23253 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23254 Cond == ISD::SETGE || Cond == ISD::SETUGE;
23255 bool Invert = Cond == ISD::SETNE ||
23256 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23258 if (Swap)
23259 std::swap(Op0, Op1);
23261 // Check that the operation in question is available (most are plain SSE2,
23262 // but PCMPGTQ and PCMPEQQ have different requirements).
23263 if (VT == MVT::v2i64) {
23264 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23265 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23267 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23268 // the odd elements over the even elements.
23269 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23270 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23271 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23273 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23274 static const int MaskHi[] = { 1, 1, 3, 3 };
23275 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23277 return DAG.getBitcast(VT, Result);
23280 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23281 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23282 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23284 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23285 static const int MaskHi[] = { 1, 1, 3, 3 };
23286 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23288 return DAG.getBitcast(VT, Result);
23291 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23292 // bits of the inputs before performing those operations. The lower
23293 // compare is always unsigned.
23294 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
23295 : 0x0000000080000000ULL,
23296 dl, MVT::v2i64);
23298 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23299 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23301 // Cast everything to the right type.
23302 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23303 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23305 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23306 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23307 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23309 // Create masks for only the low parts/high parts of the 64 bit integers.
23310 static const int MaskHi[] = { 1, 1, 3, 3 };
23311 static const int MaskLo[] = { 0, 0, 2, 2 };
23312 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23313 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23314 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23316 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23317 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23319 if (Invert)
23320 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23322 return DAG.getBitcast(VT, Result);
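// Scalar model of the PCMPGTQ emulation above (illustrative only, under
// #if 0): a 64-bit signed greater-than built from 32-bit pieces compares the
// high halves signed and the low halves unsigned, which is why the lowering
// always flips bit 31 of the low dwords (and bit 63 too for an unsigned
// 64-bit compare).
#if 0
#include <cstdint>
static bool sgt64From32(int64_t A, int64_t B) {
  int32_t HiA = int32_t(A >> 32), HiB = int32_t(B >> 32);
  uint32_t LoA = uint32_t(A), LoB = uint32_t(B);
  return (HiA > HiB) | ((HiA == HiB) & (LoA > LoB)); // matches the comment above
}
#endif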
23325 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
23326 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
23327 // pcmpeqd + pshufd + pand.
23328 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23330 // First cast everything to the right type.
23331 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23332 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23334 // Do the compare.
23335 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23337 // Make sure the lower and upper halves are both all-ones.
23338 static const int Mask[] = { 1, 0, 3, 2 };
23339 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23340 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23342 if (Invert)
23343 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23345 return DAG.getBitcast(VT, Result);
23349 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23350 // bits of the inputs before performing those operations.
23351 if (FlipSigns) {
23352 MVT EltVT = VT.getVectorElementType();
23353 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23354 VT);
23355 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23356 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23359 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23361 // If the logical-not of the result is required, perform that now.
23362 if (Invert)
23363 Result = DAG.getNOT(dl, Result, VT);
23365 return Result;
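// Scalar model of the FlipSigns step above (illustrative only, under #if 0):
// XORing both operands with the sign mask shifts the whole unsigned range by
// 2^(BW-1), so the signed PCMPGT that SSE provides computes the unsigned
// comparison.
#if 0
#include <cstdint>
static bool ugtViaSignedCmp(uint32_t X, uint32_t Y) {
  const uint32_t SignMask = 0x80000000u;
  return int32_t(X ^ SignMask) > int32_t(Y ^ SignMask);
}
#endif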
23368 // Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
23369 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23370 const SDLoc &dl, SelectionDAG &DAG,
23371 const X86Subtarget &Subtarget,
23372 SDValue &X86CC) {
23373 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23375 // Must be a bitcast from vXi1.
23376 if (Op0.getOpcode() != ISD::BITCAST)
23377 return SDValue();
23379 Op0 = Op0.getOperand(0);
23380 MVT VT = Op0.getSimpleValueType();
23381 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23382 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23383 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23384 return SDValue();
23386 X86::CondCode X86Cond;
23387 if (isNullConstant(Op1)) {
23388 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23389 } else if (isAllOnesConstant(Op1)) {
23390 // C flag is set for all ones.
23391 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23392 } else
23393 return SDValue();
23395   // If the input is an AND, we can combine its operands into the KTEST.
23396 bool KTestable = false;
23397 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23398 KTestable = true;
23399 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23400 KTestable = true;
23401 if (!isNullConstant(Op1))
23402 KTestable = false;
23403 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23404 SDValue LHS = Op0.getOperand(0);
23405 SDValue RHS = Op0.getOperand(1);
23406 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23407 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23410   // If the input is an OR, we can combine its operands into the KORTEST.
23411 SDValue LHS = Op0;
23412 SDValue RHS = Op0;
23413 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23414 LHS = Op0.getOperand(0);
23415 RHS = Op0.getOperand(1);
23418 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23419 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23422 /// Emit flags for the given setcc condition and operands. Also returns the
23423 /// corresponding X86 condition code constant in X86CC.
23424 SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23425 ISD::CondCode CC, const SDLoc &dl,
23426 SelectionDAG &DAG,
23427 SDValue &X86CC) const {
23428 // Equality Combines.
23429 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23430 X86::CondCode X86CondCode;
23432 // Optimize to BT if possible.
23433 // Lower (X & (1 << N)) == 0 to BT(X, N).
23434 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23435 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
23436 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
23437 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
23438 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23439 return BT;
23443 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
23444 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
23445 X86CondCode)) {
23446 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23447 return CmpZ;
23450 // Try to lower using KORTEST or KTEST.
23451 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
23452 return Test;
23454 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
23455 // of these.
23456 if (isOneConstant(Op1) || isNullConstant(Op1)) {
23457 // If the input is a setcc, then reuse the input setcc or use a new one
23458 // with the inverted condition.
23459 if (Op0.getOpcode() == X86ISD::SETCC) {
23460 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
23462 X86CC = Op0.getOperand(0);
23463 if (Invert) {
23464 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
23465 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
23466 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23469 return Op0.getOperand(1);
23473     // Try to use the carry flag from the add in place of a separate CMP for:
23474 // (seteq (add X, -1), -1). Similar for setne.
23475 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
23476 Op0.getOperand(1) == Op1) {
23477 if (isProfitableToUseFlagOp(Op0)) {
23478 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
23480 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
23481 Op0.getOperand(1));
23482 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
23483 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23484 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23485 return SDValue(New.getNode(), 1);
23490 X86::CondCode CondCode =
23491 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
23492 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
23494 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
23495 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23496 return EFLAGS;
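// Scalar model of the "carry flag from add" trick above (illustrative only,
// under #if 0): adding all-ones produces an unsigned carry exactly when
// X != 0, so (X + -1) == -1 (i.e. X == 0) is just "carry clear" (COND_AE)
// and the setne form is "carry set" (COND_B).
#if 0
#include <cstdint>
static bool addAllOnesCarries(uint32_t X) {
  uint32_t Sum = X + 0xFFFFFFFFu; // X - 1 with wraparound
  return Sum < X;                 // the carry; true <=> X != 0
}
#endif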
23499 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
23501 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23502 Op.getOpcode() == ISD::STRICT_FSETCCS;
23503 MVT VT = Op->getSimpleValueType(0);
23505 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
23507 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23508 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23509 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23510 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23511 SDLoc dl(Op);
23512 ISD::CondCode CC =
23513 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23515 if (isSoftF16(Op0.getValueType(), Subtarget))
23516 return SDValue();
23518 // Handle f128 first, since one possible outcome is a normal integer
23519 // comparison which gets handled by emitFlagsForSetcc.
23520 if (Op0.getValueType() == MVT::f128) {
23521 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
23522 Op.getOpcode() == ISD::STRICT_FSETCCS);
23524 // If softenSetCCOperands returned a scalar, use it.
23525 if (!Op1.getNode()) {
23526 assert(Op0.getValueType() == Op.getValueType() &&
23527 "Unexpected setcc expansion!");
23528 if (IsStrict)
23529 return DAG.getMergeValues({Op0, Chain}, dl);
23530 return Op0;
23534 if (Op0.getSimpleValueType().isInteger()) {
23535     // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
23536     // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
23537     // this may translate to fewer uops depending on the uarch implementation. The
23538     // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
23539     // canonicalize to that CondCode.
23540     // NOTE: Only do this if incrementing the constant doesn't increase the bit
23541     // encoding size - so it must either already be an i8 or i32 immediate, or it
23542     // shrinks down to that. We don't do this for any i64's to avoid additional
23543 // constant materializations.
23544 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
23545 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
23546 const APInt &Op1Val = Op1C->getAPIntValue();
23547 if (!Op1Val.isZero()) {
23548 // Ensure the constant+1 doesn't overflow.
23549 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
23550 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
23551 APInt Op1ValPlusOne = Op1Val + 1;
23552 if (Op1ValPlusOne.isSignedIntN(32) &&
23553 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
23554 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
23555 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
23556 : ISD::CondCode::SETUGE;
23562 SDValue X86CC;
23563 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
23564 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23565 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
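// Scalar model of the canonicalization above (illustrative only, under
// #if 0): as long as C+1 does not overflow, "X > C" and "X >= C+1" are the
// same predicate, and the GE form reads fewer EFLAGS bits; the lowering also
// requires C+1 to keep the same immediate encoding size.
#if 0
#include <cstdint>
static bool sgt(int32_t X, int32_t C) { return X > C; }
static bool sgeCanonical(int32_t X, int32_t C) { // requires C != INT32_MAX
  return X >= C + 1;                             // same truth table as X > C
}
#endif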
23568 // Handle floating point.
23569 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
23570 if (CondCode == X86::COND_INVALID)
23571 return SDValue();
23573 SDValue EFLAGS;
23574 if (IsStrict) {
23575 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23576 EFLAGS =
23577 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
23578 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
23579 Chain = EFLAGS.getValue(1);
23580 } else {
23581 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
23584 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23585 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23586 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23589 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23590 SDValue LHS = Op.getOperand(0);
23591 SDValue RHS = Op.getOperand(1);
23592 SDValue Carry = Op.getOperand(2);
23593 SDValue Cond = Op.getOperand(3);
23594 SDLoc DL(Op);
23596 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
23597 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23599 // Recreate the carry if needed.
23600 EVT CarryVT = Carry.getValueType();
23601 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23602 Carry, DAG.getAllOnesConstant(DL, CarryVT));
23604 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23605 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23606 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
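// Scalar model of what SETCCCARRY computes (illustrative only, under #if 0):
// the borrow of the low half of a wide subtraction feeds an SBB on the high
// half, so e.g. a 64-bit unsigned "less than" on 32-bit pieces is SUB(lo),
// SBB(hi), then a test of the final borrow.
#if 0
#include <cstdint>
static bool ult64Via32(uint32_t LHSLo, uint32_t LHSHi,
                       uint32_t RHSLo, uint32_t RHSHi) {
  bool Borrow = LHSLo < RHSLo;                        // borrow of the low SUB
  uint64_t HiDiff = uint64_t(LHSHi) - RHSHi - Borrow; // the SBB on the high half
  return (HiDiff >> 63) != 0;                         // final borrow <=> LHS u< RHS
}
#endif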
23609 // This function returns three things: the arithmetic computation itself
23610 // (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
23611 // flag and the condition code define the case in which the arithmetic
23612 // computation overflows.
23613 static std::pair<SDValue, SDValue>
23614 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23615 assert(Op.getResNo() == 0 && "Unexpected result number!");
23616 SDValue Value, Overflow;
23617 SDValue LHS = Op.getOperand(0);
23618 SDValue RHS = Op.getOperand(1);
23619 unsigned BaseOp = 0;
23620 SDLoc DL(Op);
23621 switch (Op.getOpcode()) {
23622 default: llvm_unreachable("Unknown ovf instruction!");
23623 case ISD::SADDO:
23624 BaseOp = X86ISD::ADD;
23625 Cond = X86::COND_O;
23626 break;
23627 case ISD::UADDO:
23628 BaseOp = X86ISD::ADD;
23629 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
23630 break;
23631 case ISD::SSUBO:
23632 BaseOp = X86ISD::SUB;
23633 Cond = X86::COND_O;
23634 break;
23635 case ISD::USUBO:
23636 BaseOp = X86ISD::SUB;
23637 Cond = X86::COND_B;
23638 break;
23639 case ISD::SMULO:
23640 BaseOp = X86ISD::SMUL;
23641 Cond = X86::COND_O;
23642 break;
23643 case ISD::UMULO:
23644 BaseOp = X86ISD::UMUL;
23645 Cond = X86::COND_O;
23646 break;
23649 if (BaseOp) {
23650 // Also sets EFLAGS.
23651 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23652 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23653 Overflow = Value.getValue(1);
23656 return std::make_pair(Value, Overflow);
23659 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23660   // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
23661 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23662 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23663 // has only one use.
23664 SDLoc DL(Op);
23665 X86::CondCode Cond;
23666 SDValue Value, Overflow;
23667 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
23669 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
23670 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
23671 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
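// Scalar model of the overflow conditions chosen in getX86XALUOOp
// (illustrative only, under #if 0): unsigned add overflow is the carry, i.e.
// the wrapped result is smaller than an operand; for "X + 1" the carry is
// equivalent to the result being zero, which is why UADDO with a constant 1
// can use COND_E instead of COND_B.
#if 0
#include <cstdint>
static bool uaddOverflow(uint32_t A, uint32_t B) { return uint32_t(A + B) < A; }
static bool uaddOneOverflow(uint32_t A) { return uint32_t(A + 1) == 0; }
#endif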
23674 /// Return true if opcode is a X86 logical comparison.
23675 static bool isX86LogicalCmp(SDValue Op) {
23676 unsigned Opc = Op.getOpcode();
23677 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
23678 Opc == X86ISD::FCMP)
23679 return true;
23680 if (Op.getResNo() == 1 &&
23681 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
23682 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
23683 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
23684 return true;
23686 return false;
23689 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
23690 if (V.getOpcode() != ISD::TRUNCATE)
23691 return false;
23693 SDValue VOp0 = V.getOperand(0);
23694 unsigned InBits = VOp0.getValueSizeInBits();
23695 unsigned Bits = V.getValueSizeInBits();
23696 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
23699 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
23700 bool AddTest = true;
23701 SDValue Cond = Op.getOperand(0);
23702 SDValue Op1 = Op.getOperand(1);
23703 SDValue Op2 = Op.getOperand(2);
23704 SDLoc DL(Op);
23705 MVT VT = Op1.getSimpleValueType();
23706 SDValue CC;
23708 if (isSoftF16(VT, Subtarget)) {
23709 MVT NVT = VT.changeTypeToInteger();
23710 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
23711 DAG.getBitcast(NVT, Op1),
23712 DAG.getBitcast(NVT, Op2)));
23715 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
23716 // are available or VBLENDV if AVX is available.
23717 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
23718 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
23719 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
23720 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
23721 bool IsAlwaysSignaling;
23722 unsigned SSECC =
23723 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
23724 CondOp0, CondOp1, IsAlwaysSignaling);
23726 if (Subtarget.hasAVX512()) {
23727 SDValue Cmp =
23728 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
23729 DAG.getTargetConstant(SSECC, DL, MVT::i8));
23730 assert(!VT.isVector() && "Not a scalar type?");
23731 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23734 if (SSECC < 8 || Subtarget.hasAVX()) {
23735 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
23736 DAG.getTargetConstant(SSECC, DL, MVT::i8));
23738 // If we have AVX, we can use a variable vector select (VBLENDV) instead
23739 // of 3 logic instructions for size savings and potentially speed.
23740 // Unfortunately, there is no scalar form of VBLENDV.
23742 // If either operand is a +0.0 constant, don't try this. We can expect to
23743 // optimize away at least one of the logic instructions later in that
23744 // case, so that sequence would be faster than a variable blend.
23746 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
23747 // uses XMM0 as the selection register. That may need just as many
23748 // instructions as the AND/ANDN/OR sequence due to register moves, so
23749 // don't bother.
23750 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
23751 !isNullFPConstant(Op2)) {
23752 // Convert to vectors, do a VSELECT, and convert back to scalar.
23753 // All of the conversions should be optimized away.
23754 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
23755 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
23756 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
23757 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
23759 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
23760 VCmp = DAG.getBitcast(VCmpVT, VCmp);
23762 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
23764 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
23765 VSel, DAG.getIntPtrConstant(0, DL));
23767 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
23768 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
23769 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
23773 // AVX512 fallback is to lower selects of scalar floats to masked moves.
23774 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
23775 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
23776 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
23779 if (Cond.getOpcode() == ISD::SETCC &&
23780 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
23781 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
23782 Cond = NewCond;
23783 // If the condition was updated, it's possible that the operands of the
23784 // select were also updated (for example, EmitTest has a RAUW). Refresh
23785 // the local references to the select operands in case they got stale.
23786 Op1 = Op.getOperand(1);
23787 Op2 = Op.getOperand(2);
23791 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
23792 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
23793 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
23794 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
23795 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
23796 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
23797 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
23798 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
23799 if (Cond.getOpcode() == X86ISD::SETCC &&
23800 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
23801 isNullConstant(Cond.getOperand(1).getOperand(1))) {
23802 SDValue Cmp = Cond.getOperand(1);
23803 SDValue CmpOp0 = Cmp.getOperand(0);
23804 unsigned CondCode = Cond.getConstantOperandVal(0);
23806     // Special handling for the __builtin_ffs(X) - 1 pattern, which looks like
23807     // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
23808     // handling to keep the CMP with 0. This should be removed by
23809 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
23810 // cttz_zero_undef.
23811 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
23812 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
23813 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
23815 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
23816 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
23817 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
23818 // Keep Cmp.
23819 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
23820 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
23821 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
23822 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
23824 // 'X - 1' sets the carry flag if X == 0.
23825 // '0 - X' sets the carry flag if X != 0.
23826 // Convert the carry flag to a -1/0 mask with sbb:
23827 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
23828 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
23829 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
23830 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
23831 SDValue Sub;
23832 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
23833 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
23834 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
23835 } else {
23836 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
23837 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
23839 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
23840 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
23841 Sub.getValue(1));
23842 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
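// Scalar model of the SBB mask trick above (illustrative only, under #if 0):
// "0 - X" borrows exactly when X != 0, and SETCC_CARRY materializes that
// borrow as an all-ones or all-zero value, so the select becomes mask | Y
// with no cmov.
#if 0
#include <cstdint>
static uint32_t selectNegOneOrY(uint32_t X, uint32_t Y) {
  bool Borrow = X != 0;                      // carry of "0 - X"
  uint32_t Mask = Borrow ? 0xFFFFFFFFu : 0u; // what SETCC_CARRY produces
  return Mask | Y;                           // -1 if X != 0, else Y
}
#endif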
23843 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
23844 CmpOp0.getOpcode() == ISD::AND &&
23845 isOneConstant(CmpOp0.getOperand(1))) {
23846 SDValue Src1, Src2;
23847       // Returns true if Op2 is an XOR or OR operator and one of its operands
23848       // is equal to Op1, i.e. the pattern is
23849       // (a, a op b) or (b, a op b).
23850 auto isOrXorPattern = [&]() {
23851 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
23852 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
23853 Src1 =
23854 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
23855 Src2 = Op1;
23856 return true;
23858 return false;
23861 if (isOrXorPattern()) {
23862 SDValue Neg;
23863 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
23864         // We need a mask of all zeros or all ones with the same size as the
23865         // other operands.
23866 if (CmpSz > VT.getSizeInBits())
23867 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
23868 else if (CmpSz < VT.getSizeInBits())
23869 Neg = DAG.getNode(ISD::AND, DL, VT,
23870 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
23871 DAG.getConstant(1, DL, VT));
23872 else
23873 Neg = CmpOp0;
23874 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
23875 Neg); // -(and (x, 0x1))
23876 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
23877 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
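// Scalar model of the isOrXorPattern rewrite above (illustrative only, under
// #if 0): -(x & 1) is an all-ones mask when the low bit is set and zero
// otherwise, so
//   select ((x & 1) == 0), y, (z ^ y)  ==  ((-(x & 1)) & z) ^ y
// and the same identity holds with '|' in place of '^'.
#if 0
#include <cstdint>
static uint32_t selectXorViaMask(uint32_t X, uint32_t Y, uint32_t Z) {
  uint32_t Mask = 0u - (X & 1u); // 0 or 0xFFFFFFFF
  return (Mask & Z) ^ Y;         // Y when the bit is clear, Z ^ Y when set
}
#endif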
23879 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
23880 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
23881 ((CondCode == X86::COND_S) || // smin(x, 0)
23882 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
23883 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
23885 // If the comparison is testing for a positive value, we have to invert
23886 // the sign bit mask, so only do that transform if the target has a
23887 // bitwise 'and not' instruction (the invert is free).
23888 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
23889 unsigned ShCt = VT.getSizeInBits() - 1;
23890 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
23891 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
23892 if (CondCode == X86::COND_G)
23893 Shift = DAG.getNOT(DL, Shift, VT);
23894 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
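// Scalar model of the sign-mask clamps above (illustrative only, under
// #if 0): an arithmetic shift by BW-1 broadcasts the sign bit into a 0/-1
// mask, so clamping against zero needs no cmov; the smax form needs the
// inverted mask, hence the ANDN requirement.
#if 0
#include <cstdint>
static int32_t sminWithZero(int32_t X) { return (X >> 31) & X; }  // x < 0 ? x : 0
static int32_t smaxWithZero(int32_t X) { return ~(X >> 31) & X; } // x > 0 ? x : 0
#endif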
23898 // Look past (and (setcc_carry (cmp ...)), 1).
23899 if (Cond.getOpcode() == ISD::AND &&
23900 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
23901 isOneConstant(Cond.getOperand(1)))
23902 Cond = Cond.getOperand(0);
23904 // If condition flag is set by a X86ISD::CMP, then use it as the condition
23905 // setting operand in place of the X86ISD::SETCC.
23906 unsigned CondOpcode = Cond.getOpcode();
23907 if (CondOpcode == X86ISD::SETCC ||
23908 CondOpcode == X86ISD::SETCC_CARRY) {
23909 CC = Cond.getOperand(0);
23911 SDValue Cmp = Cond.getOperand(1);
23912 bool IllegalFPCMov = false;
23913 if (VT.isFloatingPoint() && !VT.isVector() &&
23914 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
23915 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
23917 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
23918 Cmp.getOpcode() == X86ISD::BT) { // FIXME
23919 Cond = Cmp;
23920 AddTest = false;
23922 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
23923 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
23924 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
23925 SDValue Value;
23926 X86::CondCode X86Cond;
23927 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
23929 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
23930 AddTest = false;
23933 if (AddTest) {
23934 // Look past the truncate if the high bits are known zero.
23935 if (isTruncWithZeroHighBitsInput(Cond, DAG))
23936 Cond = Cond.getOperand(0);
23938 // We know the result of AND is compared against zero. Try to match
23939 // it to BT.
23940 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
23941 X86::CondCode X86CondCode;
23942 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
23943 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
23944 Cond = BT;
23945 AddTest = false;
23950 if (AddTest) {
23951 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
23952 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
23955 // a < b ? -1 : 0 -> RES = ~setcc_carry
23956 // a < b ? 0 : -1 -> RES = setcc_carry
23957 // a >= b ? -1 : 0 -> RES = setcc_carry
23958 // a >= b ? 0 : -1 -> RES = ~setcc_carry
23959 if (Cond.getOpcode() == X86ISD::SUB) {
23960 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
23962 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
23963 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
23964 (isNullConstant(Op1) || isNullConstant(Op2))) {
23965 SDValue Res =
23966 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
23967 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
23968 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
23969 return DAG.getNOT(DL, Res, Res.getValueType());
23970 return Res;
23974   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
23975   // widen the cmov and push the truncate through. This avoids introducing a new
23976 // branch during isel and doesn't add any extensions.
23977 if (Op.getValueType() == MVT::i8 &&
23978 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
23979 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
23980 if (T1.getValueType() == T2.getValueType() &&
23981 // Exclude CopyFromReg to avoid partial register stalls.
23982 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
23983 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
23984 CC, Cond);
23985 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
23989 // Or finally, promote i8 cmovs if we have CMOV,
23990 // or i16 cmovs if it won't prevent folding a load.
23991   // FIXME: we should not limit promotion of the i8 case to only when CMOV is
23992   // legal, but EmitLoweredSelect() cannot deal with these extensions
23993   // being inserted between two CMOVs (in the i16 case too, TBN).
23994 // https://bugs.llvm.org/show_bug.cgi?id=40974
23995 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
23996 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
23997 !X86::mayFoldLoad(Op2, Subtarget))) {
23998 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
23999 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24000 SDValue Ops[] = { Op2, Op1, CC, Cond };
24001 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24002 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24005 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24006 // condition is true.
24007 SDValue Ops[] = { Op2, Op1, CC, Cond };
24008 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
24011 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
24012 const X86Subtarget &Subtarget,
24013 SelectionDAG &DAG) {
24014 MVT VT = Op->getSimpleValueType(0);
24015 SDValue In = Op->getOperand(0);
24016 MVT InVT = In.getSimpleValueType();
24017 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24018 MVT VTElt = VT.getVectorElementType();
24019 SDLoc dl(Op);
24021 unsigned NumElts = VT.getVectorNumElements();
24023 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24024 MVT ExtVT = VT;
24025 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24026 // If v16i32 is to be avoided, we'll need to split and concatenate.
24027 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24028 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24030 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24033 // Widen to 512-bits if VLX is not supported.
24034 MVT WideVT = ExtVT;
24035 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24036 NumElts *= 512 / ExtVT.getSizeInBits();
24037 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24038 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24039 In, DAG.getIntPtrConstant(0, dl));
24040 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24043 SDValue V;
24044 MVT WideEltVT = WideVT.getVectorElementType();
24045 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24046 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24047 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24048 } else {
24049 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24050 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24051 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24054 // Truncate if we had to extend i16/i8 above.
24055 if (VT != ExtVT) {
24056 WideVT = MVT::getVectorVT(VTElt, NumElts);
24057 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24060 // Extract back to 128/256-bit if we widened.
24061 if (WideVT != VT)
24062 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24063 DAG.getIntPtrConstant(0, dl));
24065 return V;
24068 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24069 SelectionDAG &DAG) {
24070 SDValue In = Op->getOperand(0);
24071 MVT InVT = In.getSimpleValueType();
24073 if (InVT.getVectorElementType() == MVT::i1)
24074 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24076 assert(Subtarget.hasAVX() && "Expected AVX support");
24077 return LowerAVXExtend(Op, DAG, Subtarget);
24080 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24081 // For sign extend this needs to handle all vector sizes and SSE4.1 and
24082 // non-SSE4.1 targets. For zero extend this should only handle inputs of
24083 // MVT::v64i8 when BWI is not supported, but AVX512 is.
24084 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24085 const X86Subtarget &Subtarget,
24086 SelectionDAG &DAG) {
24087 SDValue In = Op->getOperand(0);
24088 MVT VT = Op->getSimpleValueType(0);
24089 MVT InVT = In.getSimpleValueType();
24091 MVT SVT = VT.getVectorElementType();
24092 MVT InSVT = InVT.getVectorElementType();
24093 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
24095 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24096 return SDValue();
24097 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24098 return SDValue();
24099 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24100 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24101 !(VT.is512BitVector() && Subtarget.hasAVX512()))
24102 return SDValue();
24104 SDLoc dl(Op);
24105 unsigned Opc = Op.getOpcode();
24106 unsigned NumElts = VT.getVectorNumElements();
24108 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24109 // For 512-bit vectors, we need 128-bits or 256-bits.
24110 if (InVT.getSizeInBits() > 128) {
24111 // Input needs to be at least the same number of elements as output, and
24112 // at least 128-bits.
24113 int InSize = InSVT.getSizeInBits() * NumElts;
24114 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24115 InVT = In.getSimpleValueType();
24118   // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24119   // so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
24120 // need to be handled here for 256/512-bit results.
24121 if (Subtarget.hasInt256()) {
24122 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24124 if (InVT.getVectorNumElements() != NumElts)
24125 return DAG.getNode(Op.getOpcode(), dl, VT, In);
24127 // FIXME: Apparently we create inreg operations that could be regular
24128 // extends.
24129 unsigned ExtOpc =
24130 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24131 : ISD::ZERO_EXTEND;
24132 return DAG.getNode(ExtOpc, dl, VT, In);
24135 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24136 if (Subtarget.hasAVX()) {
24137 assert(VT.is256BitVector() && "256-bit vector expected");
24138 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24139 int HalfNumElts = HalfVT.getVectorNumElements();
24141 unsigned NumSrcElts = InVT.getVectorNumElements();
24142 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24143 for (int i = 0; i != HalfNumElts; ++i)
24144 HiMask[i] = HalfNumElts + i;
24146 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24147 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24148 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24149 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24152 // We should only get here for sign extend.
24153 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24154 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24155 unsigned InNumElts = InVT.getVectorNumElements();
24157 // If the source elements are already all-signbits, we don't need to extend,
24158 // just splat the elements.
24159 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
24160 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
24161 unsigned Scale = InNumElts / NumElts;
24162 SmallVector<int, 16> ShuffleMask;
24163 for (unsigned I = 0; I != NumElts; ++I)
24164 ShuffleMask.append(Scale, I);
24165 return DAG.getBitcast(VT,
24166 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
24169 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24170 SDValue Curr = In;
24171 SDValue SignExt = Curr;
24173 // As SRAI is only available on i16/i32 types, we expand only up to i32
24174 // and handle i64 separately.
24175 if (InVT != MVT::v4i32) {
24176 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24178 unsigned DestWidth = DestVT.getScalarSizeInBits();
24179 unsigned Scale = DestWidth / InSVT.getSizeInBits();
24180 unsigned DestElts = DestVT.getVectorNumElements();
24182 // Build a shuffle mask that takes each input element and places it in the
24183 // MSBs of the new element size.
24184 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24185 for (unsigned i = 0; i != DestElts; ++i)
24186 Mask[i * Scale + (Scale - 1)] = i;
24188 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24189 Curr = DAG.getBitcast(DestVT, Curr);
24191 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24192 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24193 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24196 if (VT == MVT::v2i64) {
24197 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24198 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24199 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24200 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24201 SignExt = DAG.getBitcast(VT, SignExt);
24204 return SignExt;
24207 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24208 SelectionDAG &DAG) {
24209 MVT VT = Op->getSimpleValueType(0);
24210 SDValue In = Op->getOperand(0);
24211 MVT InVT = In.getSimpleValueType();
24212 SDLoc dl(Op);
24214 if (InVT.getVectorElementType() == MVT::i1)
24215 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24217 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24218 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
24219 "Expected same number of elements");
24220 assert((VT.getVectorElementType() == MVT::i16 ||
24221 VT.getVectorElementType() == MVT::i32 ||
24222 VT.getVectorElementType() == MVT::i64) &&
24223 "Unexpected element type");
24224 assert((InVT.getVectorElementType() == MVT::i8 ||
24225 InVT.getVectorElementType() == MVT::i16 ||
24226 InVT.getVectorElementType() == MVT::i32) &&
24227 "Unexpected element type");
24229 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24230 assert(InVT == MVT::v32i8 && "Unexpected VT!");
24231 return splitVectorIntUnary(Op, DAG);
24234 if (Subtarget.hasInt256())
24235 return Op;
24237 // Optimize vectors in AVX mode:
24238 // Sign extend v8i16 to v8i32 and
24239 //             v4i32 to v4i64.
24241 // Divide the input vector into two parts; for v4i32 the high shuffle mask
24242 // will be {2, 3, -1, -1}. Use the vpmovsx instruction to extend
24243 // v4i32 -> v2i64 and v8i16 -> v4i32, then concat the halves back to the
24244 // original VT.
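// Sketch for VT = v8i32 and In = v8i16 (the v4i32 -> v4i64 case is analogous):
//   OpLo = (v4i32 sign_extend_vector_inreg (v8i16 In))
//   OpHi = (v4i32 sign_extend_vector_inreg (shuffle<4,5,6,7,u,u,u,u> In, In))
//   Result = (v8i32 concat_vectors OpLo, OpHi)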
24245 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24246 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24248 unsigned NumElems = InVT.getVectorNumElements();
24249 SmallVector<int,8> ShufMask(NumElems, -1);
24250 for (unsigned i = 0; i != NumElems/2; ++i)
24251 ShufMask[i] = i + NumElems/2;
24253 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24254 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24256 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24259 /// Change a vector store into a pair of half-size vector stores.
24260 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24261 SDValue StoredVal = Store->getValue();
24262 assert((StoredVal.getValueType().is256BitVector() ||
24263 StoredVal.getValueType().is512BitVector()) &&
24264 "Expecting 256/512-bit op");
24266 // Splitting volatile memory ops is not allowed unless the operation was not
24267 // legal to begin with. Assume the input store is legal (this transform is
24268 // only used for targets with AVX). Note: It is possible that we have an
24269 // illegal type like v2i128, and so we could allow splitting a volatile store
24270 // in that case if that is important.
24271 if (!Store->isSimple())
24272 return SDValue();
24274 SDLoc DL(Store);
24275 SDValue Value0, Value1;
24276 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24277 unsigned HalfOffset = Value0.getValueType().getStoreSize();
24278 SDValue Ptr0 = Store->getBasePtr();
24279 SDValue Ptr1 =
24280 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
24281 SDValue Ch0 =
24282 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24283 Store->getOriginalAlign(),
24284 Store->getMemOperand()->getFlags());
24285 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24286 Store->getPointerInfo().getWithOffset(HalfOffset),
24287 Store->getOriginalAlign(),
24288 Store->getMemOperand()->getFlags());
24289 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24292 /// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
24293 /// type.
24294 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24295 SelectionDAG &DAG) {
24296 SDValue StoredVal = Store->getValue();
24297 assert(StoreVT.is128BitVector() &&
24298 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24299 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24301 // Splitting volatile memory ops is not allowed unless the operation was not
24302 // legal to begin with. We are assuming the input op is legal (this transform
24303 // is only used for targets with AVX).
24304 if (!Store->isSimple())
24305 return SDValue();
24307 MVT StoreSVT = StoreVT.getScalarType();
24308 unsigned NumElems = StoreVT.getVectorNumElements();
24309 unsigned ScalarSize = StoreSVT.getStoreSize();
24311 SDLoc DL(Store);
24312 SmallVector<SDValue, 4> Stores;
24313 for (unsigned i = 0; i != NumElems; ++i) {
24314 unsigned Offset = i * ScalarSize;
24315 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24316 TypeSize::Fixed(Offset), DL);
24317 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24318 DAG.getIntPtrConstant(i, DL));
24319 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24320 Store->getPointerInfo().getWithOffset(Offset),
24321 Store->getOriginalAlign(),
24322 Store->getMemOperand()->getFlags());
24323 Stores.push_back(Ch);
24325 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24328 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
24329 SelectionDAG &DAG) {
24330 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24331 SDLoc dl(St);
24332 SDValue StoredVal = St->getValue();
24334 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24335 if (StoredVal.getValueType().isVector() &&
24336 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24337 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24338 assert(NumElts <= 8 && "Unexpected VT");
24339 assert(!St->isTruncatingStore() && "Expected non-truncating store");
24340 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24341 "Expected AVX512F without AVX512DQI");
24343 // We must pad with zeros to ensure we store zeroes to any unused bits.
24344 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24345 DAG.getUNDEF(MVT::v16i1), StoredVal,
24346 DAG.getIntPtrConstant(0, dl));
24347 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24348 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24349 // Make sure we store zeros in the extra bits.
24350 if (NumElts < 8)
24351 StoredVal = DAG.getZeroExtendInReg(
24352 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24354 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24355 St->getPointerInfo(), St->getOriginalAlign(),
24356 St->getMemOperand()->getFlags());
24359 if (St->isTruncatingStore())
24360 return SDValue();
24362 // If this is a 256-bit store of concatenated ops, we are better off splitting
24363 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24364 // and each half can execute independently. Some cores would split the op into
24365 // halves anyway, so the concat (vinsertf128) is purely an extra op.
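// Sketch, assuming a v8i32 store of (concat_vectors (v4i32 A), (v4i32 B)):
//   store A, ptr
//   store B, ptr + 16
// rather than materializing the 256-bit concat just to store it.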
24366 MVT StoreVT = StoredVal.getSimpleValueType();
24367 if (StoreVT.is256BitVector() ||
24368 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24369 !Subtarget.hasBWI())) {
24370 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal.getNode(), DAG))
24371 return splitVectorStore(St, DAG);
24372 return SDValue();
24375 if (StoreVT.is32BitVector())
24376 return SDValue();
24378 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24379 assert(StoreVT.is64BitVector() && "Unexpected VT");
24380 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24381 TargetLowering::TypeWidenVector &&
24382 "Unexpected type action!");
24384 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24385 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24386 DAG.getUNDEF(StoreVT));
24388 if (Subtarget.hasSSE2()) {
24389 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24390 // and store it.
24391 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24392 MVT CastVT = MVT::getVectorVT(StVT, 2);
24393 StoredVal = DAG.getBitcast(CastVT, StoredVal);
24394 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24395 DAG.getIntPtrConstant(0, dl));
24397 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24398 St->getPointerInfo(), St->getOriginalAlign(),
24399 St->getMemOperand()->getFlags());
24401 assert(Subtarget.hasSSE1() && "Expected SSE");
24402 SDVTList Tys = DAG.getVTList(MVT::Other);
24403 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24404 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24405 St->getMemOperand());
24408 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
24409 // may emit an illegal shuffle but the expansion is still better than scalar
24410 // code. We generate sext/sext_invec for SEXTLOADs if they're available; otherwise
24411 // we'll emit a shuffle and an arithmetic shift.
24412 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24413 // TODO: It is possible to support ZExt by zeroing the undef values during
24414 // the shuffle phase or after the shuffle.
24415 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24416 SelectionDAG &DAG) {
24417 MVT RegVT = Op.getSimpleValueType();
24418 assert(RegVT.isVector() && "We only custom lower vector loads.");
24419 assert(RegVT.isInteger() &&
24420 "We only custom lower integer vector loads.");
24422 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24423 SDLoc dl(Ld);
24425 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24426 if (RegVT.getVectorElementType() == MVT::i1) {
24427 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24428 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24429 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24430 "Expected AVX512F without AVX512DQI");
24432 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24433 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24434 Ld->getMemOperand()->getFlags());
24436 // Replace chain users with the new chain.
24437 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24439 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24440 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24441 DAG.getBitcast(MVT::v16i1, Val),
24442 DAG.getIntPtrConstant(0, dl));
24443 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24446 return SDValue();
24449 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24450 /// each of which has no other use apart from the AND / OR.
24451 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24452 Opc = Op.getOpcode();
24453 if (Opc != ISD::OR && Opc != ISD::AND)
24454 return false;
24455 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24456 Op.getOperand(0).hasOneUse() &&
24457 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24458 Op.getOperand(1).hasOneUse());
24461 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24462 SDValue Chain = Op.getOperand(0);
24463 SDValue Cond = Op.getOperand(1);
24464 SDValue Dest = Op.getOperand(2);
24465 SDLoc dl(Op);
24467 // Bail out when we don't have native compare instructions.
24468 if (Cond.getOpcode() == ISD::SETCC &&
24469 Cond.getOperand(0).getValueType() != MVT::f128 &&
24470 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
24471 SDValue LHS = Cond.getOperand(0);
24472 SDValue RHS = Cond.getOperand(1);
24473 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24475 // Special case for
24476 // setcc([su]{add,sub,mul}o == 0)
24477 // setcc([su]{add,sub,mul}o != 1)
24478 if (ISD::isOverflowIntrOpRes(LHS) &&
24479 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24480 (isNullConstant(RHS) || isOneConstant(RHS))) {
24481 SDValue Value, Overflow;
24482 X86::CondCode X86Cond;
24483 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24485 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24486 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24488 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24489 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24490 Overflow);
24493 if (LHS.getSimpleValueType().isInteger()) {
24494 SDValue CCVal;
24495 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24496 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24497 EFLAGS);
24500 if (CC == ISD::SETOEQ) {
24501 // For FCMP_OEQ, we can emit
24502 // two branches instead of an explicit AND instruction with a
24503 // separate test. However, we only do this if this block doesn't
24504 // have a fall-through edge, because this requires an explicit
24505 // jmp when the condition is false.
24506 if (Op.getNode()->hasOneUse()) {
24507 SDNode *User = *Op.getNode()->use_begin();
24508 // Look for an unconditional branch following this conditional branch.
24509 // We need this because we need to reverse the successors in order
24510 // to implement FCMP_OEQ.
24511 if (User->getOpcode() == ISD::BR) {
24512 SDValue FalseBB = User->getOperand(1);
24513 SDNode *NewBR =
24514 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24515 assert(NewBR == User);
24516 (void)NewBR;
24517 Dest = FalseBB;
24519 SDValue Cmp =
24520 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24521 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24522 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24523 CCVal, Cmp);
24524 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24525 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24526 Cmp);
24529 } else if (CC == ISD::SETUNE) {
24530 // For FCMP_UNE, we can emit
24531 // two branches instead of an explicit OR instruction with a
24532 // separate test.
24533 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24534 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24535 Chain =
24536 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24537 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24538 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24539 Cmp);
24540 } else {
24541 X86::CondCode X86Cond =
24542 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24543 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24544 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24545 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24546 Cmp);
24550 if (ISD::isOverflowIntrOpRes(Cond)) {
24551 SDValue Value, Overflow;
24552 X86::CondCode X86Cond;
24553 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24555 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24556 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24557 Overflow);
24560 // Look past the truncate if the high bits are known zero.
24561 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24562 Cond = Cond.getOperand(0);
24564 EVT CondVT = Cond.getValueType();
24566 // Add an AND with 1 if we don't already have one.
24567 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24568 Cond =
24569 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24571 SDValue LHS = Cond;
24572 SDValue RHS = DAG.getConstant(0, dl, CondVT);
24574 SDValue CCVal;
24575 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24576 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24577 EFLAGS);
24580 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
24581 // Calls to _alloca are needed to probe the stack when allocating more than 4k
24582 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
24583 // that the guard pages used by the OS virtual memory manager are allocated in
24584 // correct sequence.
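// Illustration: a 12K allocation must touch the stack at roughly SP-4K, SP-8K
// and SP-12K, in that order, so each guard page is committed before the next
// one is reached; jumping straight to SP-12K could skip a guard page and fault.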
24585 SDValue
24586 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
24587 SelectionDAG &DAG) const {
24588 MachineFunction &MF = DAG.getMachineFunction();
24589 bool SplitStack = MF.shouldSplitStack();
24590 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
24591 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24592 SplitStack || EmitStackProbeCall;
24593 SDLoc dl(Op);
24595 // Get the inputs.
24596 SDNode *Node = Op.getNode();
24597 SDValue Chain = Op.getOperand(0);
24598 SDValue Size = Op.getOperand(1);
24599 MaybeAlign Alignment(Op.getConstantOperandVal(2));
24600 EVT VT = Node->getValueType(0);
24602 // Chain the dynamic stack allocation so that it doesn't modify the stack
24603 // pointer when other instructions are using the stack.
24604 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24606 bool Is64Bit = Subtarget.is64Bit();
24607 MVT SPTy = getPointerTy(DAG.getDataLayout());
24609 SDValue Result;
24610 if (!Lower) {
24611 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24612 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24613 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
24614 " not tell us which reg is the stack pointer!");
24616 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24617 const Align StackAlign = TFI.getStackAlign();
24618 if (hasInlineStackProbe(MF)) {
24619 MachineRegisterInfo &MRI = MF.getRegInfo();
24621 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24622 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24623 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24624 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24625 DAG.getRegister(Vreg, SPTy));
24626 } else {
24627 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24628 Chain = SP.getValue(1);
24629 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24631 if (Alignment && *Alignment > StackAlign)
24632 Result =
24633 DAG.getNode(ISD::AND, dl, VT, Result,
24634 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24635 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
24636 } else if (SplitStack) {
24637 MachineRegisterInfo &MRI = MF.getRegInfo();
24639 if (Is64Bit) {
24640 // The 64-bit implementation of segmented stacks needs to clobber both r10
24641 // and r11. This makes it impossible to use it along with nested parameters.
24642 const Function &F = MF.getFunction();
24643 for (const auto &A : F.args()) {
24644 if (A.hasNestAttr())
24645 report_fatal_error("Cannot use segmented stacks with functions that "
24646 "have nested arguments.");
24650 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24651 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24652 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24653 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
24654 DAG.getRegister(Vreg, SPTy));
24655 } else {
24656 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
24657 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
24658 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
24660 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24661 Register SPReg = RegInfo->getStackRegister();
24662 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
24663 Chain = SP.getValue(1);
24665 if (Alignment) {
24666 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
24667 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24668 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
24671 Result = SP;
24674 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
24676 SDValue Ops[2] = {Result, Chain};
24677 return DAG.getMergeValues(Ops, dl);
24680 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
24681 MachineFunction &MF = DAG.getMachineFunction();
24682 auto PtrVT = getPointerTy(MF.getDataLayout());
24683 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
24685 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24686 SDLoc DL(Op);
24688 if (!Subtarget.is64Bit() ||
24689 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
24690 // vastart just stores the address of the VarArgsFrameIndex slot into the
24691 // memory location argument.
24692 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24693 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
24694 MachinePointerInfo(SV));
24697 // __va_list_tag:
24698 // gp_offset (0 - 6 * 8)
24699 // fp_offset (48 - 48 + 8 * 16)
24700 // overflow_arg_area (point to parameters coming in memory).
24701 // reg_save_area
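// For reference, the SysV AMD64 va_list element is laid out roughly as:
//   struct __va_list_tag {
//     unsigned int gp_offset;
//     unsigned int fp_offset;
//     void *overflow_arg_area;
//     void *reg_save_area;
//   };
// matching the four stores below at offsets 0, 4, 8 and 16 (12 for x32).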
24702 SmallVector<SDValue, 8> MemOps;
24703 SDValue FIN = Op.getOperand(1);
24704 // Store gp_offset
24705 SDValue Store = DAG.getStore(
24706 Op.getOperand(0), DL,
24707 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
24708 MachinePointerInfo(SV));
24709 MemOps.push_back(Store);
24711 // Store fp_offset
24712 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
24713 Store = DAG.getStore(
24714 Op.getOperand(0), DL,
24715 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
24716 MachinePointerInfo(SV, 4));
24717 MemOps.push_back(Store);
24719 // Store ptr to overflow_arg_area
24720 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
24721 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24722 Store =
24723 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
24724 MemOps.push_back(Store);
24726 // Store ptr to reg_save_area.
24727 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
24728 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
24729 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
24730 Store = DAG.getStore(
24731 Op.getOperand(0), DL, RSFIN, FIN,
24732 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
24733 MemOps.push_back(Store);
24734 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
24737 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
24738 assert(Subtarget.is64Bit() &&
24739 "LowerVAARG only handles 64-bit va_arg!");
24740 assert(Op.getNumOperands() == 4);
24742 MachineFunction &MF = DAG.getMachineFunction();
24743 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
24744 // The Win64 ABI uses char* instead of a structure.
24745 return DAG.expandVAArg(Op.getNode());
24747 SDValue Chain = Op.getOperand(0);
24748 SDValue SrcPtr = Op.getOperand(1);
24749 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24750 unsigned Align = Op.getConstantOperandVal(3);
24751 SDLoc dl(Op);
24753 EVT ArgVT = Op.getNode()->getValueType(0);
24754 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24755 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
24756 uint8_t ArgMode;
24758 // Decide which area this value should be read from.
24759 // TODO: Implement the AMD64 ABI in its entirety. This simple
24760 // selection mechanism works only for the basic types.
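// Illustrative mapping for the cases handled here: float/double values are
// read from the XMM save area via fp_offset (ArgMode = 2), while integers and
// pointers are read from the GPR save area via gp_offset (ArgMode = 1).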
24761 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
24762 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
24763 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
24764 } else {
24765 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
24766 "Unhandled argument type in LowerVAARG");
24767 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
24770 if (ArgMode == 2) {
24771 // Make sure using fp_offset makes sense.
24772 assert(!Subtarget.useSoftFloat() &&
24773 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
24774 Subtarget.hasSSE1());
24777 // Insert VAARG node into the DAG
24778 // VAARG returns two values: Variable Argument Address, Chain
24779 SDValue InstOps[] = {Chain, SrcPtr,
24780 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
24781 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
24782 DAG.getTargetConstant(Align, dl, MVT::i32)};
24783 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
24784 SDValue VAARG = DAG.getMemIntrinsicNode(
24785 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
24786 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
24787 /*Alignment=*/std::nullopt,
24788 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
24789 Chain = VAARG.getValue(1);
24791 // Load the next argument and return it
24792 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
24795 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
24796 SelectionDAG &DAG) {
24797 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
24798 // where a va_list is still an i8*.
24799 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
24800 if (Subtarget.isCallingConvWin64(
24801 DAG.getMachineFunction().getFunction().getCallingConv()))
24802 // Probably a Win64 va_copy.
24803 return DAG.expandVACopy(Op.getNode());
24805 SDValue Chain = Op.getOperand(0);
24806 SDValue DstPtr = Op.getOperand(1);
24807 SDValue SrcPtr = Op.getOperand(2);
24808 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
24809 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
24810 SDLoc DL(Op);
24812 return DAG.getMemcpy(
24813 Chain, DL, DstPtr, SrcPtr,
24814 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
24815 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
24816 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
24819 // Helper to get immediate/variable SSE shift opcode from other shift opcodes.
24820 static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
24821 switch (Opc) {
24822 case ISD::SHL:
24823 case X86ISD::VSHL:
24824 case X86ISD::VSHLI:
24825 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
24826 case ISD::SRL:
24827 case X86ISD::VSRL:
24828 case X86ISD::VSRLI:
24829 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
24830 case ISD::SRA:
24831 case X86ISD::VSRA:
24832 case X86ISD::VSRAI:
24833 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
24835 llvm_unreachable("Unknown target vector shift node");
24838 /// Handle vector element shifts where the shift amount is a constant.
24839 /// Takes immediate version of shift as input.
24840 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
24841 SDValue SrcOp, uint64_t ShiftAmt,
24842 SelectionDAG &DAG) {
24843 MVT ElementType = VT.getVectorElementType();
24845 // Bitcast the source vector to the output type; this is mainly necessary for
24846 // vXi8/vXi64 shifts.
24847 if (VT != SrcOp.getSimpleValueType())
24848 SrcOp = DAG.getBitcast(VT, SrcOp);
24850 // Fold this packed shift into its first operand if ShiftAmt is 0.
24851 if (ShiftAmt == 0)
24852 return SrcOp;
24854 // Check for ShiftAmt >= element width
24855 if (ShiftAmt >= ElementType.getSizeInBits()) {
24856 if (Opc == X86ISD::VSRAI)
24857 ShiftAmt = ElementType.getSizeInBits() - 1;
24858 else
24859 return DAG.getConstant(0, dl, VT);
24862 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
24863 && "Unknown target vector shift-by-constant node");
24865 // Fold this packed vector shift into a build vector if SrcOp is a
24866 // vector of Constants or UNDEFs.
24867 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
24868 unsigned ShiftOpc;
24869 switch (Opc) {
24870 default: llvm_unreachable("Unknown opcode!");
24871 case X86ISD::VSHLI:
24872 ShiftOpc = ISD::SHL;
24873 break;
24874 case X86ISD::VSRLI:
24875 ShiftOpc = ISD::SRL;
24876 break;
24877 case X86ISD::VSRAI:
24878 ShiftOpc = ISD::SRA;
24879 break;
24882 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
24883 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
24884 return C;
24887 return DAG.getNode(Opc, dl, VT, SrcOp,
24888 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
24891 /// Handle vector element shifts by a splat shift amount
24892 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
24893 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
24894 const X86Subtarget &Subtarget,
24895 SelectionDAG &DAG) {
24896 MVT AmtVT = ShAmt.getSimpleValueType();
24897 assert(AmtVT.isVector() && "Vector shift type mismatch");
24898 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
24899 "Illegal vector splat index");
24901 // Move the splat element to the bottom element.
24902 if (ShAmtIdx != 0) {
24903 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
24904 Mask[0] = ShAmtIdx;
24905 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
24908 // Peek through any zext node if we can get back to a 128-bit source.
24909 if (AmtVT.getScalarSizeInBits() == 64 &&
24910 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
24911 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
24912 ShAmt.getOperand(0).getValueType().isSimple() &&
24913 ShAmt.getOperand(0).getValueType().is128BitVector()) {
24914 ShAmt = ShAmt.getOperand(0);
24915 AmtVT = ShAmt.getSimpleValueType();
24918 // See if we can mask off the upper elements using the existing source node.
24919 // The shift uses the entire lower 64-bits of the amount vector, so no need to
24920 // do this for vXi64 types.
24921 bool IsMasked = false;
24922 if (AmtVT.getScalarSizeInBits() < 64) {
24923 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
24924 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
24925 // If the shift amount has come from a scalar, then zero-extend the scalar
24926 // before moving to the vector.
24927 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
24928 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
24929 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
24930 AmtVT = MVT::v4i32;
24931 IsMasked = true;
24932 } else if (ShAmt.getOpcode() == ISD::AND) {
24933 // See if the shift amount is already masked (e.g. for rotation modulo);
24934 // if so, we can zero-extend it by setting all the other mask elements to
24935 // zero.
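// Sketch, assuming ShAmt = (and X, <31,31,31,31>) from a modulo-32 rotate:
//   MaskElts = <-1,0,0,0>, the folded constant becomes <31,0,0,0>, and the
//   rebuilt (and X, <31,0,0,0>) already has zeros above element 0.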
24936 SmallVector<SDValue> MaskElts(
24937 AmtVT.getVectorNumElements(),
24938 DAG.getConstant(0, dl, AmtVT.getScalarType()));
24939 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
24940 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
24941 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
24942 {ShAmt.getOperand(1), Mask}))) {
24943 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
24944 IsMasked = true;
24949 // Extract if the shift amount vector is larger than 128-bits.
24950 if (AmtVT.getSizeInBits() > 128) {
24951 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
24952 AmtVT = ShAmt.getSimpleValueType();
24955 // Zero-extend bottom element to v2i64 vector type, either by extension or
24956 // shuffle masking.
24957 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
24958 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
24959 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
24960 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
24961 } else if (Subtarget.hasSSE41()) {
24962 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
24963 MVT::v2i64, ShAmt);
24964 } else {
24965 SDValue ByteShift = DAG.getTargetConstant(
24966 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
24967 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
24968 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
24969 ByteShift);
24970 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
24971 ByteShift);
24975 // Change opcode to non-immediate version.
24976 Opc = getTargetVShiftUniformOpcode(Opc, true);
24978 // The return type has to be a 128-bit type with the same element
24979 // type as the input type.
24980 MVT EltVT = VT.getVectorElementType();
24981 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
24983 ShAmt = DAG.getBitcast(ShVT, ShAmt);
24984 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
24987 /// Return Mask with the necessary casting or extending
24988 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
24989 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
24990 const X86Subtarget &Subtarget, SelectionDAG &DAG,
24991 const SDLoc &dl) {
24993 if (isAllOnesConstant(Mask))
24994 return DAG.getConstant(1, dl, MaskVT);
24995 if (X86::isZeroNode(Mask))
24996 return DAG.getConstant(0, dl, MaskVT);
24998 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25000 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25001 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25002 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25003 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
25004 SDValue Lo, Hi;
25005 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
25006 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25007 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25008 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25009 } else {
25010 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25011 Mask.getSimpleValueType().getSizeInBits());
25012 // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
25013 // are extracted by EXTRACT_SUBVECTOR.
25014 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25015 DAG.getBitcast(BitcastVT, Mask),
25016 DAG.getIntPtrConstant(0, dl));
25020 /// Return (and \p Op, \p Mask) for compare instructions or
25021 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25022 /// necessary casting or extending for \p Mask when lowering masking intrinsics
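// Sketch, assuming a masked v16f32 add with mask M and pass-through P:
//   getVectorMaskingNode((fadd A, B), M, P, ...)
//     -> (vselect (v16i1 M), (fadd A, B), P)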
25023 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25024 SDValue PreservedSrc,
25025 const X86Subtarget &Subtarget,
25026 SelectionDAG &DAG) {
25027 MVT VT = Op.getSimpleValueType();
25028 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25029 unsigned OpcodeSelect = ISD::VSELECT;
25030 SDLoc dl(Op);
25032 if (isAllOnesConstant(Mask))
25033 return Op;
25035 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25037 if (PreservedSrc.isUndef())
25038 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25039 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25042 /// Creates an SDNode for a predicated scalar operation.
25043 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25044 /// The mask comes in as MVT::i8 and should be transformed
25045 /// to MVT::v1i1 while lowering masking intrinsics.
25046 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25047 /// "X86select" instead of "vselect". We just can't create the "vselect" node
25048 /// for a scalar instruction.
25049 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25050 SDValue PreservedSrc,
25051 const X86Subtarget &Subtarget,
25052 SelectionDAG &DAG) {
25054 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25055 if (MaskConst->getZExtValue() & 0x1)
25056 return Op;
25058 MVT VT = Op.getSimpleValueType();
25059 SDLoc dl(Op);
25061 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
25062 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25063 DAG.getBitcast(MVT::v8i1, Mask),
25064 DAG.getIntPtrConstant(0, dl));
25065 if (Op.getOpcode() == X86ISD::FSETCCM ||
25066 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25067 Op.getOpcode() == X86ISD::VFPCLASSS)
25068 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25070 if (PreservedSrc.isUndef())
25071 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25072 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25075 static int getSEHRegistrationNodeSize(const Function *Fn) {
25076 if (!Fn->hasPersonalityFn())
25077 report_fatal_error(
25078 "querying registration node size for function without personality");
25079 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25080 // WinEHStatePass for the full struct definition.
25081 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25082 case EHPersonality::MSVC_X86SEH: return 24;
25083 case EHPersonality::MSVC_CXX: return 16;
25084 default: break;
25086 report_fatal_error(
25087 "can only recover FP for 32-bit MSVC EH personality functions");
25090 /// When the MSVC runtime transfers control to us, either to an outlined
25091 /// function or when returning to a parent frame after catching an exception, we
25092 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25093 /// Here's the math:
25094 /// RegNodeBase = EntryEBP - RegNodeSize
25095 /// ParentFP = RegNodeBase - ParentFrameOffset
25096 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
25097 /// subtracting the offset (negative on x86) takes us back to the parent FP.
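/// Worked example with purely illustrative numbers (32-bit MSVC C++ EH, so
/// RegNodeSize = 16): if EntryEBP = 0x1000 and ParentFrameOffset = -0x40, then
///   RegNodeBase = 0x1000 - 16      = 0x0FF0
///   ParentFP    = 0x0FF0 - (-0x40) = 0x1030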
25098 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25099 SDValue EntryEBP) {
25100 MachineFunction &MF = DAG.getMachineFunction();
25101 SDLoc dl;
25103 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25104 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25106 // It's possible that the parent function no longer has a personality function
25107 // if the exceptional code was optimized away, in which case we just return
25108 // the incoming EBP.
25109 if (!Fn->hasPersonalityFn())
25110 return EntryEBP;
25112 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25113 // registration, or the .set_setframe offset.
25114 MCSymbol *OffsetSym =
25115 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
25116 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25117 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25118 SDValue ParentFrameOffset =
25119 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25121 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25122 // prologue to RBP in the parent function.
25123 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
25124 if (Subtarget.is64Bit())
25125 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25127 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25128 // RegNodeBase = EntryEBP - RegNodeSize
25129 // ParentFP = RegNodeBase - ParentFrameOffset
25130 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25131 DAG.getConstant(RegNodeSize, dl, PtrVT));
25132 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25135 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25136 SelectionDAG &DAG) const {
25137 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25138 auto isRoundModeCurDirection = [](SDValue Rnd) {
25139 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25140 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25142 return false;
25144 auto isRoundModeSAE = [](SDValue Rnd) {
25145 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25146 unsigned RC = C->getZExtValue();
25147 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25148 // Clear the NO_EXC bit and check remaining bits.
25149 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25150 // As a convenience we allow either no other bits set, or explicitly
25151 // the current-direction rounding mode.
25152 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25156 return false;
25158 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25159 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25160 RC = C->getZExtValue();
25161 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25162 // Clear the NO_EXC bit and check remaining bits.
25163 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25164 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25165 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25166 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25167 RC == X86::STATIC_ROUNDING::TO_ZERO;
25171 return false;
25174 SDLoc dl(Op);
25175 unsigned IntNo = Op.getConstantOperandVal(0);
25176 MVT VT = Op.getSimpleValueType();
25177 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25179 // Propagate flags from original node to transformed node(s).
25180 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25182 if (IntrData) {
25183 switch(IntrData->Type) {
25184 case INTR_TYPE_1OP: {
25185 // We specify 2 possible opcodes for intrinsics with rounding modes.
25186 // First, we check if the intrinsic may have non-default rounding mode,
25187 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25188 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25189 if (IntrWithRoundingModeOpcode != 0) {
25190 SDValue Rnd = Op.getOperand(2);
25191 unsigned RC = 0;
25192 if (isRoundModeSAEToX(Rnd, RC))
25193 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25194 Op.getOperand(1),
25195 DAG.getTargetConstant(RC, dl, MVT::i32));
25196 if (!isRoundModeCurDirection(Rnd))
25197 return SDValue();
25199 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25200 Op.getOperand(1));
25202 case INTR_TYPE_1OP_SAE: {
25203 SDValue Sae = Op.getOperand(2);
25205 unsigned Opc;
25206 if (isRoundModeCurDirection(Sae))
25207 Opc = IntrData->Opc0;
25208 else if (isRoundModeSAE(Sae))
25209 Opc = IntrData->Opc1;
25210 else
25211 return SDValue();
25213 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25215 case INTR_TYPE_2OP: {
25216 SDValue Src2 = Op.getOperand(2);
25218 // We specify 2 possible opcodes for intrinsics with rounding modes.
25219 // First, we check if the intrinsic may have non-default rounding mode,
25220 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25221 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25222 if (IntrWithRoundingModeOpcode != 0) {
25223 SDValue Rnd = Op.getOperand(3);
25224 unsigned RC = 0;
25225 if (isRoundModeSAEToX(Rnd, RC))
25226 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25227 Op.getOperand(1), Src2,
25228 DAG.getTargetConstant(RC, dl, MVT::i32));
25229 if (!isRoundModeCurDirection(Rnd))
25230 return SDValue();
25233 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25234 Op.getOperand(1), Src2);
25236 case INTR_TYPE_2OP_SAE: {
25237 SDValue Sae = Op.getOperand(3);
25239 unsigned Opc;
25240 if (isRoundModeCurDirection(Sae))
25241 Opc = IntrData->Opc0;
25242 else if (isRoundModeSAE(Sae))
25243 Opc = IntrData->Opc1;
25244 else
25245 return SDValue();
25247 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25248 Op.getOperand(2));
25250 case INTR_TYPE_3OP:
25251 case INTR_TYPE_3OP_IMM8: {
25252 SDValue Src1 = Op.getOperand(1);
25253 SDValue Src2 = Op.getOperand(2);
25254 SDValue Src3 = Op.getOperand(3);
25256 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25257 Src3.getValueType() != MVT::i8) {
25258 Src3 = DAG.getTargetConstant(
25259 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
25262 // We specify 2 possible opcodes for intrinsics with rounding modes.
25263 // First, we check if the intrinsic may have non-default rounding mode,
25264 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25265 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25266 if (IntrWithRoundingModeOpcode != 0) {
25267 SDValue Rnd = Op.getOperand(4);
25268 unsigned RC = 0;
25269 if (isRoundModeSAEToX(Rnd, RC))
25270 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25271 Src1, Src2, Src3,
25272 DAG.getTargetConstant(RC, dl, MVT::i32));
25273 if (!isRoundModeCurDirection(Rnd))
25274 return SDValue();
25277 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25278 {Src1, Src2, Src3});
25280 case INTR_TYPE_4OP_IMM8: {
25281 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25282 SDValue Src4 = Op.getOperand(4);
25283 if (Src4.getValueType() != MVT::i8) {
25284 Src4 = DAG.getTargetConstant(
25285 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
25288 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25289 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25290 Src4);
25292 case INTR_TYPE_1OP_MASK: {
25293 SDValue Src = Op.getOperand(1);
25294 SDValue PassThru = Op.getOperand(2);
25295 SDValue Mask = Op.getOperand(3);
25296 // We add rounding mode to the Node when
25297 // - RC Opcode is specified and
25298 // - RC is not "current direction".
25299 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25300 if (IntrWithRoundingModeOpcode != 0) {
25301 SDValue Rnd = Op.getOperand(4);
25302 unsigned RC = 0;
25303 if (isRoundModeSAEToX(Rnd, RC))
25304 return getVectorMaskingNode(
25305 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25306 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25307 Mask, PassThru, Subtarget, DAG);
25308 if (!isRoundModeCurDirection(Rnd))
25309 return SDValue();
25311 return getVectorMaskingNode(
25312 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25313 Subtarget, DAG);
25315 case INTR_TYPE_1OP_MASK_SAE: {
25316 SDValue Src = Op.getOperand(1);
25317 SDValue PassThru = Op.getOperand(2);
25318 SDValue Mask = Op.getOperand(3);
25319 SDValue Rnd = Op.getOperand(4);
25321 unsigned Opc;
25322 if (isRoundModeCurDirection(Rnd))
25323 Opc = IntrData->Opc0;
25324 else if (isRoundModeSAE(Rnd))
25325 Opc = IntrData->Opc1;
25326 else
25327 return SDValue();
25329 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25330 Subtarget, DAG);
25332 case INTR_TYPE_SCALAR_MASK: {
25333 SDValue Src1 = Op.getOperand(1);
25334 SDValue Src2 = Op.getOperand(2);
25335 SDValue passThru = Op.getOperand(3);
25336 SDValue Mask = Op.getOperand(4);
25337 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25338 // There are 2 kinds of intrinsics in this group:
25339 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
25340 // (2) With rounding mode and sae - 7 operands.
25341 bool HasRounding = IntrWithRoundingModeOpcode != 0;
25342 if (Op.getNumOperands() == (5U + HasRounding)) {
25343 if (HasRounding) {
25344 SDValue Rnd = Op.getOperand(5);
25345 unsigned RC = 0;
25346 if (isRoundModeSAEToX(Rnd, RC))
25347 return getScalarMaskingNode(
25348 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25349 DAG.getTargetConstant(RC, dl, MVT::i32)),
25350 Mask, passThru, Subtarget, DAG);
25351 if (!isRoundModeCurDirection(Rnd))
25352 return SDValue();
25354 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25355 Src2),
25356 Mask, passThru, Subtarget, DAG);
25359 assert(Op.getNumOperands() == (6U + HasRounding) &&
25360 "Unexpected intrinsic form");
25361 SDValue RoundingMode = Op.getOperand(5);
25362 unsigned Opc = IntrData->Opc0;
25363 if (HasRounding) {
25364 SDValue Sae = Op.getOperand(6);
25365 if (isRoundModeSAE(Sae))
25366 Opc = IntrWithRoundingModeOpcode;
25367 else if (!isRoundModeCurDirection(Sae))
25368 return SDValue();
25370 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25371 Src2, RoundingMode),
25372 Mask, passThru, Subtarget, DAG);
25374 case INTR_TYPE_SCALAR_MASK_RND: {
25375 SDValue Src1 = Op.getOperand(1);
25376 SDValue Src2 = Op.getOperand(2);
25377 SDValue passThru = Op.getOperand(3);
25378 SDValue Mask = Op.getOperand(4);
25379 SDValue Rnd = Op.getOperand(5);
25381 SDValue NewOp;
25382 unsigned RC = 0;
25383 if (isRoundModeCurDirection(Rnd))
25384 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25385 else if (isRoundModeSAEToX(Rnd, RC))
25386 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25387 DAG.getTargetConstant(RC, dl, MVT::i32));
25388 else
25389 return SDValue();
25391 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25393 case INTR_TYPE_SCALAR_MASK_SAE: {
25394 SDValue Src1 = Op.getOperand(1);
25395 SDValue Src2 = Op.getOperand(2);
25396 SDValue passThru = Op.getOperand(3);
25397 SDValue Mask = Op.getOperand(4);
25398 SDValue Sae = Op.getOperand(5);
25399 unsigned Opc;
25400 if (isRoundModeCurDirection(Sae))
25401 Opc = IntrData->Opc0;
25402 else if (isRoundModeSAE(Sae))
25403 Opc = IntrData->Opc1;
25404 else
25405 return SDValue();
25407 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25408 Mask, passThru, Subtarget, DAG);
25410 case INTR_TYPE_2OP_MASK: {
25411 SDValue Src1 = Op.getOperand(1);
25412 SDValue Src2 = Op.getOperand(2);
25413 SDValue PassThru = Op.getOperand(3);
25414 SDValue Mask = Op.getOperand(4);
25415 SDValue NewOp;
25416 if (IntrData->Opc1 != 0) {
25417 SDValue Rnd = Op.getOperand(5);
25418 unsigned RC = 0;
25419 if (isRoundModeSAEToX(Rnd, RC))
25420 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25421 DAG.getTargetConstant(RC, dl, MVT::i32));
25422 else if (!isRoundModeCurDirection(Rnd))
25423 return SDValue();
25425 if (!NewOp)
25426 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25427 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25429 case INTR_TYPE_2OP_MASK_SAE: {
25430 SDValue Src1 = Op.getOperand(1);
25431 SDValue Src2 = Op.getOperand(2);
25432 SDValue PassThru = Op.getOperand(3);
25433 SDValue Mask = Op.getOperand(4);
25435 unsigned Opc = IntrData->Opc0;
25436 if (IntrData->Opc1 != 0) {
25437 SDValue Sae = Op.getOperand(5);
25438 if (isRoundModeSAE(Sae))
25439 Opc = IntrData->Opc1;
25440 else if (!isRoundModeCurDirection(Sae))
25441 return SDValue();
25444 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25445 Mask, PassThru, Subtarget, DAG);
25447 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25448 SDValue Src1 = Op.getOperand(1);
25449 SDValue Src2 = Op.getOperand(2);
25450 SDValue Src3 = Op.getOperand(3);
25451 SDValue PassThru = Op.getOperand(4);
25452 SDValue Mask = Op.getOperand(5);
25453 SDValue Sae = Op.getOperand(6);
25454 unsigned Opc;
25455 if (isRoundModeCurDirection(Sae))
25456 Opc = IntrData->Opc0;
25457 else if (isRoundModeSAE(Sae))
25458 Opc = IntrData->Opc1;
25459 else
25460 return SDValue();
25462 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25463 Mask, PassThru, Subtarget, DAG);
25465 case INTR_TYPE_3OP_MASK_SAE: {
25466 SDValue Src1 = Op.getOperand(1);
25467 SDValue Src2 = Op.getOperand(2);
25468 SDValue Src3 = Op.getOperand(3);
25469 SDValue PassThru = Op.getOperand(4);
25470 SDValue Mask = Op.getOperand(5);
25472 unsigned Opc = IntrData->Opc0;
25473 if (IntrData->Opc1 != 0) {
25474 SDValue Sae = Op.getOperand(6);
25475 if (isRoundModeSAE(Sae))
25476 Opc = IntrData->Opc1;
25477 else if (!isRoundModeCurDirection(Sae))
25478 return SDValue();
25480 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25481 Mask, PassThru, Subtarget, DAG);
25483 case BLENDV: {
25484 SDValue Src1 = Op.getOperand(1);
25485 SDValue Src2 = Op.getOperand(2);
25486 SDValue Src3 = Op.getOperand(3);
25488 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25489 Src3 = DAG.getBitcast(MaskVT, Src3);
25491 // Reverse the operands to match VSELECT order.
25492 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25494 case VPERM_2OP : {
25495 SDValue Src1 = Op.getOperand(1);
25496 SDValue Src2 = Op.getOperand(2);
25498 // Swap Src1 and Src2 in the node creation
25499 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
25501 case CFMA_OP_MASKZ:
25502 case CFMA_OP_MASK: {
25503 SDValue Src1 = Op.getOperand(1);
25504 SDValue Src2 = Op.getOperand(2);
25505 SDValue Src3 = Op.getOperand(3);
25506 SDValue Mask = Op.getOperand(4);
25507 MVT VT = Op.getSimpleValueType();
25509 SDValue PassThru = Src3;
25510 if (IntrData->Type == CFMA_OP_MASKZ)
25511 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
25513 // We add rounding mode to the Node when
25514 // - RC Opcode is specified and
25515 // - RC is not "current direction".
25516 SDValue NewOp;
25517 if (IntrData->Opc1 != 0) {
25518 SDValue Rnd = Op.getOperand(5);
25519 unsigned RC = 0;
25520 if (isRoundModeSAEToX(Rnd, RC))
25521 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
25522 DAG.getTargetConstant(RC, dl, MVT::i32));
25523 else if (!isRoundModeCurDirection(Rnd))
25524 return SDValue();
25526 if (!NewOp)
25527 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
25528 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25530 case IFMA_OP:
25531 // NOTE: We need to swizzle the operands to pass the multiply operands
25532 // first.
25533 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25534 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25535 case FPCLASSS: {
25536 SDValue Src1 = Op.getOperand(1);
25537 SDValue Imm = Op.getOperand(2);
25538 SDValue Mask = Op.getOperand(3);
25539 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25540 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
25541 Subtarget, DAG);
25542 // Need to fill with zeros to ensure the bitcast will produce zeroes
25543 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25544 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25545 DAG.getConstant(0, dl, MVT::v8i1),
25546 FPclassMask, DAG.getIntPtrConstant(0, dl));
25547 return DAG.getBitcast(MVT::i8, Ins);
25550 case CMP_MASK_CC: {
25551 MVT MaskVT = Op.getSimpleValueType();
25552 SDValue CC = Op.getOperand(3);
25553 SDValue Mask = Op.getOperand(4);
25554 // We specify 2 possible opcodes for intrinsics with rounding modes.
25555 // First, we check if the intrinsic may have non-default rounding mode,
25556 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25557 if (IntrData->Opc1 != 0) {
25558 SDValue Sae = Op.getOperand(5);
25559 if (isRoundModeSAE(Sae))
25560 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25561 Op.getOperand(2), CC, Mask, Sae);
25562 if (!isRoundModeCurDirection(Sae))
25563 return SDValue();
25565 // Default rounding mode.
25566 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25567 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25569 case CMP_MASK_SCALAR_CC: {
25570 SDValue Src1 = Op.getOperand(1);
25571 SDValue Src2 = Op.getOperand(2);
25572 SDValue CC = Op.getOperand(3);
25573 SDValue Mask = Op.getOperand(4);
25575 SDValue Cmp;
25576 if (IntrData->Opc1 != 0) {
25577 SDValue Sae = Op.getOperand(5);
25578 if (isRoundModeSAE(Sae))
25579 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25580 else if (!isRoundModeCurDirection(Sae))
25581 return SDValue();
25583 // Default rounding mode.
25584 if (!Cmp.getNode())
25585 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25587 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25588 Subtarget, DAG);
25589 // Need to fill with zeros to ensure the bitcast will produce zeroes
25590 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25591 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25592 DAG.getConstant(0, dl, MVT::v8i1),
25593 CmpMask, DAG.getIntPtrConstant(0, dl));
25594 return DAG.getBitcast(MVT::i8, Ins);
25596 case COMI: { // Comparison intrinsics
25597 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25598 SDValue LHS = Op.getOperand(1);
25599 SDValue RHS = Op.getOperand(2);
25600 // Some conditions require the operands to be swapped.
25601 if (CC == ISD::SETLT || CC == ISD::SETLE)
25602 std::swap(LHS, RHS);
25604 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25605 SDValue SetCC;
25606 switch (CC) {
25607 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
25608 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25609 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25610 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25611 break;
25613 case ISD::SETNE: { // (ZF = 1 or PF = 1)
25614 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25615 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25616 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25617 break;
25619 case ISD::SETGT: // (CF = 0 and ZF = 0)
25620 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25621 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25622 break;
25624 case ISD::SETGE: // CF = 0
25625 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25626 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25627 break;
25628 default:
25629 llvm_unreachable("Unexpected illegal condition!");
25631 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25633 case COMI_RM: { // Comparison intrinsics with Sae
25634 SDValue LHS = Op.getOperand(1);
25635 SDValue RHS = Op.getOperand(2);
25636 unsigned CondVal = Op.getConstantOperandVal(3);
25637 SDValue Sae = Op.getOperand(4);
25639 SDValue FCmp;
25640 if (isRoundModeCurDirection(Sae))
25641 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
25642 DAG.getTargetConstant(CondVal, dl, MVT::i8));
25643 else if (isRoundModeSAE(Sae))
25644 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
25645 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
25646 else
25647 return SDValue();
25648 // Need to fill with zeros to ensure the bitcast will produce zeroes
25649 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25650 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25651 DAG.getConstant(0, dl, MVT::v16i1),
25652 FCmp, DAG.getIntPtrConstant(0, dl));
25653 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
25654 DAG.getBitcast(MVT::i16, Ins));
25656 case VSHIFT: {
25657 SDValue SrcOp = Op.getOperand(1);
25658 SDValue ShAmt = Op.getOperand(2);
25659 assert(ShAmt.getValueType() == MVT::i32 &&
25660 "Unexpected VSHIFT amount type");
25662 // Catch shift-by-constant.
25663 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
25664 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
25665 Op.getSimpleValueType(), SrcOp,
25666 CShAmt->getZExtValue(), DAG);
25668 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25669 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
25670 SrcOp, ShAmt, 0, Subtarget, DAG);
25672 case COMPRESS_EXPAND_IN_REG: {
25673 SDValue Mask = Op.getOperand(3);
25674 SDValue DataToCompress = Op.getOperand(1);
25675 SDValue PassThru = Op.getOperand(2);
25676 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
25677 return Op.getOperand(1);
25679 // Avoid false dependency.
25680 if (PassThru.isUndef())
25681 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
25683 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
25684 Mask);
25686 case FIXUPIMM:
25687 case FIXUPIMM_MASKZ: {
25688 SDValue Src1 = Op.getOperand(1);
25689 SDValue Src2 = Op.getOperand(2);
25690 SDValue Src3 = Op.getOperand(3);
25691 SDValue Imm = Op.getOperand(4);
25692 SDValue Mask = Op.getOperand(5);
25693 SDValue Passthru = (IntrData->Type == FIXUPIMM)
25694 ? Src1
25695 : getZeroVector(VT, Subtarget, DAG, dl);
25697 unsigned Opc = IntrData->Opc0;
25698 if (IntrData->Opc1 != 0) {
25699 SDValue Sae = Op.getOperand(6);
25700 if (isRoundModeSAE(Sae))
25701 Opc = IntrData->Opc1;
25702 else if (!isRoundModeCurDirection(Sae))
25703 return SDValue();
25706 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
25708 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
25709 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25711 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25713 case ROUNDP: {
25714 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
25715 // Clear the upper bits of the rounding immediate so that the legacy
25716 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
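// (Illustrative: VRNDSCALE interprets imm[7:4] as a 2^-M scaling factor, so
// e.g. a legacy immediate of 0x1B is reduced to 0x0B here; the low four
// bits, which select the rounding behaviour, are preserved.)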
25717 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
25718 SDValue RoundingMode =
25719 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25720 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25721 Op.getOperand(1), RoundingMode);
25723 case ROUNDS: {
25724 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
25725 // Clear the upper bits of the rounding immediate so that the legacy
25726 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
25727 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
25728 SDValue RoundingMode =
25729 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
25730 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25731 Op.getOperand(1), Op.getOperand(2), RoundingMode);
25733 case BEXTRI: {
25734 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
25736 uint64_t Imm = Op.getConstantOperandVal(2);
25737 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
25738 Op.getValueType());
25739 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25740 Op.getOperand(1), Control);
25742 // ADC/SBB
25743 case ADX: {
25744 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
25745 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
25747 SDValue Res;
25748 // If the carry in is zero, then we should just use ADD/SUB instead of
25749 // ADC/SBB.
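// (Otherwise, CF is materialized by adding 0xFF to the i8 carry-in: any
// nonzero carry wraps the 8-bit add and sets CF, a zero carry does not.
// That regenerated CF then feeds the ADC/SBB node built in the else branch
// below.)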
25750 if (isNullConstant(Op.getOperand(1))) {
25751 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
25752 Op.getOperand(3));
25753 } else {
25754 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
25755 DAG.getConstant(-1, dl, MVT::i8));
25756 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
25757 Op.getOperand(3), GenCF.getValue(1));
25759 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
25760 SDValue Results[] = { SetCC, Res };
25761 return DAG.getMergeValues(Results, dl);
25763 case CVTPD2PS_MASK:
25764 case CVTPD2DQ_MASK:
25765 case CVTQQ2PS_MASK:
25766 case TRUNCATE_TO_REG: {
25767 SDValue Src = Op.getOperand(1);
25768 SDValue PassThru = Op.getOperand(2);
25769 SDValue Mask = Op.getOperand(3);
25771 if (isAllOnesConstant(Mask))
25772 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25774 MVT SrcVT = Src.getSimpleValueType();
25775 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25776 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25777 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
25778 {Src, PassThru, Mask});
25780 case CVTPS2PH_MASK: {
25781 SDValue Src = Op.getOperand(1);
25782 SDValue Rnd = Op.getOperand(2);
25783 SDValue PassThru = Op.getOperand(3);
25784 SDValue Mask = Op.getOperand(4);
25786 unsigned RC = 0;
25787 unsigned Opc = IntrData->Opc0;
25788 bool SAE = Src.getValueType().is512BitVector() &&
25789 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
25790 if (SAE) {
25791 Opc = X86ISD::CVTPS2PH_SAE;
25792 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
25795 if (isAllOnesConstant(Mask))
25796 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
25798 if (SAE)
25799 Opc = X86ISD::MCVTPS2PH_SAE;
25800 else
25801 Opc = IntrData->Opc1;
25802 MVT SrcVT = Src.getSimpleValueType();
25803 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
25804 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25805 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
25807 case CVTNEPS2BF16_MASK: {
25808 SDValue Src = Op.getOperand(1);
25809 SDValue PassThru = Op.getOperand(2);
25810 SDValue Mask = Op.getOperand(3);
25812 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
25813 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
25815 // Break false dependency.
25816 if (PassThru.isUndef())
25817 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
25819 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
25820 Mask);
25822 default:
25823 break;
25827 switch (IntNo) {
25828 default: return SDValue(); // Don't custom lower most intrinsics.
25830 // ptest and testp intrinsics. The intrinsics these come from are designed to
25831 // return an integer value, not just an instruction, so lower them to the
25832 // ptest or testp pattern plus a setcc for the result.
25833 case Intrinsic::x86_avx512_ktestc_b:
25834 case Intrinsic::x86_avx512_ktestc_w:
25835 case Intrinsic::x86_avx512_ktestc_d:
25836 case Intrinsic::x86_avx512_ktestc_q:
25837 case Intrinsic::x86_avx512_ktestz_b:
25838 case Intrinsic::x86_avx512_ktestz_w:
25839 case Intrinsic::x86_avx512_ktestz_d:
25840 case Intrinsic::x86_avx512_ktestz_q:
25841 case Intrinsic::x86_sse41_ptestz:
25842 case Intrinsic::x86_sse41_ptestc:
25843 case Intrinsic::x86_sse41_ptestnzc:
25844 case Intrinsic::x86_avx_ptestz_256:
25845 case Intrinsic::x86_avx_ptestc_256:
25846 case Intrinsic::x86_avx_ptestnzc_256:
25847 case Intrinsic::x86_avx_vtestz_ps:
25848 case Intrinsic::x86_avx_vtestc_ps:
25849 case Intrinsic::x86_avx_vtestnzc_ps:
25850 case Intrinsic::x86_avx_vtestz_pd:
25851 case Intrinsic::x86_avx_vtestc_pd:
25852 case Intrinsic::x86_avx_vtestnzc_pd:
25853 case Intrinsic::x86_avx_vtestz_ps_256:
25854 case Intrinsic::x86_avx_vtestc_ps_256:
25855 case Intrinsic::x86_avx_vtestnzc_ps_256:
25856 case Intrinsic::x86_avx_vtestz_pd_256:
25857 case Intrinsic::x86_avx_vtestc_pd_256:
25858 case Intrinsic::x86_avx_vtestnzc_pd_256: {
25859 unsigned TestOpc = X86ISD::PTEST;
25860 X86::CondCode X86CC;
25861 switch (IntNo) {
25862 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
25863 case Intrinsic::x86_avx512_ktestc_b:
25864 case Intrinsic::x86_avx512_ktestc_w:
25865 case Intrinsic::x86_avx512_ktestc_d:
25866 case Intrinsic::x86_avx512_ktestc_q:
25867 // CF = 1
25868 TestOpc = X86ISD::KTEST;
25869 X86CC = X86::COND_B;
25870 break;
25871 case Intrinsic::x86_avx512_ktestz_b:
25872 case Intrinsic::x86_avx512_ktestz_w:
25873 case Intrinsic::x86_avx512_ktestz_d:
25874 case Intrinsic::x86_avx512_ktestz_q:
25875 TestOpc = X86ISD::KTEST;
25876 X86CC = X86::COND_E;
25877 break;
25878 case Intrinsic::x86_avx_vtestz_ps:
25879 case Intrinsic::x86_avx_vtestz_pd:
25880 case Intrinsic::x86_avx_vtestz_ps_256:
25881 case Intrinsic::x86_avx_vtestz_pd_256:
25882 TestOpc = X86ISD::TESTP;
25883 [[fallthrough]];
25884 case Intrinsic::x86_sse41_ptestz:
25885 case Intrinsic::x86_avx_ptestz_256:
25886 // ZF = 1
25887 X86CC = X86::COND_E;
25888 break;
25889 case Intrinsic::x86_avx_vtestc_ps:
25890 case Intrinsic::x86_avx_vtestc_pd:
25891 case Intrinsic::x86_avx_vtestc_ps_256:
25892 case Intrinsic::x86_avx_vtestc_pd_256:
25893 TestOpc = X86ISD::TESTP;
25894 [[fallthrough]];
25895 case Intrinsic::x86_sse41_ptestc:
25896 case Intrinsic::x86_avx_ptestc_256:
25897 // CF = 1
25898 X86CC = X86::COND_B;
25899 break;
25900 case Intrinsic::x86_avx_vtestnzc_ps:
25901 case Intrinsic::x86_avx_vtestnzc_pd:
25902 case Intrinsic::x86_avx_vtestnzc_ps_256:
25903 case Intrinsic::x86_avx_vtestnzc_pd_256:
25904 TestOpc = X86ISD::TESTP;
25905 [[fallthrough]];
25906 case Intrinsic::x86_sse41_ptestnzc:
25907 case Intrinsic::x86_avx_ptestnzc_256:
25908 // ZF and CF = 0
25909 X86CC = X86::COND_A;
25910 break;
25913 SDValue LHS = Op.getOperand(1);
25914 SDValue RHS = Op.getOperand(2);
25915 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
25916 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
25917 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25920 case Intrinsic::x86_sse42_pcmpistria128:
25921 case Intrinsic::x86_sse42_pcmpestria128:
25922 case Intrinsic::x86_sse42_pcmpistric128:
25923 case Intrinsic::x86_sse42_pcmpestric128:
25924 case Intrinsic::x86_sse42_pcmpistrio128:
25925 case Intrinsic::x86_sse42_pcmpestrio128:
25926 case Intrinsic::x86_sse42_pcmpistris128:
25927 case Intrinsic::x86_sse42_pcmpestris128:
25928 case Intrinsic::x86_sse42_pcmpistriz128:
25929 case Intrinsic::x86_sse42_pcmpestriz128: {
25930 unsigned Opcode;
25931 X86::CondCode X86CC;
25932 switch (IntNo) {
25933 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
25934 case Intrinsic::x86_sse42_pcmpistria128:
25935 Opcode = X86ISD::PCMPISTR;
25936 X86CC = X86::COND_A;
25937 break;
25938 case Intrinsic::x86_sse42_pcmpestria128:
25939 Opcode = X86ISD::PCMPESTR;
25940 X86CC = X86::COND_A;
25941 break;
25942 case Intrinsic::x86_sse42_pcmpistric128:
25943 Opcode = X86ISD::PCMPISTR;
25944 X86CC = X86::COND_B;
25945 break;
25946 case Intrinsic::x86_sse42_pcmpestric128:
25947 Opcode = X86ISD::PCMPESTR;
25948 X86CC = X86::COND_B;
25949 break;
25950 case Intrinsic::x86_sse42_pcmpistrio128:
25951 Opcode = X86ISD::PCMPISTR;
25952 X86CC = X86::COND_O;
25953 break;
25954 case Intrinsic::x86_sse42_pcmpestrio128:
25955 Opcode = X86ISD::PCMPESTR;
25956 X86CC = X86::COND_O;
25957 break;
25958 case Intrinsic::x86_sse42_pcmpistris128:
25959 Opcode = X86ISD::PCMPISTR;
25960 X86CC = X86::COND_S;
25961 break;
25962 case Intrinsic::x86_sse42_pcmpestris128:
25963 Opcode = X86ISD::PCMPESTR;
25964 X86CC = X86::COND_S;
25965 break;
25966 case Intrinsic::x86_sse42_pcmpistriz128:
25967 Opcode = X86ISD::PCMPISTR;
25968 X86CC = X86::COND_E;
25969 break;
25970 case Intrinsic::x86_sse42_pcmpestriz128:
25971 Opcode = X86ISD::PCMPESTR;
25972 X86CC = X86::COND_E;
25973 break;
25975 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
25976 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25977 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
25978 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
25979 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25982 case Intrinsic::x86_sse42_pcmpistri128:
25983 case Intrinsic::x86_sse42_pcmpestri128: {
25984 unsigned Opcode;
25985 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
25986 Opcode = X86ISD::PCMPISTR;
25987 else
25988 Opcode = X86ISD::PCMPESTR;
25990 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
25991 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
25992 return DAG.getNode(Opcode, dl, VTs, NewOps);
25995 case Intrinsic::x86_sse42_pcmpistrm128:
25996 case Intrinsic::x86_sse42_pcmpestrm128: {
25997 unsigned Opcode;
25998 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
25999 Opcode = X86ISD::PCMPISTR;
26000 else
26001 Opcode = X86ISD::PCMPESTR;
26003 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26004 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26005 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26008 case Intrinsic::eh_sjlj_lsda: {
26009 MachineFunction &MF = DAG.getMachineFunction();
26010 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26011 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26012 auto &Context = MF.getMMI().getContext();
26013 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26014 Twine(MF.getFunctionNumber()));
26015 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
26016 DAG.getMCSymbol(S, PtrVT));
26019 case Intrinsic::x86_seh_lsda: {
26020 // Compute the symbol for the LSDA. We know it'll get emitted later.
26021 MachineFunction &MF = DAG.getMachineFunction();
26022 SDValue Op1 = Op.getOperand(1);
26023 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26024 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
26025 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26027 // Generate a simple absolute symbol reference. This intrinsic is only
26028 // supported on 32-bit Windows, which isn't PIC.
26029 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26030 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26033 case Intrinsic::eh_recoverfp: {
26034 SDValue FnOp = Op.getOperand(1);
26035 SDValue IncomingFPOp = Op.getOperand(2);
26036 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26037 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26038 if (!Fn)
26039 report_fatal_error(
26040 "llvm.eh.recoverfp must take a function as the first argument");
26041 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26044 case Intrinsic::localaddress: {
26045 // Returns one of the stack, base, or frame pointer registers, depending on
26046 // which is used to reference local variables.
26047 MachineFunction &MF = DAG.getMachineFunction();
26048 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26049 unsigned Reg;
26050 if (RegInfo->hasBasePointer(MF))
26051 Reg = RegInfo->getBaseRegister();
26052 else { // Handles the SP or FP case.
26053 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26054 if (CantUseFP)
26055 Reg = RegInfo->getPtrSizedStackRegister(MF);
26056 else
26057 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26059 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26061 case Intrinsic::x86_avx512_vp2intersect_q_512:
26062 case Intrinsic::x86_avx512_vp2intersect_q_256:
26063 case Intrinsic::x86_avx512_vp2intersect_q_128:
26064 case Intrinsic::x86_avx512_vp2intersect_d_512:
26065 case Intrinsic::x86_avx512_vp2intersect_d_256:
26066 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26067 MVT MaskVT = Op.getSimpleValueType();
26069 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26070 SDLoc DL(Op);
26072 SDValue Operation =
26073 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
26074 Op->getOperand(1), Op->getOperand(2));
26076 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26077 MaskVT, Operation);
26078 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26079 MaskVT, Operation);
26080 return DAG.getMergeValues({Result0, Result1}, DL);
26082 case Intrinsic::x86_mmx_pslli_w:
26083 case Intrinsic::x86_mmx_pslli_d:
26084 case Intrinsic::x86_mmx_pslli_q:
26085 case Intrinsic::x86_mmx_psrli_w:
26086 case Intrinsic::x86_mmx_psrli_d:
26087 case Intrinsic::x86_mmx_psrli_q:
26088 case Intrinsic::x86_mmx_psrai_w:
26089 case Intrinsic::x86_mmx_psrai_d: {
26090 SDLoc DL(Op);
26091 SDValue ShAmt = Op.getOperand(2);
26092 // If the argument is a constant, convert it to a target constant.
26093 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26094 // Clamp out-of-bounds shift amounts, since they would otherwise be masked
26095 // to 8 bits, which could bring them back into bounds.
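// (For example, a shift amount of 300 would otherwise become 300 & 255 = 44,
// an in-range shift; clamping to 255 keeps it out of range so the result is
// still fully shifted out.)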
26096 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26097 if (ShiftAmount == 0)
26098 return Op.getOperand(1);
26100 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26101 Op.getOperand(0), Op.getOperand(1),
26102 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26105 unsigned NewIntrinsic;
26106 switch (IntNo) {
26107 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26108 case Intrinsic::x86_mmx_pslli_w:
26109 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26110 break;
26111 case Intrinsic::x86_mmx_pslli_d:
26112 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26113 break;
26114 case Intrinsic::x86_mmx_pslli_q:
26115 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26116 break;
26117 case Intrinsic::x86_mmx_psrli_w:
26118 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26119 break;
26120 case Intrinsic::x86_mmx_psrli_d:
26121 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26122 break;
26123 case Intrinsic::x86_mmx_psrli_q:
26124 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26125 break;
26126 case Intrinsic::x86_mmx_psrai_w:
26127 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26128 break;
26129 case Intrinsic::x86_mmx_psrai_d:
26130 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26131 break;
26134 // The vector shift intrinsics with scalar shift amounts use 32-bit values,
26135 // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
26136 // MMX register.
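// (Roughly: the i32 amount is moved into the low half of an MMX register via
// MMX_MOVW2D, and the pslli/psrli/psrai form is rewritten to the
// corresponding psll/psrl/psra intrinsic that takes its count in an MMX
// register.)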
26137 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26138 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26139 DAG.getTargetConstant(NewIntrinsic, DL,
26140 getPointerTy(DAG.getDataLayout())),
26141 Op.getOperand(1), ShAmt);
26143 case Intrinsic::thread_pointer: {
26144 if (Subtarget.isTargetELF()) {
26145 SDLoc dl(Op);
26146 EVT PtrVT = getPointerTy(DAG.getDataLayout());
26147 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
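// (Illustrative: this builds a load of address 0 in the FS/GS address space,
// which prints as e.g. "movq %fs:0, %rax" on 64-bit targets; the segment is
// conveyed by the address-space-qualified null pointer below.)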
26148 Value *Ptr = Constant::getNullValue(PointerType::get(
26149 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
26150 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26151 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
26153 report_fatal_error(
26154 "Target OS doesn't support __builtin_thread_pointer() yet.");
26159 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26160 SDValue Src, SDValue Mask, SDValue Base,
26161 SDValue Index, SDValue ScaleOp, SDValue Chain,
26162 const X86Subtarget &Subtarget) {
26163 SDLoc dl(Op);
26164 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26165 // Scale must be constant.
26166 if (!C)
26167 return SDValue();
26168 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26169 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26170 TLI.getPointerTy(DAG.getDataLayout()));
26171 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26172 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26173 // If source is undef or we know it won't be used, use a zero vector
26174 // to break register dependency.
26175 // TODO: use undef instead and let BreakFalseDeps deal with it?
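// (A zeroed source avoids reading whatever value last occupied the
// destination register, which would otherwise make the gather wait on an
// unrelated earlier producer of that register.)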
26176 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26177 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26179 // Cast mask to an integer type.
26180 Mask = DAG.getBitcast(MaskVT, Mask);
26182 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26184 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26185 SDValue Res =
26186 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26187 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26188 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26191 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
26192 SDValue Src, SDValue Mask, SDValue Base,
26193 SDValue Index, SDValue ScaleOp, SDValue Chain,
26194 const X86Subtarget &Subtarget) {
26195 MVT VT = Op.getSimpleValueType();
26196 SDLoc dl(Op);
26197 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26198 // Scale must be constant.
26199 if (!C)
26200 return SDValue();
26201 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26202 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26203 TLI.getPointerTy(DAG.getDataLayout()));
26204 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26205 VT.getVectorNumElements());
26206 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26208 // We support two versions of the gather intrinsics: one with a scalar mask
26209 // and one with a vXi1 mask. Convert scalar to vXi1 if necessary.
26210 if (Mask.getValueType() != MaskVT)
26211 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26213 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26214 // If source is undef or we know it won't be used, use a zero vector
26215 // to break register dependency.
26216 // TODO: use undef instead and let BreakFalseDeps deal with it?
26217 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26218 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26220 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26222 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26223 SDValue Res =
26224 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26225 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26226 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26229 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26230 SDValue Src, SDValue Mask, SDValue Base,
26231 SDValue Index, SDValue ScaleOp, SDValue Chain,
26232 const X86Subtarget &Subtarget) {
26233 SDLoc dl(Op);
26234 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26235 // Scale must be constant.
26236 if (!C)
26237 return SDValue();
26238 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26239 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26240 TLI.getPointerTy(DAG.getDataLayout()));
26241 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26242 Src.getSimpleValueType().getVectorNumElements());
26243 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26245 // We support two versions of the scatter intrinsics: one with a scalar mask
26246 // and one with a vXi1 mask. Convert scalar to vXi1 if necessary.
26247 if (Mask.getValueType() != MaskVT)
26248 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26250 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26252 SDVTList VTs = DAG.getVTList(MVT::Other);
26253 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26254 SDValue Res =
26255 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26256 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26257 return Res;
26260 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26261 SDValue Mask, SDValue Base, SDValue Index,
26262 SDValue ScaleOp, SDValue Chain,
26263 const X86Subtarget &Subtarget) {
26264 SDLoc dl(Op);
26265 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26266 // Scale must be constant.
26267 if (!C)
26268 return SDValue();
26269 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26270 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26271 TLI.getPointerTy(DAG.getDataLayout()));
26272 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26273 SDValue Segment = DAG.getRegister(0, MVT::i32);
26274 MVT MaskVT =
26275 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26276 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26277 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26278 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26279 return SDValue(Res, 0);
26282 /// Handles the lowering of builtin intrinsics with a chain that return their
26283 /// value into registers EDX:EAX.
26284 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
26285 /// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26286 /// TargetOpcode.
26287 /// Returns a Glue value which can be used to add an extra copy-from-reg if the
26288 /// expanded intrinsic implicitly defines extra registers (i.e. not just
26289 /// EDX:EAX).
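/// For example, RDTSC is expanded through this helper with SrcReg == 0,
/// while RDPMC/RDPRU/XGETBV pass X86::ECX so that the selector operand is
/// copied into ECX before the target opcode executes (see the callers
/// below).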
26290 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
26291 SelectionDAG &DAG,
26292 unsigned TargetOpcode,
26293 unsigned SrcReg,
26294 const X86Subtarget &Subtarget,
26295 SmallVectorImpl<SDValue> &Results) {
26296 SDValue Chain = N->getOperand(0);
26297 SDValue Glue;
26299 if (SrcReg) {
26300 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26301 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26302 Glue = Chain.getValue(1);
26305 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26306 SDValue N1Ops[] = {Chain, Glue};
26307 SDNode *N1 = DAG.getMachineNode(
26308 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26309 Chain = SDValue(N1, 0);
26311 // The expanded instruction returns its result in registers EDX:EAX; read it out.
26312 SDValue LO, HI;
26313 if (Subtarget.is64Bit()) {
26314 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26315 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26316 LO.getValue(2));
26317 } else {
26318 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26319 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26320 LO.getValue(2));
26322 Chain = HI.getValue(1);
26323 Glue = HI.getValue(2);
26325 if (Subtarget.is64Bit()) {
26326 // Merge the two 32-bit values into a 64-bit one.
26327 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26328 DAG.getConstant(32, DL, MVT::i8));
26329 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26330 Results.push_back(Chain);
26331 return Glue;
26334 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26335 SDValue Ops[] = { LO, HI };
26336 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26337 Results.push_back(Pair);
26338 Results.push_back(Chain);
26339 return Glue;
26342 /// Handles the lowering of builtin intrinsics that read the time stamp counter
26343 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26344 /// READCYCLECOUNTER nodes.
26345 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26346 SelectionDAG &DAG,
26347 const X86Subtarget &Subtarget,
26348 SmallVectorImpl<SDValue> &Results) {
26349 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26350 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26351 // and the EAX register is loaded with the low-order 32 bits.
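// (The helper then reassembles the 64-bit value: on 64-bit targets as
// (RDX << 32) | RAX, on 32-bit targets as a BUILD_PAIR of EAX and EDX.)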
26352 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26353 /* NoRegister */0, Subtarget,
26354 Results);
26355 if (Opcode != X86::RDTSCP)
26356 return;
26358 SDValue Chain = Results[1];
26359 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
26360 // the ECX register. Add 'ecx' explicitly to the chain.
26361 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26362 Results[1] = ecx;
26363 Results.push_back(ecx.getValue(1));
26366 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
26367 SelectionDAG &DAG) {
26368 SmallVector<SDValue, 3> Results;
26369 SDLoc DL(Op);
26370 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26371 Results);
26372 return DAG.getMergeValues(Results, DL);
26375 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
26376 MachineFunction &MF = DAG.getMachineFunction();
26377 SDValue Chain = Op.getOperand(0);
26378 SDValue RegNode = Op.getOperand(2);
26379 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26380 if (!EHInfo)
26381 report_fatal_error("EH registrations only live in functions using WinEH");
26383 // Cast the operand to an alloca, and remember the frame index.
26384 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26385 if (!FINode)
26386 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26387 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26389 // Return the chain operand without making any DAG nodes.
26390 return Chain;
26393 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
26394 MachineFunction &MF = DAG.getMachineFunction();
26395 SDValue Chain = Op.getOperand(0);
26396 SDValue EHGuard = Op.getOperand(2);
26397 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26398 if (!EHInfo)
26399 report_fatal_error("EHGuard only lives in functions using WinEH");
26401 // Cast the operand to an alloca, and remember the frame index.
26402 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26403 if (!FINode)
26404 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26405 EHInfo->EHGuardFrameIndex = FINode->getIndex();
26407 // Return the chain operand without making any DAG nodes.
26408 return Chain;
26411 /// Emit Truncating Store with signed or unsigned saturation.
26412 static SDValue
26413 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
26414 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26415 SelectionDAG &DAG) {
26416 SDVTList VTs = DAG.getVTList(MVT::Other);
26417 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26418 SDValue Ops[] = { Chain, Val, Ptr, Undef };
26419 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26420 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26423 /// Emit Masked Truncating Store with signed or unsigned saturation.
26424 static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
26425 const SDLoc &DL,
26426 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26427 MachineMemOperand *MMO, SelectionDAG &DAG) {
26428 SDVTList VTs = DAG.getVTList(MVT::Other);
26429 SDValue Ops[] = { Chain, Val, Ptr, Mask };
26430 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26431 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26434 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
26435 SelectionDAG &DAG) {
26436 unsigned IntNo = Op.getConstantOperandVal(1);
26437 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26438 if (!IntrData) {
26439 switch (IntNo) {
26441 case Intrinsic::swift_async_context_addr: {
26442 SDLoc dl(Op);
26443 auto &MF = DAG.getMachineFunction();
26444 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26445 if (Subtarget.is64Bit()) {
26446 MF.getFrameInfo().setFrameAddressIsTaken(true);
26447 X86FI->setHasSwiftAsyncContext(true);
26448 SDValue Chain = Op->getOperand(0);
26449 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
26450 SDValue Result =
26451 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
26452 DAG.getTargetConstant(8, dl, MVT::i32)),
26453 0);
26454 // Return { result, chain }.
26455 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26456 CopyRBP.getValue(1));
26457 } else {
26458 // 32-bit, so there is no special extended frame; create or reuse an
26459 // existing stack slot.
26460 if (!X86FI->getSwiftAsyncContextFrameIdx())
26461 X86FI->setSwiftAsyncContextFrameIdx(
26462 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
26463 SDValue Result =
26464 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
26465 // Return { result, chain }.
26466 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26467 Op->getOperand(0));
26471 case llvm::Intrinsic::x86_seh_ehregnode:
26472 return MarkEHRegistrationNode(Op, DAG);
26473 case llvm::Intrinsic::x86_seh_ehguard:
26474 return MarkEHGuard(Op, DAG);
26475 case llvm::Intrinsic::x86_rdpkru: {
26476 SDLoc dl(Op);
26477 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26478 // Create a RDPKRU node and pass 0 to the ECX parameter.
26479 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26480 DAG.getConstant(0, dl, MVT::i32));
26482 case llvm::Intrinsic::x86_wrpkru: {
26483 SDLoc dl(Op);
26484 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
26485 // to the EDX and ECX parameters.
26486 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26487 Op.getOperand(0), Op.getOperand(2),
26488 DAG.getConstant(0, dl, MVT::i32),
26489 DAG.getConstant(0, dl, MVT::i32));
26491 case llvm::Intrinsic::asan_check_memaccess: {
26492 // Mark this as adjustsStack because it will be lowered to a call.
26493 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
26494 // Don't do anything here; we will expand these intrinsics out later.
26495 return Op;
26497 case llvm::Intrinsic::x86_flags_read_u32:
26498 case llvm::Intrinsic::x86_flags_read_u64:
26499 case llvm::Intrinsic::x86_flags_write_u32:
26500 case llvm::Intrinsic::x86_flags_write_u64: {
26501 // We need a frame pointer because this will get lowered to a PUSH/POP
26502 // sequence.
26503 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26504 MFI.setHasCopyImplyingStackAdjustment(true);
26505 // Don't do anything here; we will expand these intrinsics out later
26506 // during FinalizeISel in EmitInstrWithCustomInserter.
26507 return Op;
26509 case Intrinsic::x86_lwpins32:
26510 case Intrinsic::x86_lwpins64:
26511 case Intrinsic::x86_umwait:
26512 case Intrinsic::x86_tpause: {
26513 SDLoc dl(Op);
26514 SDValue Chain = Op->getOperand(0);
26515 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26516 unsigned Opcode;
26518 switch (IntNo) {
26519 default: llvm_unreachable("Impossible intrinsic");
26520 case Intrinsic::x86_umwait:
26521 Opcode = X86ISD::UMWAIT;
26522 break;
26523 case Intrinsic::x86_tpause:
26524 Opcode = X86ISD::TPAUSE;
26525 break;
26526 case Intrinsic::x86_lwpins32:
26527 case Intrinsic::x86_lwpins64:
26528 Opcode = X86ISD::LWPINS;
26529 break;
26532 SDValue Operation =
26533 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26534 Op->getOperand(3), Op->getOperand(4));
26535 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26536 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26537 Operation.getValue(1));
26539 case Intrinsic::x86_enqcmd:
26540 case Intrinsic::x86_enqcmds: {
26541 SDLoc dl(Op);
26542 SDValue Chain = Op.getOperand(0);
26543 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26544 unsigned Opcode;
26545 switch (IntNo) {
26546 default: llvm_unreachable("Impossible intrinsic!");
26547 case Intrinsic::x86_enqcmd:
26548 Opcode = X86ISD::ENQCMD;
26549 break;
26550 case Intrinsic::x86_enqcmds:
26551 Opcode = X86ISD::ENQCMDS;
26552 break;
26554 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26555 Op.getOperand(3));
26556 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26557 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26558 Operation.getValue(1));
26560 case Intrinsic::x86_aesenc128kl:
26561 case Intrinsic::x86_aesdec128kl:
26562 case Intrinsic::x86_aesenc256kl:
26563 case Intrinsic::x86_aesdec256kl: {
26564 SDLoc DL(Op);
26565 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26566 SDValue Chain = Op.getOperand(0);
26567 unsigned Opcode;
26569 switch (IntNo) {
26570 default: llvm_unreachable("Impossible intrinsic");
26571 case Intrinsic::x86_aesenc128kl:
26572 Opcode = X86ISD::AESENC128KL;
26573 break;
26574 case Intrinsic::x86_aesdec128kl:
26575 Opcode = X86ISD::AESDEC128KL;
26576 break;
26577 case Intrinsic::x86_aesenc256kl:
26578 Opcode = X86ISD::AESENC256KL;
26579 break;
26580 case Intrinsic::x86_aesdec256kl:
26581 Opcode = X86ISD::AESDEC256KL;
26582 break;
26585 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26586 MachineMemOperand *MMO = MemIntr->getMemOperand();
26587 EVT MemVT = MemIntr->getMemoryVT();
26588 SDValue Operation = DAG.getMemIntrinsicNode(
26589 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26590 MMO);
26591 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26593 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26594 {ZF, Operation.getValue(0), Operation.getValue(2)});
26596 case Intrinsic::x86_aesencwide128kl:
26597 case Intrinsic::x86_aesdecwide128kl:
26598 case Intrinsic::x86_aesencwide256kl:
26599 case Intrinsic::x86_aesdecwide256kl: {
26600 SDLoc DL(Op);
26601 SDVTList VTs = DAG.getVTList(
26602 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26603 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26604 SDValue Chain = Op.getOperand(0);
26605 unsigned Opcode;
26607 switch (IntNo) {
26608 default: llvm_unreachable("Impossible intrinsic");
26609 case Intrinsic::x86_aesencwide128kl:
26610 Opcode = X86ISD::AESENCWIDE128KL;
26611 break;
26612 case Intrinsic::x86_aesdecwide128kl:
26613 Opcode = X86ISD::AESDECWIDE128KL;
26614 break;
26615 case Intrinsic::x86_aesencwide256kl:
26616 Opcode = X86ISD::AESENCWIDE256KL;
26617 break;
26618 case Intrinsic::x86_aesdecwide256kl:
26619 Opcode = X86ISD::AESDECWIDE256KL;
26620 break;
26623 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26624 MachineMemOperand *MMO = MemIntr->getMemOperand();
26625 EVT MemVT = MemIntr->getMemoryVT();
26626 SDValue Operation = DAG.getMemIntrinsicNode(
26627 Opcode, DL, VTs,
26628 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
26629 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
26630 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
26631 MemVT, MMO);
26632 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
26634 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26635 {ZF, Operation.getValue(1), Operation.getValue(2),
26636 Operation.getValue(3), Operation.getValue(4),
26637 Operation.getValue(5), Operation.getValue(6),
26638 Operation.getValue(7), Operation.getValue(8),
26639 Operation.getValue(9)});
26641 case Intrinsic::x86_testui: {
26642 SDLoc dl(Op);
26643 SDValue Chain = Op.getOperand(0);
26644 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26645 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
26646 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26647 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26648 Operation.getValue(1));
26650 case Intrinsic::x86_atomic_bts_rm:
26651 case Intrinsic::x86_atomic_btc_rm:
26652 case Intrinsic::x86_atomic_btr_rm: {
26653 SDLoc DL(Op);
26654 MVT VT = Op.getSimpleValueType();
26655 SDValue Chain = Op.getOperand(0);
26656 SDValue Op1 = Op.getOperand(2);
26657 SDValue Op2 = Op.getOperand(3);
26658 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
26659 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
26660 : X86ISD::LBTR_RM;
26661 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26662 SDValue Res =
26663 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
26664 {Chain, Op1, Op2}, VT, MMO);
26665 Chain = Res.getValue(1);
26666 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
26667 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
26669 case Intrinsic::x86_atomic_bts:
26670 case Intrinsic::x86_atomic_btc:
26671 case Intrinsic::x86_atomic_btr: {
26672 SDLoc DL(Op);
26673 MVT VT = Op.getSimpleValueType();
26674 SDValue Chain = Op.getOperand(0);
26675 SDValue Op1 = Op.getOperand(2);
26676 SDValue Op2 = Op.getOperand(3);
26677 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
26678 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
26679 : X86ISD::LBTR;
26680 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
26681 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26682 SDValue Res =
26683 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
26684 {Chain, Op1, Op2, Size}, VT, MMO);
26685 Chain = Res.getValue(1);
26686 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
26687 unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
26688 if (Imm)
26689 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
26690 DAG.getShiftAmountConstant(Imm, VT, DL));
26691 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
26693 case Intrinsic::x86_cmpccxadd32:
26694 case Intrinsic::x86_cmpccxadd64: {
26695 SDLoc DL(Op);
26696 SDValue Chain = Op.getOperand(0);
26697 SDValue Addr = Op.getOperand(2);
26698 SDValue Src1 = Op.getOperand(3);
26699 SDValue Src2 = Op.getOperand(4);
26700 SDValue CC = Op.getOperand(5);
26701 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26702 SDValue Operation = DAG.getMemIntrinsicNode(
26703 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
26704 MVT::i32, MMO);
26705 return Operation;
26707 case Intrinsic::x86_aadd32:
26708 case Intrinsic::x86_aadd64:
26709 case Intrinsic::x86_aand32:
26710 case Intrinsic::x86_aand64:
26711 case Intrinsic::x86_aor32:
26712 case Intrinsic::x86_aor64:
26713 case Intrinsic::x86_axor32:
26714 case Intrinsic::x86_axor64: {
26715 SDLoc DL(Op);
26716 SDValue Chain = Op.getOperand(0);
26717 SDValue Op1 = Op.getOperand(2);
26718 SDValue Op2 = Op.getOperand(3);
26719 MVT VT = Op2.getSimpleValueType();
26720 unsigned Opc = 0;
26721 switch (IntNo) {
26722 default:
26723 llvm_unreachable("Unknown Intrinsic");
26724 case Intrinsic::x86_aadd32:
26725 case Intrinsic::x86_aadd64:
26726 Opc = X86ISD::AADD;
26727 break;
26728 case Intrinsic::x86_aand32:
26729 case Intrinsic::x86_aand64:
26730 Opc = X86ISD::AAND;
26731 break;
26732 case Intrinsic::x86_aor32:
26733 case Intrinsic::x86_aor64:
26734 Opc = X86ISD::AOR;
26735 break;
26736 case Intrinsic::x86_axor32:
26737 case Intrinsic::x86_axor64:
26738 Opc = X86ISD::AXOR;
26739 break;
26741 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
26742 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
26743 {Chain, Op1, Op2}, VT, MMO);
26745 case Intrinsic::x86_atomic_add_cc:
26746 case Intrinsic::x86_atomic_sub_cc:
26747 case Intrinsic::x86_atomic_or_cc:
26748 case Intrinsic::x86_atomic_and_cc:
26749 case Intrinsic::x86_atomic_xor_cc: {
26750 SDLoc DL(Op);
26751 SDValue Chain = Op.getOperand(0);
26752 SDValue Op1 = Op.getOperand(2);
26753 SDValue Op2 = Op.getOperand(3);
26754 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
26755 MVT VT = Op2.getSimpleValueType();
26756 unsigned Opc = 0;
26757 switch (IntNo) {
26758 default:
26759 llvm_unreachable("Unknown Intrinsic");
26760 case Intrinsic::x86_atomic_add_cc:
26761 Opc = X86ISD::LADD;
26762 break;
26763 case Intrinsic::x86_atomic_sub_cc:
26764 Opc = X86ISD::LSUB;
26765 break;
26766 case Intrinsic::x86_atomic_or_cc:
26767 Opc = X86ISD::LOR;
26768 break;
26769 case Intrinsic::x86_atomic_and_cc:
26770 Opc = X86ISD::LAND;
26771 break;
26772 case Intrinsic::x86_atomic_xor_cc:
26773 Opc = X86ISD::LXOR;
26774 break;
26776 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26777 SDValue LockArith =
26778 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
26779 {Chain, Op1, Op2}, VT, MMO);
26780 Chain = LockArith.getValue(1);
26781 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
26784 return SDValue();
26787 SDLoc dl(Op);
26788 switch(IntrData->Type) {
26789 default: llvm_unreachable("Unknown Intrinsic Type");
26790 case RDSEED:
26791 case RDRAND: {
26792 // Emit the node with the right value type.
26793 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
26794 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26796 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
26797 // Otherwise return the value from Rand, which is always 0, cast to i32.
26798 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
26799 DAG.getConstant(1, dl, Op->getValueType(1)),
26800 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
26801 SDValue(Result.getNode(), 1)};
26802 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
26804 // Return { result, isValid, chain }.
26805 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
26806 SDValue(Result.getNode(), 2));
26808 case GATHER_AVX2: {
26809 SDValue Chain = Op.getOperand(0);
26810 SDValue Src = Op.getOperand(2);
26811 SDValue Base = Op.getOperand(3);
26812 SDValue Index = Op.getOperand(4);
26813 SDValue Mask = Op.getOperand(5);
26814 SDValue Scale = Op.getOperand(6);
26815 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26816 Scale, Chain, Subtarget);
26818 case GATHER: {
26819 // gather(v1, mask, index, base, scale);
26820 SDValue Chain = Op.getOperand(0);
26821 SDValue Src = Op.getOperand(2);
26822 SDValue Base = Op.getOperand(3);
26823 SDValue Index = Op.getOperand(4);
26824 SDValue Mask = Op.getOperand(5);
26825 SDValue Scale = Op.getOperand(6);
26826 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
26827 Chain, Subtarget);
26829 case SCATTER: {
26830 // scatter(base, mask, index, v1, scale);
26831 SDValue Chain = Op.getOperand(0);
26832 SDValue Base = Op.getOperand(2);
26833 SDValue Mask = Op.getOperand(3);
26834 SDValue Index = Op.getOperand(4);
26835 SDValue Src = Op.getOperand(5);
26836 SDValue Scale = Op.getOperand(6);
26837 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
26838 Scale, Chain, Subtarget);
26840 case PREFETCH: {
26841 const APInt &HintVal = Op.getConstantOperandAPInt(6);
26842 assert((HintVal == 2 || HintVal == 3) &&
26843 "Wrong prefetch hint in intrinsic: should be 2 or 3");
26844 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
26845 SDValue Chain = Op.getOperand(0);
26846 SDValue Mask = Op.getOperand(2);
26847 SDValue Index = Op.getOperand(3);
26848 SDValue Base = Op.getOperand(4);
26849 SDValue Scale = Op.getOperand(5);
26850 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
26851 Subtarget);
26853 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
26854 case RDTSC: {
26855 SmallVector<SDValue, 2> Results;
26856 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
26857 Results);
26858 return DAG.getMergeValues(Results, dl);
26860 // Read Performance Monitoring Counters.
26861 case RDPMC:
26862 // Read Processor Register.
26863 case RDPRU:
26864 // Get Extended Control Register.
26865 case XGETBV: {
26866 SmallVector<SDValue, 2> Results;
26868 // RDPMC uses ECX to select the index of the performance counter to read.
26869 // RDPRU uses ECX to select the processor register to read.
26870 // XGETBV uses ECX to select the index of the XCR register to return.
26871 // The result is stored into registers EDX:EAX.
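// (For example, llvm.x86.xgetbv with an ECX selector of 0 reads XCR0; the
// EDX:EAX halves are merged by expandIntrinsicWChainHelper above.)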
26872 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
26873 Subtarget, Results);
26874 return DAG.getMergeValues(Results, dl);
26876 // XTEST intrinsics.
26877 case XTEST: {
26878 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
26879 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
26881 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
26882 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
26883 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
26884 Ret, SDValue(InTrans.getNode(), 1));
26886 case TRUNCATE_TO_MEM_VI8:
26887 case TRUNCATE_TO_MEM_VI16:
26888 case TRUNCATE_TO_MEM_VI32: {
26889 SDValue Mask = Op.getOperand(4);
26890 SDValue DataToTruncate = Op.getOperand(3);
26891 SDValue Addr = Op.getOperand(2);
26892 SDValue Chain = Op.getOperand(0);
26894 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
26895 assert(MemIntr && "Expected MemIntrinsicSDNode!");
26897 EVT MemVT = MemIntr->getMemoryVT();
26899 uint16_t TruncationOp = IntrData->Opc0;
26900 switch (TruncationOp) {
26901 case X86ISD::VTRUNC: {
26902 if (isAllOnesConstant(Mask)) // return just a truncate store
26903 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
26904 MemIntr->getMemOperand());
26906 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26907 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26908 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
26910 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
26911 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
26912 true /* truncating */);
26914 case X86ISD::VTRUNCUS:
26915 case X86ISD::VTRUNCS: {
26916 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
26917 if (isAllOnesConstant(Mask))
26918 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
26919 MemIntr->getMemOperand(), DAG);
26921 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
26922 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26924 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
26925 VMask, MemVT, MemIntr->getMemOperand(), DAG);
26927 default:
26928 llvm_unreachable("Unsupported truncstore intrinsic");
26934 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
26935 SelectionDAG &DAG) const {
26936 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26937 MFI.setReturnAddressIsTaken(true);
26939 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
26940 return SDValue();
26942 unsigned Depth = Op.getConstantOperandVal(0);
26943 SDLoc dl(Op);
26944 EVT PtrVT = getPointerTy(DAG.getDataLayout());
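// (Sketch of the code below: for Depth > 0 we chase the frame-pointer chain
// via FRAMEADDR and load the return address one slot above the recovered
// frame pointer; Depth == 0 reads straight from the return-address frame
// index.)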
26946 if (Depth > 0) {
26947 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
26948 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26949 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
26950 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26951 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
26952 MachinePointerInfo());
26955 // Just load the return address.
26956 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
26957 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
26958 MachinePointerInfo());
26961 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
26962 SelectionDAG &DAG) const {
26963 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
26964 return getReturnAddressFrameIndex(DAG);
26967 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
26968 MachineFunction &MF = DAG.getMachineFunction();
26969 MachineFrameInfo &MFI = MF.getFrameInfo();
26970 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
26971 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26972 EVT VT = Op.getValueType();
26974 MFI.setFrameAddressIsTaken(true);
26976 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
26977 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
26978 // is not possible to crawl up the stack without looking at the unwind codes
26979 // simultaneously.
26980 int FrameAddrIndex = FuncInfo->getFAIndex();
26981 if (!FrameAddrIndex) {
26982 // Set up a frame object for the return address.
26983 unsigned SlotSize = RegInfo->getSlotSize();
26984 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
26985 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
26986 FuncInfo->setFAIndex(FrameAddrIndex);
26988 return DAG.getFrameIndex(FrameAddrIndex, VT);
26991 unsigned FrameReg =
26992 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
26993 SDLoc dl(Op); // FIXME probably not meaningful
26994 unsigned Depth = Op.getConstantOperandVal(0);
26995 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
26996 (FrameReg == X86::EBP && VT == MVT::i32)) &&
26997 "Invalid Frame Register!");
26998 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
26999 while (Depth--)
27000 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27001 MachinePointerInfo());
27002 return FrameAddr;
27005 // FIXME? Maybe this could be a TableGen attribute on some registers and
27006 // this table could be generated automatically from RegInfo.
27007 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
27008 const MachineFunction &MF) const {
27009 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27011 Register Reg = StringSwitch<unsigned>(RegName)
27012 .Case("esp", X86::ESP)
27013 .Case("rsp", X86::RSP)
27014 .Case("ebp", X86::EBP)
27015 .Case("rbp", X86::RBP)
27016 .Default(0);
27018 if (Reg == X86::EBP || Reg == X86::RBP) {
27019 if (!TFI.hasFP(MF))
27020 report_fatal_error("register " + StringRef(RegName) +
27021 " is allocatable: function has no frame pointer");
27022 #ifndef NDEBUG
27023 else {
27024 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27025 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27026 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27027 "Invalid Frame Register!");
27029 #endif
27032 if (Reg)
27033 return Reg;
27035 report_fatal_error("Invalid register name global variable");
27038 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27039 SelectionDAG &DAG) const {
27040 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27041 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27044 Register X86TargetLowering::getExceptionPointerRegister(
27045 const Constant *PersonalityFn) const {
27046 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27047 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27049 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27052 Register X86TargetLowering::getExceptionSelectorRegister(
27053 const Constant *PersonalityFn) const {
27054 // Funclet personalities don't use selectors (the runtime does the selection).
27055 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
27056 return X86::NoRegister;
27057 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27060 bool X86TargetLowering::needsFixedCatchObjects() const {
27061 return Subtarget.isTargetWin64();
27064 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
27065 SDValue Chain = Op.getOperand(0);
27066 SDValue Offset = Op.getOperand(1);
27067 SDValue Handler = Op.getOperand(2);
27068 SDLoc dl (Op);
27070 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27071 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27072 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
27073 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
27074 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
27075 "Invalid Frame Register!");
27076 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
27077 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
27079 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
27080 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
27081 dl));
27082 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
27083 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
27084 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
27086 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
27087 DAG.getRegister(StoreAddrReg, PtrVT));
27090 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
27091 SelectionDAG &DAG) const {
27092 SDLoc DL(Op);
27093 // If the subtarget is not 64-bit, we may need the global base register
27094 // after the isel pseudo expansion, i.e., after the CGBR pass has run.
27095 // Therefore, ask for the GlobalBaseReg now, so that the pass
27096 // inserts the code for us in case we need it.
27097 // Otherwise, we would end up referencing a virtual register
27098 // that is never defined.
27099 if (!Subtarget.is64Bit()) {
27100 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27101 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
27103 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
27104 DAG.getVTList(MVT::i32, MVT::Other),
27105 Op.getOperand(0), Op.getOperand(1));
27108 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
27109 SelectionDAG &DAG) const {
27110 SDLoc DL(Op);
27111 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
27112 Op.getOperand(0), Op.getOperand(1));
27115 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
27116 SelectionDAG &DAG) const {
27117 SDLoc DL(Op);
27118 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
27119 Op.getOperand(0));
27122 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
27123 return Op.getOperand(0);
27126 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
27127 SelectionDAG &DAG) const {
27128 SDValue Root = Op.getOperand(0);
27129 SDValue Trmp = Op.getOperand(1); // trampoline
27130 SDValue FPtr = Op.getOperand(2); // nested function
27131 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
27132 SDLoc dl (Op);
27134 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27135 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27137 if (Subtarget.is64Bit()) {
27138 SDValue OutChains[6];
27140 // Large code-model.
27141 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
27142 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
27144 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
27145 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
27147 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
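// The stores below assemble the following byte sequence in the trampoline
// (offsets in bytes):
//    0: 49 BB <FPtr:imm64>   movabsq $FPtr, %r11
//   10: 49 BA <Nest:imm64>   movabsq $Nest, %r10
//   20: 49 FF E3             jmpq   *%r11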
27149 // Load the pointer to the nested function into R11.
27150 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
27151 SDValue Addr = Trmp;
27152 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27153 Addr, MachinePointerInfo(TrmpAddr));
27155 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27156 DAG.getConstant(2, dl, MVT::i64));
27157 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27158 MachinePointerInfo(TrmpAddr, 2), Align(2));
27160 // Load the 'nest' parameter value into R10.
27161 // R10 is specified in X86CallingConv.td
27162 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27163 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27164 DAG.getConstant(10, dl, MVT::i64));
27165 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27166 Addr, MachinePointerInfo(TrmpAddr, 10));
27168 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27169 DAG.getConstant(12, dl, MVT::i64));
27170 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27171 MachinePointerInfo(TrmpAddr, 12), Align(2));
27173 // Jump to the nested function.
27174 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27175 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27176 DAG.getConstant(20, dl, MVT::i64));
27177 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27178 Addr, MachinePointerInfo(TrmpAddr, 20));
27180 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27181 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27182 DAG.getConstant(22, dl, MVT::i64));
27183 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27184 Addr, MachinePointerInfo(TrmpAddr, 22));
27186 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27187 } else {
27188 const Function *Func =
27189 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27190 CallingConv::ID CC = Func->getCallingConv();
27191 unsigned NestReg;
27193 switch (CC) {
27194 default:
27195 llvm_unreachable("Unsupported calling convention");
27196 case CallingConv::C:
27197 case CallingConv::X86_StdCall: {
27198 // Pass 'nest' parameter in ECX.
27199 // Must be kept in sync with X86CallingConv.td
27200 NestReg = X86::ECX;
27202 // Check that ECX wasn't needed by an 'inreg' parameter.
27203 FunctionType *FTy = Func->getFunctionType();
27204 const AttributeList &Attrs = Func->getAttributes();
27206 if (!Attrs.isEmpty() && !Func->isVarArg()) {
27207 unsigned InRegCount = 0;
27208 unsigned Idx = 0;
27210 for (FunctionType::param_iterator I = FTy->param_begin(),
27211 E = FTy->param_end(); I != E; ++I, ++Idx)
27212 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
27213 const DataLayout &DL = DAG.getDataLayout();
27214 // FIXME: should only count parameters that are lowered to integers.
27215 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
27218 if (InRegCount > 2) {
27219 report_fatal_error("Nest register in use - reduce number of inreg"
27220 " parameters!");
27223 break;
27225 case CallingConv::X86_FastCall:
27226 case CallingConv::X86_ThisCall:
27227 case CallingConv::Fast:
27228 case CallingConv::Tail:
27229 case CallingConv::SwiftTail:
27230 // Pass 'nest' parameter in EAX.
27231 // Must be kept in sync with X86CallingConv.td
27232 NestReg = X86::EAX;
27233 break;
27236 SDValue OutChains[4];
27237 SDValue Addr, Disp;
27239 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27240 DAG.getConstant(10, dl, MVT::i32));
27241 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
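// The resulting 10-byte trampoline is:
//   0: B8+r <Nest:imm32>   movl $Nest, %ecx or %eax
//   5: E9   <Disp:imm32>   jmp  FPtr  (rel32 from the end of the jmp at Trmp+10)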
27243 // This is storing the opcode for MOV32ri.
27244 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
27245 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27246 OutChains[0] =
27247 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
27248 Trmp, MachinePointerInfo(TrmpAddr));
27250 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27251 DAG.getConstant(1, dl, MVT::i32));
27252 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
27253 MachinePointerInfo(TrmpAddr, 1), Align(1));
27255 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
27256 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27257 DAG.getConstant(5, dl, MVT::i32));
27258 OutChains[2] =
27259 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
27260 MachinePointerInfo(TrmpAddr, 5), Align(1));
27262 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27263 DAG.getConstant(6, dl, MVT::i32));
27264 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
27265 MachinePointerInfo(TrmpAddr, 6), Align(1));
27267 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27271 SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
27272 SelectionDAG &DAG) const {
27273 /*
27274 The rounding mode is in bits 11:10 of FPSR, and has the following
27275 settings:
27276 00 Round to nearest
27277 01 Round to -inf
27278 10 Round to +inf
27279 11 Round to 0
27281 GET_ROUNDING, on the other hand, expects the following:
27282 -1 Undefined
27283 0 Round to 0
27284 1 Round to nearest
27285 2 Round to +inf
27286 3 Round to -inf
27288 To perform the conversion, we use a packed lookup table of the four 2-bit
27289 values that we can index by FPSR[11:10]
27290 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
27292 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
27293 */
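// For example, FPSR[11:10] == 0b01 (round toward -inf) gives
//   (0x2d >> ((0x0400 & 0xc00) >> 9)) & 3 == (0x2d >> 2) & 3 == 3,
// which is GET_ROUNDING's encoding for "round to -inf".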
27295 MachineFunction &MF = DAG.getMachineFunction();
27296 MVT VT = Op.getSimpleValueType();
27297 SDLoc DL(Op);
27299 // Save FP Control Word to stack slot
27300 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
27301 SDValue StackSlot =
27302 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
27304 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
27306 SDValue Chain = Op.getOperand(0);
27307 SDValue Ops[] = {Chain, StackSlot};
27308 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
27309 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
27310 Align(2), MachineMemOperand::MOStore);
27312 // Load FP Control Word from stack slot
27313 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
27314 Chain = CWD.getValue(1);
27316 // Mask and turn the control bits into a shift for the lookup table.
27317 SDValue Shift =
27318 DAG.getNode(ISD::SRL, DL, MVT::i16,
27319 DAG.getNode(ISD::AND, DL, MVT::i16,
27320 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
27321 DAG.getConstant(9, DL, MVT::i8));
27322 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
27324 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27325 SDValue RetVal =
27326 DAG.getNode(ISD::AND, DL, MVT::i32,
27327 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27328 DAG.getConstant(3, DL, MVT::i32));
27330 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27332 return DAG.getMergeValues({RetVal, Chain}, DL);
27335 SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27336 SelectionDAG &DAG) const {
27337 MachineFunction &MF = DAG.getMachineFunction();
27338 SDLoc DL(Op);
27339 SDValue Chain = Op.getNode()->getOperand(0);
27341 // FP control word may be set only from data in memory. So we need to allocate
27342 // stack space to save/load FP control word.
27343 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27344 SDValue StackSlot =
27345 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27346 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27347 MachineMemOperand *MMO =
27348 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27350 // Store FP control word into memory.
27351 SDValue Ops[] = {Chain, StackSlot};
27352 Chain = DAG.getMemIntrinsicNode(
27353 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27355 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27356 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27357 Chain = CWD.getValue(1);
27358 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27359 DAG.getConstant(0xf3ff, DL, MVT::i16));
27361 // Calculate new rounding mode.
27362 SDValue NewRM = Op.getNode()->getOperand(1);
27363 SDValue RMBits;
27364 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27365 uint64_t RM = CVal->getZExtValue();
27366 int FieldVal;
27367 switch (static_cast<RoundingMode>(RM)) {
27368 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27369 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
27370 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
27371 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
27372 default:
27373 llvm_unreachable("rounding mode is not supported by X86 hardware");
27375 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27376 } else {
27377 // Need to convert argument into bits of control word:
27378 // 0 Round to 0 -> 11
27379 // 1 Round to nearest -> 00
27380 // 2 Round to +inf -> 10
27381 // 3 Round to -inf -> 01
27382 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
27383 // To make the conversion, put all these values into a value 0xc9 and shift
27384 // it left depending on the rounding mode:
27385 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27386 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
27387 // ...
27388 // (0xc9 << (2 * NewRM + 4)) & 0xc00
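// For example, NewRM == 2 (round to +inf) gives a shift of 2 * 2 + 4 = 8 and
// (0xc9 << 8) & 0xc00 == 0x800, i.e. rounding-control bits 11:10 == 10b
// (X86::rmUpward).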
27389 SDValue ShiftValue =
27390 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27391 DAG.getNode(ISD::ADD, DL, MVT::i32,
27392 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27393 DAG.getConstant(1, DL, MVT::i8)),
27394 DAG.getConstant(4, DL, MVT::i32)));
27395 SDValue Shifted =
27396 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27397 ShiftValue);
27398 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27399 DAG.getConstant(0xc00, DL, MVT::i16));
27402 // Update rounding mode bits and store the new FP Control Word into stack.
27403 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27404 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
27406 // Load FP control word from the slot.
27407 SDValue OpsLD[] = {Chain, StackSlot};
27408 MachineMemOperand *MMOL =
27409 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27410 Chain = DAG.getMemIntrinsicNode(
27411 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27413 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
27414 // same way but in bits 14:13.
27415 if (Subtarget.hasSSE1()) {
27416 // Store MXCSR into memory.
27417 Chain = DAG.getNode(
27418 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27419 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27420 StackSlot);
27422 // Load MXCSR from stack slot and clear RM field (bits 14:13).
27423 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27424 Chain = CWD.getValue(1);
27425 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27426 DAG.getConstant(0xffff9fff, DL, MVT::i32));
27428 // Shift X87 RM bits from 11:10 to 14:13.
27429 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27430 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27431 DAG.getConstant(3, DL, MVT::i8));
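// E.g. X86::rmUpward (0x800) becomes 0x4000, which sets MXCSR RC bits 14:13
// to 10b (round toward +inf).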
27433 // Update rounding mode bits and store the new FP Control Word into stack.
27434 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27435 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
27437 // Load MXCSR from the slot.
27438 Chain = DAG.getNode(
27439 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27440 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27441 StackSlot);
27444 return Chain;
27447 const unsigned X87StateSize = 28;
27448 const unsigned FPStateSize = 32;
27449 [[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
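// FNSTENV/FLDENV operate on a 28-byte x87 environment image; the 32-byte FP
// environment used here appends MXCSR right after it, at offset X87StateSize.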
27451 SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
27452 SelectionDAG &DAG) const {
27453 MachineFunction &MF = DAG.getMachineFunction();
27454 SDLoc DL(Op);
27455 SDValue Chain = Op->getOperand(0);
27456 SDValue Ptr = Op->getOperand(1);
27457 auto *Node = cast<FPStateAccessSDNode>(Op);
27458 EVT MemVT = Node->getMemoryVT();
27459 assert(MemVT.getSizeInBits() == FPStateSizeInBits);
27460 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27462 // Get x87 state, if it is present.
27463 if (Subtarget.hasX87()) {
27464 Chain =
27465 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
27466 {Chain, Ptr}, MemVT, MMO);
27468 // FNSTENV changes the exception mask, so load back the stored environment.
27469 MachineMemOperand::Flags NewFlags =
27470 MachineMemOperand::MOLoad |
27471 (MMO->getFlags() & ~MachineMemOperand::MOStore);
27472 MMO = MF.getMachineMemOperand(MMO, NewFlags);
27473 Chain =
27474 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27475 {Chain, Ptr}, MemVT, MMO);
27478 // If target supports SSE, get MXCSR as well.
27479 if (Subtarget.hasSSE1()) {
27480 // Get pointer to the MXCSR location in memory.
27481 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
27482 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27483 DAG.getConstant(X87StateSize, DL, PtrVT));
27484 // Store MXCSR into memory.
27485 Chain = DAG.getNode(
27486 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27487 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27488 MXCSRAddr);
27491 return Chain;
27494 static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, SDLoc DL,
27495 EVT MemVT, MachineMemOperand *MMO,
27496 SelectionDAG &DAG,
27497 const X86Subtarget &Subtarget) {
27498 // Set x87 state, if it is present.
27499 if (Subtarget.hasX87())
27500 Chain =
27501 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27502 {Chain, Ptr}, MemVT, MMO);
27503 // If target supports SSE, set MXCSR as well.
27504 if (Subtarget.hasSSE1()) {
27505 // Get pointer to the MXCSR location in memory.
27506 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
27507 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27508 DAG.getConstant(X87StateSize, DL, PtrVT));
27509 // Load MXCSR from memory.
27510 Chain = DAG.getNode(
27511 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27512 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27513 MXCSRAddr);
27515 return Chain;
27518 SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
27519 SelectionDAG &DAG) const {
27520 SDLoc DL(Op);
27521 SDValue Chain = Op->getOperand(0);
27522 SDValue Ptr = Op->getOperand(1);
27523 auto *Node = cast<FPStateAccessSDNode>(Op);
27524 EVT MemVT = Node->getMemoryVT();
27525 assert(MemVT.getSizeInBits() == FPStateSizeInBits);
27526 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27527 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
27530 SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
27531 SelectionDAG &DAG) const {
27532 MachineFunction &MF = DAG.getMachineFunction();
27533 SDLoc DL(Op);
27534 SDValue Chain = Op.getNode()->getOperand(0);
27536 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
27537 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
27538 SmallVector<Constant *, 8> FPEnvVals;
27540 // x87 FPU Control Word: mask all floating-point exceptions and set rounding to
27541 // nearest. FPU precision is set to 53 bits on Windows and to 64 bits otherwise
27542 // for compatibility with glibc.
27543 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
27544 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
27545 Constant *Zero = ConstantInt::get(ItemTy, 0);
27546 for (unsigned I = 0; I < 6; ++I)
27547 FPEnvVals.push_back(Zero);
27549 // MXCSR: mask all floating-point exceptions, set rounding to nearest, clear
27550 // all exception flags, and set DAZ and FTZ to 0.
27551 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
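// The constant-pool image is 8 x i32: { X87CW, 0, 0, 0, 0, 0, 0, 0x1F80 }.
// Its first 28 bytes are the x87 environment loaded by FLDENV and the last
// word is the MXCSR value loaded at offset X87StateSize.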
27552 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
27553 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
27554 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
27555 MachinePointerInfo MPI =
27556 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
27557 MachineMemOperand *MMO = MF.getMachineMemOperand(
27558 MPI, MachineMemOperand::MOStore, X87StateSize, Align(4));
27560 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
27563 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
27565 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
27566 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
27567 // split the vector, perform the operation on its Lo and Hi parts and
27568 // concatenate the results.
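// For example, for an i8 element x = 0x10: lzcnt(zext32(x)) == 27 and
// 27 - (32 - 8) == 3, which equals ctlz of the original i8 value.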
27569 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27570 const X86Subtarget &Subtarget) {
27571 assert(Op.getOpcode() == ISD::CTLZ);
27572 SDLoc dl(Op);
27573 MVT VT = Op.getSimpleValueType();
27574 MVT EltVT = VT.getVectorElementType();
27575 unsigned NumElems = VT.getVectorNumElements();
27577 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
27578 "Unsupported element type");
27580 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
27581 if (NumElems > 16 ||
27582 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27583 return splitVectorIntUnary(Op, DAG);
27585 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27586 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
27587 "Unsupported value type for operation");
27589 // Use the natively supported vector instruction vplzcntd.
27590 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27591 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27592 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
27593 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27595 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27598 // Lower CTLZ using a PSHUFB lookup table implementation.
27599 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27600 const X86Subtarget &Subtarget,
27601 SelectionDAG &DAG) {
27602 MVT VT = Op.getSimpleValueType();
27603 int NumElts = VT.getVectorNumElements();
27604 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27605 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27607 // Per-nibble leading zero PSHUFB lookup table.
27608 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27609 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27610 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27611 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
27613 SmallVector<SDValue, 64> LUTVec;
27614 for (int i = 0; i < NumBytes; ++i)
27615 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27616 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27618 // Begin by bitcasting the input to a byte vector, then split those bytes
27619 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27620 // If the hi input nibble is zero then we add both results together, otherwise
27621 // we just take the hi result (by masking the lo result to zero before the
27622 // add).
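// For example, byte 0x07 has a zero hi nibble, so the result is
// LUT[0] + LUT[7] == 4 + 1 == 5 == ctlz(0x07); byte 0x1a has a non-zero hi
// nibble, so only the hi result LUT[1] == 3 == ctlz(0x1a) is kept.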
27623 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27624 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
27626 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
27627 SDValue Lo = Op0;
27628 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
27629 SDValue HiZ;
27630 if (CurrVT.is512BitVector()) {
27631 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27632 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
27633 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27634 } else {
27635 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
27638 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
27639 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
27640 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
27641 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
27643 // Merge the result back from vXi8 to VT, working on the lo/hi halves
27644 // of the current vector width in the same way we did for the nibbles.
27645 // If the upper half of the input element is zero then add the halves'
27646 // leading zero counts together, otherwise just use the upper half's.
27647 // Double the width of the result until we are at target width.
27648 while (CurrVT != VT) {
27649 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
27650 int CurrNumElts = CurrVT.getVectorNumElements();
27651 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
27652 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
27653 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
27655 // Check if the upper half of the input element is zero.
27656 if (CurrVT.is512BitVector()) {
27657 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27658 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
27659 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27660 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27661 } else {
27662 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
27663 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27665 HiZ = DAG.getBitcast(NextVT, HiZ);
27667 // Move the upper/lower halves to the lower bits as we'll be extending to
27668 // NextVT. Mask the lower result to zero if HiZ is true and add the results
27669 // together.
27670 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
27671 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
27672 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
27673 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
27674 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
27675 CurrVT = NextVT;
27678 return Res;
27681 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
27682 const X86Subtarget &Subtarget,
27683 SelectionDAG &DAG) {
27684 MVT VT = Op.getSimpleValueType();
27686 if (Subtarget.hasCDI() &&
27687 // vXi8 vectors need to be promoted to 512-bits for vXi32.
27688 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
27689 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
27691 // Decompose 256-bit ops into smaller 128-bit ops.
27692 if (VT.is256BitVector() && !Subtarget.hasInt256())
27693 return splitVectorIntUnary(Op, DAG);
27695 // Decompose 512-bit ops into smaller 256-bit ops.
27696 if (VT.is512BitVector() && !Subtarget.hasBWI())
27697 return splitVectorIntUnary(Op, DAG);
27699 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
27700 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
27703 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
27704 SelectionDAG &DAG) {
27705 MVT VT = Op.getSimpleValueType();
27706 MVT OpVT = VT;
27707 unsigned NumBits = VT.getSizeInBits();
27708 SDLoc dl(Op);
27709 unsigned Opc = Op.getOpcode();
27711 if (VT.isVector())
27712 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
27714 Op = Op.getOperand(0);
27715 if (VT == MVT::i8) {
27716 // Zero extend to i32 since there is no i8 bsr instruction.
27717 OpVT = MVT::i32;
27718 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
27721 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
27722 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
27723 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
27725 if (Opc == ISD::CTLZ) {
27726 // If src is zero (i.e. bsr sets ZF), returns NumBits.
27727 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
27728 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27729 Op.getValue(1)};
27730 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
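// 2 * NumBits - 1 is chosen so that the final XOR with NumBits - 1 below maps
// a zero source to NumBits, e.g. for i32: 63 ^ 31 == 32.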
27733 // Finally xor with NumBits-1.
27734 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
27735 DAG.getConstant(NumBits - 1, dl, OpVT));
27737 if (VT == MVT::i8)
27738 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
27739 return Op;
27742 static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
27743 SelectionDAG &DAG) {
27744 MVT VT = Op.getSimpleValueType();
27745 unsigned NumBits = VT.getScalarSizeInBits();
27746 SDValue N0 = Op.getOperand(0);
27747 SDLoc dl(Op);
27749 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
27750 "Only scalar CTTZ requires custom lowering");
27752 // Issue a bsf (scan bits forward) which also sets EFLAGS.
27753 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
27754 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
27756 // If src is known never zero we can skip the CMOV.
27757 if (DAG.isKnownNeverZero(N0))
27758 return Op;
27760 // If src is zero (i.e. bsf sets ZF), returns NumBits.
27761 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
27762 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27763 Op.getValue(1)};
27764 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
27767 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
27768 const X86Subtarget &Subtarget) {
27769 MVT VT = Op.getSimpleValueType();
27770 if (VT == MVT::i16 || VT == MVT::i32)
27771 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
27773 if (VT == MVT::v32i16 || VT == MVT::v64i8)
27774 return splitVectorIntBinary(Op, DAG);
27776 assert(Op.getSimpleValueType().is256BitVector() &&
27777 Op.getSimpleValueType().isInteger() &&
27778 "Only handle AVX 256-bit vector integer operation");
27779 return splitVectorIntBinary(Op, DAG);
27782 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
27783 const X86Subtarget &Subtarget) {
27784 MVT VT = Op.getSimpleValueType();
27785 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
27786 unsigned Opcode = Op.getOpcode();
27787 SDLoc DL(Op);
27789 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
27790 (VT.is256BitVector() && !Subtarget.hasInt256())) {
27791 assert(Op.getSimpleValueType().isInteger() &&
27792 "Only handle AVX vector integer operation");
27793 return splitVectorIntBinary(Op, DAG);
27796 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
27797 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27798 EVT SetCCResultType =
27799 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27801 unsigned BitWidth = VT.getScalarSizeInBits();
27802 if (Opcode == ISD::USUBSAT) {
27803 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
27804 // Handle a special-case with a bit-hack instead of cmp+select:
27805 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
27806 // If the target can use VPTERNLOG, DAGToDAG will match this as
27807 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
27808 // "broadcast" constant load.
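// For example, with i8 elements and Y == 0x80: X = 0x90 gives
// (0x90 ^ 0x80) & 0xff == 0x10, while X = 0x30 gives (0x30 ^ 0x80) & 0x00 == 0,
// matching usubsat(X, 0x80).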
27809 ConstantSDNode *C = isConstOrConstSplat(Y, true);
27810 if (C && C->getAPIntValue().isSignMask()) {
27811 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
27812 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
27813 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
27814 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
27815 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
27818 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
27819 // usubsat X, Y --> (X >u Y) ? X - Y : 0
27820 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
27821 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
27822 // TODO: Move this to DAGCombiner?
27823 if (SetCCResultType == VT &&
27824 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
27825 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
27826 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
27830 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
27831 (!VT.isVector() || VT == MVT::v2i64)) {
27832 APInt MinVal = APInt::getSignedMinValue(BitWidth);
27833 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
27834 SDValue Zero = DAG.getConstant(0, DL, VT);
27835 SDValue Result =
27836 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
27837 DAG.getVTList(VT, SetCCResultType), X, Y);
27838 SDValue SumDiff = Result.getValue(0);
27839 SDValue Overflow = Result.getValue(1);
27840 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
27841 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
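// On overflow the wrapped SumDiff has the opposite sign of the true result, so
// a negative SumDiff selects SatMax (and vice versa); e.g. i8 saddsat(100, 50)
// wraps to -106 and is replaced by 127.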
27842 SDValue SumNeg =
27843 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
27844 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
27845 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
27848 // Use default expansion.
27849 return SDValue();
27852 static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
27853 SelectionDAG &DAG) {
27854 MVT VT = Op.getSimpleValueType();
27855 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
27856 // Since X86 does not have CMOV for 8-bit integer, we don't convert
27857 // 8-bit integer abs to NEG and CMOV.
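// abs(x) is lowered as Neg = 0 - x plus a CMOV that keeps Neg when the
// subtraction leaves the sign flag clear (x <= 0) and keeps x otherwise.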
27858 SDLoc DL(Op);
27859 SDValue N0 = Op.getOperand(0);
27860 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
27861 DAG.getConstant(0, DL, VT), N0);
27862 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
27863 SDValue(Neg.getNode(), 1)};
27864 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
27867 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
27868 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
27869 SDLoc DL(Op);
27870 SDValue Src = Op.getOperand(0);
27871 SDValue Sub =
27872 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
27873 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
27876 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
27877 assert(VT.isInteger() &&
27878 "Only handle AVX 256-bit vector integer operation");
27879 return splitVectorIntUnary(Op, DAG);
27882 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
27883 return splitVectorIntUnary(Op, DAG);
27885 // Default to expand.
27886 return SDValue();
27889 static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
27890 SelectionDAG &DAG) {
27891 MVT VT = Op.getSimpleValueType();
27893 // For AVX1 cases, split to use legal ops.
27894 if (VT.is256BitVector() && !Subtarget.hasInt256())
27895 return splitVectorIntBinary(Op, DAG);
27897 if (VT == MVT::v32i16 || VT == MVT::v64i8)
27898 return splitVectorIntBinary(Op, DAG);
27900 // Default to expand.
27901 return SDValue();
27904 static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
27905 SelectionDAG &DAG) {
27906 MVT VT = Op.getSimpleValueType();
27908 // For AVX1 cases, split to use legal ops.
27909 if (VT.is256BitVector() && !Subtarget.hasInt256())
27910 return splitVectorIntBinary(Op, DAG);
27912 if (VT == MVT::v32i16 || VT == MVT::v64i8)
27913 return splitVectorIntBinary(Op, DAG);
27915 // Default to expand.
27916 return SDValue();
27919 static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
27920 SelectionDAG &DAG) {
27921 assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) &&
27922 "Expected FMAXIMUM or FMINIMUM opcode");
27923 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27924 EVT VT = Op.getValueType();
27925 SDValue X = Op.getOperand(0);
27926 SDValue Y = Op.getOperand(1);
27927 SDLoc DL(Op);
27928 uint64_t SizeInBits = VT.getScalarSizeInBits();
27929 APInt PreferredZero = APInt::getZero(SizeInBits);
27930 APInt OppositeZero = PreferredZero;
27931 EVT IVT = VT.changeTypeToInteger();
27932 X86ISD::NodeType MinMaxOp;
27933 if (Op.getOpcode() == ISD::FMAXIMUM) {
27934 MinMaxOp = X86ISD::FMAX;
27935 OppositeZero.setSignBit();
27936 } else {
27937 PreferredZero.setSignBit();
27938 MinMaxOp = X86ISD::FMIN;
27940 EVT SetCCType =
27941 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
27943 // The tables below show the expected result of Max in cases of NaN and
27944 // signed zeros.
27946 //              Y                                Y
27947 //          Num     xNaN                    +0       -0
27948 //         -----------------             -----------------
27949 //    Num  |  Max  |   Y   |        +0   |  +0   |  +0   |
27950 //  X      -----------------     X       -----------------
27951 //   xNaN  |   X   |  X/Y  |        -0   |  +0   |  -0   |
27952 //         -----------------             -----------------
27954 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
27955 // reordering.
27957 // We check if any of the operands is NaN and return NaN. Then we check if any
27958 // of the operands is zero or negative zero (for fmaximum and fminimum
27959 // respectively) to ensure the correct zero is returned.
27960 auto MatchesZero = [](SDValue Op, APInt Zero) {
27961 Op = peekThroughBitcasts(Op);
27962 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
27963 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
27964 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
27965 return CstOp->getAPIntValue() == Zero;
27966 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
27967 Op->getOpcode() == ISD::SPLAT_VECTOR) {
27968 for (const SDValue &OpVal : Op->op_values()) {
27969 if (OpVal.isUndef())
27970 continue;
27971 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
27972 if (!CstOp)
27973 return false;
27974 if (!CstOp->getValueAPF().isZero())
27975 continue;
27976 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
27977 return false;
27979 return true;
27981 return false;
27984 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
27985 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
27986 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
27987 Op->getFlags().hasNoSignedZeros() ||
27988 DAG.isKnownNeverZeroFloat(X) ||
27989 DAG.isKnownNeverZeroFloat(Y);
27990 SDValue NewX, NewY;
27991 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
27992 MatchesZero(X, OppositeZero)) {
27993 // Operands are already in right order or order does not matter.
27994 NewX = X;
27995 NewY = Y;
27996 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
27997 NewX = Y;
27998 NewY = X;
27999 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
28000 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
28001 if (IsXNeverNaN)
28002 std::swap(X, Y);
28003 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
28004 // to an xmm register.
28005 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
28006 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
28007 // Bits of classes:
28008 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
28009 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
28010 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
28011 DL, MVT::i32);
28012 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
28013 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
28014 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
28015 DAG.getIntPtrConstant(0, DL));
28016 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
28017 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
28018 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
28019 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28020 } else {
28021 SDValue IsXSigned;
28022 if (Subtarget.is64Bit() || VT != MVT::f64) {
28023 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
28024 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
28025 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
28026 } else {
28027 assert(VT == MVT::f64);
28028 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
28029 DAG.getConstantFP(0, DL, MVT::v2f64), X,
28030 DAG.getIntPtrConstant(0, DL));
28031 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
28032 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
28033 DAG.getIntPtrConstant(1, DL));
28034 Hi = DAG.getBitcast(MVT::i32, Hi);
28035 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
28036 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
28037 *DAG.getContext(), MVT::i32);
28038 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
28040 if (MinMaxOp == X86ISD::FMAX) {
28041 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28042 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28043 } else {
28044 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28045 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28049 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
28050 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
28052 // If we did not reorder the operands for signed-zero handling, but we do need
28053 // to handle NaN and we know that the second operand is not NaN, then put it in
28054 // the first operand so we will not need to post-process NaN after the max/min.
28055 if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
28056 std::swap(NewX, NewY);
28058 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28060 if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
28061 return MinMax;
28063 SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
28064 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
28067 static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
28068 SelectionDAG &DAG) {
28069 MVT VT = Op.getSimpleValueType();
28071 // For AVX1 cases, split to use legal ops.
28072 if (VT.is256BitVector() && !Subtarget.hasInt256())
28073 return splitVectorIntBinary(Op, DAG);
28075 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
28076 return splitVectorIntBinary(Op, DAG);
28078 SDLoc dl(Op);
28079 bool IsSigned = Op.getOpcode() == ISD::ABDS;
28080 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28082 // TODO: Move to TargetLowering expandABD() once we have ABD promotion.
28083 if (VT.isScalarInteger()) {
28084 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
28085 MVT WideVT = MVT::getIntegerVT(WideBits);
28086 if (TLI.isTypeLegal(WideVT)) {
28087 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
28088 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
28089 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28090 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
28091 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
28092 SDValue Diff = DAG.getNode(ISD::SUB, dl, WideVT, LHS, RHS);
28093 SDValue AbsDiff = DAG.getNode(ISD::ABS, dl, WideVT, Diff);
28094 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
28098 // TODO: Move to TargetLowering expandABD().
28099 if (!Subtarget.hasSSE41() &&
28100 ((IsSigned && VT == MVT::v16i8) || VT == MVT::v4i32)) {
28101 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
28102 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
28103 ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
28104 SDValue Cmp = DAG.getSetCC(dl, VT, LHS, RHS, CC);
28105 SDValue Diff0 = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
28106 SDValue Diff1 = DAG.getNode(ISD::SUB, dl, VT, RHS, LHS);
28107 return getBitSelect(dl, VT, Diff0, Diff1, Cmp, DAG);
28110 // Default to expand.
28111 return SDValue();
28114 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
28115 SelectionDAG &DAG) {
28116 SDLoc dl(Op);
28117 MVT VT = Op.getSimpleValueType();
28119 // Decompose 256-bit ops into 128-bit ops.
28120 if (VT.is256BitVector() && !Subtarget.hasInt256())
28121 return splitVectorIntBinary(Op, DAG);
28123 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28124 return splitVectorIntBinary(Op, DAG);
28126 SDValue A = Op.getOperand(0);
28127 SDValue B = Op.getOperand(1);
28129 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
28130 // vector pairs, multiply and truncate.
28131 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
28132 unsigned NumElts = VT.getVectorNumElements();
28134 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28135 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28136 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
28137 return DAG.getNode(
28138 ISD::TRUNCATE, dl, VT,
28139 DAG.getNode(ISD::MUL, dl, ExVT,
28140 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
28141 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
28144 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28146 // Extract the lo/hi parts to any extend to i16.
28147 // We're only going to keep the low byte of each result element of the
28148 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
28149 // input element.
28150 SDValue Undef = DAG.getUNDEF(VT);
28151 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
28152 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
28154 SDValue BLo, BHi;
28155 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28156 // If the RHS is a constant, manually unpackl/unpackh.
28157 SmallVector<SDValue, 16> LoOps, HiOps;
28158 for (unsigned i = 0; i != NumElts; i += 16) {
28159 for (unsigned j = 0; j != 8; ++j) {
28160 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
28161 MVT::i16));
28162 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
28163 MVT::i16));
28167 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28168 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28169 } else {
28170 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
28171 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
28174 // Multiply, mask the lower 8bits of the lo/hi results and pack.
28175 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
28176 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
28177 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28180 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
28181 if (VT == MVT::v4i32) {
28182 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
28183 "Should not custom lower when pmulld is available!");
28185 // Extract the odd parts.
28186 static const int UnpackMask[] = { 1, -1, 3, -1 };
28187 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
28188 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
28190 // Multiply the even parts.
28191 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28192 DAG.getBitcast(MVT::v2i64, A),
28193 DAG.getBitcast(MVT::v2i64, B));
28194 // Now multiply odd parts.
28195 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28196 DAG.getBitcast(MVT::v2i64, Aodds),
28197 DAG.getBitcast(MVT::v2i64, Bodds));
28199 Evens = DAG.getBitcast(VT, Evens);
28200 Odds = DAG.getBitcast(VT, Odds);
28202 // Merge the two vectors back together with a shuffle. This expands into 2
28203 // shuffles.
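// ShufMask {0, 4, 2, 6} picks the low i32 half of each i64 product, giving
// <a*e | b*f | c*g | d*h> (only the low 32 bits of each product are needed).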
28204 static const int ShufMask[] = { 0, 4, 2, 6 };
28205 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
28208 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
28209 "Only know how to lower V2I64/V4I64/V8I64 multiply");
28210 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
28212 // Ahi = psrlqi(a, 32);
28213 // Bhi = psrlqi(b, 32);
28215 // AloBlo = pmuludq(a, b);
28216 // AloBhi = pmuludq(a, Bhi);
28217 // AhiBlo = pmuludq(Ahi, b);
28219 // Hi = psllqi(AloBhi + AhiBlo, 32);
28220 // return AloBlo + Hi;
28221 KnownBits AKnown = DAG.computeKnownBits(A);
28222 KnownBits BKnown = DAG.computeKnownBits(B);
28224 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
28225 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
28226 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
28228 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
28229 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
28230 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
28232 SDValue Zero = DAG.getConstant(0, dl, VT);
28234 // Only multiply lo/hi halves that aren't known to be zero.
28235 SDValue AloBlo = Zero;
28236 if (!ALoIsZero && !BLoIsZero)
28237 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
28239 SDValue AloBhi = Zero;
28240 if (!ALoIsZero && !BHiIsZero) {
28241 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
28242 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
28245 SDValue AhiBlo = Zero;
28246 if (!AHiIsZero && !BLoIsZero) {
28247 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
28248 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
28251 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
28252 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
28254 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
28257 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
28258 MVT VT, bool IsSigned,
28259 const X86Subtarget &Subtarget,
28260 SelectionDAG &DAG,
28261 SDValue *Low = nullptr) {
28262 unsigned NumElts = VT.getVectorNumElements();
28264 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
28265 // to a vXi16 type. Do the multiplies, shift the results and pack the half
28266 // lane results back together.
28268 // We'll take different approaches for signed and unsigned.
28269 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
28270 // and use pmullw to calculate the full 16-bit product.
28271 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
28272 // shift them left into the upper byte of each word. This allows us to use
28273 // pmulhw to calculate the full 16-bit product. This trick means we don't
28274 // need to sign extend the bytes to use pmullw.
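// Concretely, (A << 8) * (B << 8) == (A * B) << 16, so the high 16 bits that
// pmulhw returns are exactly the full signed 16-bit product A * B.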
28276 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28277 SDValue Zero = DAG.getConstant(0, dl, VT);
28279 SDValue ALo, AHi;
28280 if (IsSigned) {
28281 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
28282 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
28283 } else {
28284 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
28285 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
28288 SDValue BLo, BHi;
28289 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28290 // If the RHS is a constant, manually unpackl/unpackh and extend.
28291 SmallVector<SDValue, 16> LoOps, HiOps;
28292 for (unsigned i = 0; i != NumElts; i += 16) {
28293 for (unsigned j = 0; j != 8; ++j) {
28294 SDValue LoOp = B.getOperand(i + j);
28295 SDValue HiOp = B.getOperand(i + j + 8);
28297 if (IsSigned) {
28298 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
28299 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
28300 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
28301 DAG.getConstant(8, dl, MVT::i16));
28302 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
28303 DAG.getConstant(8, dl, MVT::i16));
28304 } else {
28305 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
28306 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
28309 LoOps.push_back(LoOp);
28310 HiOps.push_back(HiOp);
28314 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28315 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28316 } else if (IsSigned) {
28317 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
28318 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
28319 } else {
28320 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
28321 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
28324 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
28325 // pack back to vXi8.
28326 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
28327 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
28328 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
28330 if (Low)
28331 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28333 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
28336 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
28337 SelectionDAG &DAG) {
28338 SDLoc dl(Op);
28339 MVT VT = Op.getSimpleValueType();
28340 bool IsSigned = Op->getOpcode() == ISD::MULHS;
28341 unsigned NumElts = VT.getVectorNumElements();
28342 SDValue A = Op.getOperand(0);
28343 SDValue B = Op.getOperand(1);
28345 // Decompose 256-bit ops into 128-bit ops.
28346 if (VT.is256BitVector() && !Subtarget.hasInt256())
28347 return splitVectorIntBinary(Op, DAG);
28349 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28350 return splitVectorIntBinary(Op, DAG);
28352 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
28353 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
28354 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
28355 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
28357 // PMULxD operations multiply each even value (starting at 0) of LHS with
28358 // the related value of RHS and produce a widened result.
28359 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28360 // => <2 x i64> <ae|cg>
28362 // In other words, to have all the results, we need to perform two PMULxD:
28363 // 1. one with the even values.
28364 // 2. one with the odd values.
28365 // To achieve #2, we need to place the odd values at an even position.
28367 // Place the odd value at an even position (basically, shift all values 1
28368 // step to the left):
28369 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
28370 9, -1, 11, -1, 13, -1, 15, -1};
28371 // <a|b|c|d> => <b|undef|d|undef>
28372 SDValue Odd0 =
28373 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
28374 // <e|f|g|h> => <f|undef|h|undef>
28375 SDValue Odd1 =
28376 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
28378 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
28379 // ints.
28380 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
28381 unsigned Opcode =
28382 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
28383 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28384 // => <2 x i64> <ae|cg>
28385 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28386 DAG.getBitcast(MulVT, A),
28387 DAG.getBitcast(MulVT, B)));
28388 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
28389 // => <2 x i64> <bf|dh>
28390 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28391 DAG.getBitcast(MulVT, Odd0),
28392 DAG.getBitcast(MulVT, Odd1)));
28394 // Shuffle it back into the right order.
28395 SmallVector<int, 16> ShufMask(NumElts);
28396 for (int i = 0; i != (int)NumElts; ++i)
28397 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
28399 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
28401 // If we have a signed multiply but no PMULDQ fix up the result of an
28402 // unsigned multiply.
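// This uses the identity mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0)
//                                                   - (b < 0 ? a : 0),
// with the compares producing all-ones masks that select B and A respectively.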
28403 if (IsSigned && !Subtarget.hasSSE41()) {
28404 SDValue Zero = DAG.getConstant(0, dl, VT);
28405 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
28406 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
28407 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
28408 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
28410 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
28411 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
28414 return Res;
28417 // Only i8 vectors should need custom lowering after this.
28418 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28419 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28420 "Unsupported vector type");
28422 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
28423 // logical shift down the upper half and pack back to i8.
28425 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
28426 // and then ashr/lshr the upper bits down to the lower bits before multiply.
28428 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28429 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28430 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28431 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28432 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28433 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28434 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28435 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28436 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28439 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
28442 // Custom lowering for SMULO/UMULO.
28443 static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
28444 SelectionDAG &DAG) {
28445 MVT VT = Op.getSimpleValueType();
28447 // Scalars defer to LowerXALUO.
28448 if (!VT.isVector())
28449 return LowerXALUO(Op, DAG);
28451 SDLoc dl(Op);
28452 bool IsSigned = Op->getOpcode() == ISD::SMULO;
28453 SDValue A = Op.getOperand(0);
28454 SDValue B = Op.getOperand(1);
28455 EVT OvfVT = Op->getValueType(1);
28457 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
28458 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
28459 // Extract the LHS Lo/Hi vectors
28460 SDValue LHSLo, LHSHi;
28461 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
28463 // Extract the RHS Lo/Hi vectors
28464 SDValue RHSLo, RHSHi;
28465 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
28467 EVT LoOvfVT, HiOvfVT;
28468 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
28469 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
28470 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
28472 // Issue the split operations.
28473 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
28474 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
28476 // Join the separate data results and the overflow results.
28477 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28478 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
28479 Hi.getValue(1));
28481 return DAG.getMergeValues({Res, Ovf}, dl);
28484 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28485 EVT SetccVT =
28486 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28488 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28489 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28490 unsigned NumElts = VT.getVectorNumElements();
28491 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28492 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28493 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28494 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28495 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28497 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28499 SDValue Ovf;
28500 if (IsSigned) {
28501 SDValue High, LowSign;
28502 if (OvfVT.getVectorElementType() == MVT::i1 &&
28503 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28504 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28505 // Shift the high down filling with sign bits.
28506 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
28507 // Fill all 16 bits with the sign bit from the low.
28508 LowSign =
28509 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
28510 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
28511 15, DAG);
28512 SetccVT = OvfVT;
28513 if (!Subtarget.hasBWI()) {
28514 // We can't do a vXi16 compare so sign extend to v16i32.
28515 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
28516 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
28518 } else {
28519 // Otherwise do the compare at vXi8.
28520 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28521 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28522 LowSign =
28523 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28526 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28527 } else {
28528 SDValue High =
28529 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28530 if (OvfVT.getVectorElementType() == MVT::i1 &&
28531 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28532 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28533 SetccVT = OvfVT;
28534 if (!Subtarget.hasBWI()) {
28535 // We can't do a vXi16 compare so sign extend to v16i32.
28536 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
28538 } else {
28539 // Otherwise do the compare at vXi8.
28540 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28543 Ovf =
28544 DAG.getSetCC(dl, SetccVT, High,
28545 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
28548 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28550 return DAG.getMergeValues({Low, Ovf}, dl);
28553 SDValue Low;
28554 SDValue High =
28555 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
28557 SDValue Ovf;
28558 if (IsSigned) {
28559 // SMULO overflows if the high bits don't match the sign of the low.
28560 SDValue LowSign =
28561 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28562 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28563 } else {
28564 // UMULO overflows if the high bits are non-zero.
28565 Ovf =
28566 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
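// Worked i8 example of both overflow checks (values chosen for illustration):
// A = 100, B = 2. The full 16-bit product is 200 (0x00C8), so Low = 0xC8 and
// High = 0x00. Signed: Low is negative, so LowSign = 0xFF != High and SMULO
// reports overflow (200 does not fit in a signed i8). Unsigned: High == 0, so
// UMULO reports no overflow (200 fits in an unsigned i8).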
28569 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28571 return DAG.getMergeValues({Low, Ovf}, dl);
28574 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
28575 assert(Subtarget.isTargetWin64() && "Unexpected target");
28576 EVT VT = Op.getValueType();
28577 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28578 "Unexpected return type for lowering");
28580 if (isa<ConstantSDNode>(Op->getOperand(1))) {
28581 SmallVector<SDValue> Result;
28582 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
28583 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
28586 RTLIB::Libcall LC;
28587 bool isSigned;
28588 switch (Op->getOpcode()) {
28589 default: llvm_unreachable("Unexpected request for libcall!");
28590 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
28591 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
28592 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
28593 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
28596 SDLoc dl(Op);
28597 SDValue InChain = DAG.getEntryNode();
28599 TargetLowering::ArgListTy Args;
28600 TargetLowering::ArgListEntry Entry;
28601 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
28602 EVT ArgVT = Op->getOperand(i).getValueType();
28603 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28604 "Unexpected argument type for lowering");
28605 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28606 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28607 MachinePointerInfo MPI =
28608 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28609 Entry.Node = StackPtr;
28610 InChain =
28611 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
28612 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
28613 Entry.Ty = PointerType::get(ArgTy, 0);
28614 Entry.IsSExt = false;
28615 Entry.IsZExt = false;
28616 Args.push_back(Entry);
28619 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
28620 getPointerTy(DAG.getDataLayout()));
28622 TargetLowering::CallLoweringInfo CLI(DAG);
28623 CLI.setDebugLoc(dl)
28624 .setChain(InChain)
28625 .setLibCallee(
28626 getLibcallCallingConv(LC),
28627 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
28628 std::move(Args))
28629 .setInRegister()
28630 .setSExtResult(isSigned)
28631 .setZExtResult(!isSigned);
28633 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
28634 return DAG.getBitcast(VT, CallInfo.first);
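// Conceptually (an illustrative sketch, not the exact emitted nodes), an i128
// division such as
//   %q = sdiv i128 %a, %b
// becomes, roughly,
//   store i128 %a to a 16-byte-aligned stack slot %pa
//   store i128 %b to a 16-byte-aligned stack slot %pb
//   %v = call <2 x i64> @__divti3(ptr %pa, ptr %pb)   ; returned in XMM0
//   %q = bitcast <2 x i64> %v to i128
// i.e. both operands are passed indirectly and the 128-bit result comes back
// in a vector register.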
28637 SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
28638 SelectionDAG &DAG,
28639 SDValue &Chain) const {
28640 assert(Subtarget.isTargetWin64() && "Unexpected target");
28641 EVT VT = Op.getValueType();
28642 bool IsStrict = Op->isStrictFPOpcode();
28644 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
28645 EVT ArgVT = Arg.getValueType();
28647 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28648 "Unexpected return type for lowering");
28650 RTLIB::Libcall LC;
28651 if (Op->getOpcode() == ISD::FP_TO_SINT ||
28652 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
28653 LC = RTLIB::getFPTOSINT(ArgVT, VT);
28654 else
28655 LC = RTLIB::getFPTOUINT(ArgVT, VT);
28656 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
28658 SDLoc dl(Op);
28659 MakeLibCallOptions CallOptions;
28660 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
28662 SDValue Result;
28663 // The i128 result is returned as a v2i64 in xmm0; cast it back to the
28664 // expected VT (i128).
28665 std::tie(Result, Chain) =
28666 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
28667 Result = DAG.getBitcast(VT, Result);
28668 return Result;
28671 SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
28672 SelectionDAG &DAG) const {
28673 assert(Subtarget.isTargetWin64() && "Unexpected target");
28674 EVT VT = Op.getValueType();
28675 bool IsStrict = Op->isStrictFPOpcode();
28677 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
28678 EVT ArgVT = Arg.getValueType();
28680 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28681 "Unexpected argument type for lowering");
28683 RTLIB::Libcall LC;
28684 if (Op->getOpcode() == ISD::SINT_TO_FP ||
28685 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
28686 LC = RTLIB::getSINTTOFP(ArgVT, VT);
28687 else
28688 LC = RTLIB::getUINTTOFP(ArgVT, VT);
28689 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
28691 SDLoc dl(Op);
28692 MakeLibCallOptions CallOptions;
28693 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
28695 // Pass the i128 argument as an indirect argument on the stack.
28696 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28697 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28698 MachinePointerInfo MPI =
28699 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28700 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
28702 SDValue Result;
28703 std::tie(Result, Chain) =
28704 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
28705 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
28708 // Return true if the required (according to Opcode) shift-imm form is natively
28709 // supported by the Subtarget
28710 static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
28711 unsigned Opcode) {
28712 if (!VT.isSimple())
28713 return false;
28715 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
28716 return false;
28718 if (VT.getScalarSizeInBits() < 16)
28719 return false;
28721 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
28722 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
28723 return true;
28725 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
28726 (VT.is256BitVector() && Subtarget.hasInt256());
28728 bool AShift = LShift && (Subtarget.hasAVX512() ||
28729 (VT != MVT::v2i64 && VT != MVT::v4i64));
28730 return (Opcode == ISD::SRA) ? AShift : LShift;
28733 // The shift amount is a variable, but it is the same for all vector lanes.
28734 // These instructions are defined together with shift-immediate.
28735 static
28736 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
28737 unsigned Opcode) {
28738 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
28741 // Return true if the required (according to Opcode) variable-shift form is
28742 // natively supported by the Subtarget
28743 static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
28744 unsigned Opcode) {
28745 if (!VT.isSimple())
28746 return false;
28748 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
28749 return false;
28751 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
28752 return false;
28754 // vXi16 supported only on AVX-512, BWI
28755 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
28756 return false;
28758 if (Subtarget.hasAVX512() &&
28759 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
28760 return true;
28762 bool LShift = VT.is128BitVector() || VT.is256BitVector();
28763 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
28764 return (Opcode == ISD::SRA) ? AShift : LShift;
28767 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
28768 const X86Subtarget &Subtarget) {
28769 MVT VT = Op.getSimpleValueType();
28770 SDLoc dl(Op);
28771 SDValue R = Op.getOperand(0);
28772 SDValue Amt = Op.getOperand(1);
28773 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
28775 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
28776 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
28777 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
28778 SDValue Ex = DAG.getBitcast(ExVT, R);
28780 // ashr(R, 63) === cmp_slt(R, 0)
28781 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
28782 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
28783 "Unsupported PCMPGT op");
28784 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
28787 if (ShiftAmt >= 32) {
28788 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
28789 SDValue Upper =
28790 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
28791 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28792 ShiftAmt - 32, DAG);
28793 if (VT == MVT::v2i64)
28794 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
28795 if (VT == MVT::v4i64)
28796 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28797 {9, 1, 11, 3, 13, 5, 15, 7});
28798 } else {
28799 // SRA upper i32, SRL whole i64 and select lower i32.
28800 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28801 ShiftAmt, DAG);
28802 SDValue Lower =
28803 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
28804 Lower = DAG.getBitcast(ExVT, Lower);
28805 if (VT == MVT::v2i64)
28806 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
28807 if (VT == MVT::v4i64)
28808 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28809 {8, 1, 10, 3, 12, 5, 14, 7});
28811 return DAG.getBitcast(VT, Ex);
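// Worked example for the >= 32 case (values chosen for illustration): an i64
// arithmetic shift right by 40, with the element viewed as 32-bit halves
// (hi, lo). The result's low 32 bits are sra(hi, 40 - 32) = sra(hi, 8) and its
// high 32 bits are sra(hi, 31), a splat of the sign bit - exactly the Upper
// and Lower nodes that the shuffles above interleave back together.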
28814 // Optimize shl/srl/sra with constant shift amount.
28815 APInt APIntShiftAmt;
28816 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
28817 return SDValue();
28819 // If the shift amount is out of range, return undef.
28820 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
28821 return DAG.getUNDEF(VT);
28823 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
28825 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
28826 // Hardware support for vector shifts is sparse, which makes us scalarize the
28827 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
28828 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
28829 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
28830 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
28831 // must be 0). (add undef, undef) however can be any value. To make this
28832 // safe, we must freeze R to ensure that register allocation uses the same
28833 // register for an undefined value. This ensures that the result will
28834 // still be even and preserves the original semantics.
28835 R = DAG.getFreeze(R);
28836 return DAG.getNode(ISD::ADD, dl, VT, R, R);
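// Illustration: without the freeze, each use of an undef R could be
// materialized independently, e.g. as 3 and 4, so the add could produce 7,
// which is odd and therefore not a valid (shl R, 1). Freezing R pins both
// operands to a single value v, so the sum 2 * v is always even.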
28839 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
28842 // i64 SRA needs to be performed as partial shifts.
28843 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
28844 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
28845 Op.getOpcode() == ISD::SRA)
28846 return ArithmeticShiftRight64(ShiftAmt);
28848 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
28849 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
28850 unsigned NumElts = VT.getVectorNumElements();
28851 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28853 // Simple i8 add case
28854 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
28855 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
28856 // must be 0). (add undef, undef) however can be any value. To make this
28857 // safe, we must freeze R to ensure that register allocation uses the same
28858 // register for an undefined value. This ensures that the result will
28859 // still be even and preserves the original semantics.
28860 R = DAG.getFreeze(R);
28861 return DAG.getNode(ISD::ADD, dl, VT, R, R);
28864 // ashr(R, 7) === cmp_slt(R, 0)
28865 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
28866 SDValue Zeros = DAG.getConstant(0, dl, VT);
28867 if (VT.is512BitVector()) {
28868 assert(VT == MVT::v64i8 && "Unexpected element type!");
28869 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
28870 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
28872 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
28875 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
28876 if (VT == MVT::v16i8 && Subtarget.hasXOP())
28877 return SDValue();
28879 if (Op.getOpcode() == ISD::SHL) {
28880 // Make a large shift.
28881 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
28882 ShiftAmt, DAG);
28883 SHL = DAG.getBitcast(VT, SHL);
28884 // Zero out the rightmost bits.
28885 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
28886 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
28888 if (Op.getOpcode() == ISD::SRL) {
28889 // Make a large shift.
28890 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
28891 ShiftAmt, DAG);
28892 SRL = DAG.getBitcast(VT, SRL);
28893 // Zero out the leftmost bits.
28894 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
28895 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
28897 if (Op.getOpcode() == ISD::SRA) {
28898 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
28899 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28901 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
28902 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
28903 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
28904 return Res;
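// Worked example (values chosen for illustration): ShiftAmt = 3 and an i8
// lane R = 0xF0 (-16). The logical shift gives 0x1E, Mask = 128 >> 3 = 0x10,
// xor(0x1E, 0x10) = 0x0E, and 0x0E - 0x10 = 0xFE (-2), matching the
// arithmetic shift -16 >> 3 = -2.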
28906 llvm_unreachable("Unknown shift opcode.");
28909 return SDValue();
28912 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
28913 const X86Subtarget &Subtarget) {
28914 MVT VT = Op.getSimpleValueType();
28915 SDLoc dl(Op);
28916 SDValue R = Op.getOperand(0);
28917 SDValue Amt = Op.getOperand(1);
28918 unsigned Opcode = Op.getOpcode();
28919 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
28921 int BaseShAmtIdx = -1;
28922 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
28923 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
28924 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
28925 Subtarget, DAG);
28927 // vXi8 shifts - shift as v8i16 + mask result.
28928 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
28929 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
28930 VT == MVT::v64i8) &&
28931 !Subtarget.hasXOP()) {
28932 unsigned NumElts = VT.getVectorNumElements();
28933 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28934 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
28935 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
28936 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
28938 // Create the mask using vXi16 shifts. For shift-rights we need to move
28939 // the upper byte down before splatting the vXi8 mask.
28940 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
28941 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
28942 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
28943 if (Opcode != ISD::SHL)
28944 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
28945 8, DAG);
28946 BitMask = DAG.getBitcast(VT, BitMask);
28947 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
28948 SmallVector<int, 64>(NumElts, 0));
28950 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
28951 DAG.getBitcast(ExtVT, R), BaseShAmt,
28952 BaseShAmtIdx, Subtarget, DAG);
28953 Res = DAG.getBitcast(VT, Res);
28954 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
28956 if (Opcode == ISD::SRA) {
28957 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
28958 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
28959 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
28960 SignMask =
28961 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
28962 BaseShAmtIdx, Subtarget, DAG);
28963 SignMask = DAG.getBitcast(VT, SignMask);
28964 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
28965 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
28967 return Res;
28972 return SDValue();
28975 // Convert a shift/rotate left amount to a multiplication scale factor.
28976 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
28977 const X86Subtarget &Subtarget,
28978 SelectionDAG &DAG) {
28979 MVT VT = Amt.getSimpleValueType();
28980 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
28981 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
28982 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
28983 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
28984 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
28985 (Subtarget.hasBWI() && VT == MVT::v64i8)))
28986 return SDValue();
28988 MVT SVT = VT.getVectorElementType();
28989 unsigned SVTBits = SVT.getSizeInBits();
28990 unsigned NumElems = VT.getVectorNumElements();
28992 APInt UndefElts;
28993 SmallVector<APInt> EltBits;
28994 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
28995 APInt One(SVTBits, 1);
28996 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
28997 for (unsigned I = 0; I != NumElems; ++I) {
28998 if (UndefElts[I] || EltBits[I].uge(SVTBits))
28999 continue;
29000 uint64_t ShAmt = EltBits[I].getZExtValue();
29001 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
29003 return DAG.getBuildVector(VT, dl, Elts);
29006 // If the target doesn't support variable shifts, use either FP conversion
29007 // or integer multiplication to avoid shifting each element individually.
29008 if (VT == MVT::v4i32) {
29009 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
29010 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
29011 DAG.getConstant(0x3f800000U, dl, VT));
29012 Amt = DAG.getBitcast(MVT::v4f32, Amt);
29013 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
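// Worked example (values chosen for illustration): a lane with Amt = 5
// becomes (5 << 23) + 0x3f800000 = 0x42000000, the IEEE-754 single-precision
// encoding of 32.0, so the FP_TO_SINT produces 2^5 - the desired scale
// factor.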
29016 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
29017 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
29018 SDValue Z = DAG.getConstant(0, dl, VT);
29019 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
29020 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
29021 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
29022 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
29023 if (Subtarget.hasSSE41())
29024 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29025 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
29028 return SDValue();
29031 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
29032 SelectionDAG &DAG) {
29033 MVT VT = Op.getSimpleValueType();
29034 SDLoc dl(Op);
29035 SDValue R = Op.getOperand(0);
29036 SDValue Amt = Op.getOperand(1);
29037 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29038 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29040 unsigned Opc = Op.getOpcode();
29041 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
29042 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
29044 assert(VT.isVector() && "Custom lowering only for vector shifts!");
29045 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
29047 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
29048 return V;
29050 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
29051 return V;
29053 if (supportedVectorVarShift(VT, Subtarget, Opc))
29054 return Op;
29056 // i64 vector arithmetic shift can be emulated with the transform:
29057 // M = lshr(SIGN_MASK, Amt)
29058 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
29059 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
29060 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
29061 Opc == ISD::SRA) {
29062 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
29063 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
29064 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29065 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
29066 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
29067 return R;
29070 // XOP has 128-bit variable logical/arithmetic shifts.
29071 // +ve/-ve Amt = shift left/right.
29072 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
29073 VT == MVT::v8i16 || VT == MVT::v16i8)) {
29074 if (Opc == ISD::SRL || Opc == ISD::SRA) {
29075 SDValue Zero = DAG.getConstant(0, dl, VT);
29076 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
29078 if (Opc == ISD::SHL || Opc == ISD::SRL)
29079 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
29080 if (Opc == ISD::SRA)
29081 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
29084 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
29085 // shifts per-lane and then shuffle the partial results back together.
29086 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
29087 // Splat the shift amounts so the scalar shifts above will catch it.
29088 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
29089 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
29090 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
29091 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
29092 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
29095 // If possible, lower this shift as a sequence of two shifts by
29096 // constant plus a BLENDing shuffle instead of scalarizing it.
29097 // Example:
29098 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
29100 // Could be rewritten as:
29101 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
29103 // The advantage is that the two shifts from the example would be
29104 // lowered as X86ISD::VSRLI nodes in parallel before blending.
29105 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
29106 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29107 SDValue Amt1, Amt2;
29108 unsigned NumElts = VT.getVectorNumElements();
29109 SmallVector<int, 8> ShuffleMask;
29110 for (unsigned i = 0; i != NumElts; ++i) {
29111 SDValue A = Amt->getOperand(i);
29112 if (A.isUndef()) {
29113 ShuffleMask.push_back(SM_SentinelUndef);
29114 continue;
29116 if (!Amt1 || Amt1 == A) {
29117 ShuffleMask.push_back(i);
29118 Amt1 = A;
29119 continue;
29121 if (!Amt2 || Amt2 == A) {
29122 ShuffleMask.push_back(i + NumElts);
29123 Amt2 = A;
29124 continue;
29126 break;
29129 // Only perform this blend if we can perform it without loading a mask.
29130 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
29131 (VT != MVT::v16i16 ||
29132 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
29133 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
29134 canWidenShuffleElements(ShuffleMask))) {
29135 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
29136 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
29137 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
29138 Cst2->getAPIntValue().ult(EltSizeInBits)) {
29139 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29140 Cst1->getZExtValue(), DAG);
29141 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29142 Cst2->getZExtValue(), DAG);
29143 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
29148 // If possible, lower this packed shift into a vector multiply instead of
29149 // expanding it into a sequence of scalar shifts.
29150 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
29151 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
29152 Subtarget.canExtendTo512BW())))
29153 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
29154 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
29156 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
29157 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
29158 if (Opc == ISD::SRL && ConstantAmt &&
29159 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29160 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29161 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29162 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29163 SDValue Zero = DAG.getConstant(0, dl, VT);
29164 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
29165 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
29166 return DAG.getSelect(dl, VT, ZAmt, R, Res);
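// Worked example (values chosen for illustration): for an i16 lane shifted
// right by 3, the scale is 2^(16 - 3) = 8192 and MULHU computes
// (x * 8192) >> 16 = x >> 3, e.g. x = 100 gives 819200 >> 16 = 12 = 100 >> 3.
// The select covers Amt == 0 lanes, whose scale 2^16 is not representable in
// 16 bits.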
29170 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
29171 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
29172 // TODO: Special case handling for shift by 0/1, really we can afford either
29173 // of these cases in pre-SSE41/XOP/AVX512 but not both.
29174 if (Opc == ISD::SRA && ConstantAmt &&
29175 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
29176 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
29177 !Subtarget.hasAVX512()) ||
29178 DAG.isKnownNeverZero(Amt))) {
29179 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29180 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29181 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29182 SDValue Amt0 =
29183 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
29184 SDValue Amt1 =
29185 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
29186 SDValue Sra1 =
29187 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
29188 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
29189 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
29190 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
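// The two selects cover the lanes the MULHS trick cannot: for Amt == 0 the
// scale 2^16 does not fit in 16 bits, and for Amt == 1 the scale 2^15 is
// 0x8000, which MULHS treats as -32768 and so flips the sign of the result.
// Those lanes fall back to R and to an explicit arithmetic shift by 1.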
29194 // v4i32 Non Uniform Shifts.
29195 // If the shift amount is constant we can shift each lane using the SSE2
29196 // immediate shifts, else we need to zero-extend each lane to the lower i64
29197 // and shift using the SSE2 variable shifts.
29198 // The separate results can then be blended together.
29199 if (VT == MVT::v4i32) {
29200 SDValue Amt0, Amt1, Amt2, Amt3;
29201 if (ConstantAmt) {
29202 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
29203 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
29204 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
29205 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
29206 } else {
29207 // The SSE2 shifts use the lower i64 as the same shift amount for
29208 // all lanes and the upper i64 is ignored. On AVX we're better off
29209 // just zero-extending, but for SSE just duplicating the top 16-bits is
29210 // cheaper and has the same effect for out of range values.
29211 if (Subtarget.hasAVX()) {
29212 SDValue Z = DAG.getConstant(0, dl, VT);
29213 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
29214 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
29215 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
29216 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
29217 } else {
29218 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
29219 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29220 {4, 5, 6, 7, -1, -1, -1, -1});
29221 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
29222 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
29223 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
29224 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
29225 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
29226 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
29230 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
29231 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
29232 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
29233 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
29234 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
29236 // Merge the shifted lane results optimally with/without PBLENDW.
29237 // TODO - ideally shuffle combining would handle this.
29238 if (Subtarget.hasSSE41()) {
29239 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
29240 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
29241 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
29243 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
29244 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
29245 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
29248 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
29249 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
29250 // make the existing SSE solution better.
29251 // NOTE: We honor the preferred vector width before promoting to 512 bits.
29252 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
29253 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
29254 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
29255 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
29256 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
29257 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
29258 "Unexpected vector type");
29259 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
29260 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
29261 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29262 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
29263 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
29264 return DAG.getNode(ISD::TRUNCATE, dl, VT,
29265 DAG.getNode(Opc, dl, ExtVT, R, Amt));
29268 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
29269 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
29270 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
29271 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29272 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29273 !Subtarget.hasXOP()) {
29274 int NumElts = VT.getVectorNumElements();
29275 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
29277 // Extend constant shift amount to vXi16 (it doesn't matter if the type
29278 // isn't legal).
29279 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29280 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
29281 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
29282 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
29283 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
29284 "Constant build vector expected");
29286 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
29287 bool IsSigned = Opc == ISD::SRA;
29288 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
29289 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
29290 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
29291 return DAG.getZExtOrTrunc(R, dl, VT);
29294 SmallVector<SDValue, 16> LoAmt, HiAmt;
29295 for (int i = 0; i != NumElts; i += 16) {
29296 for (int j = 0; j != 8; ++j) {
29297 LoAmt.push_back(Amt.getOperand(i + j));
29298 HiAmt.push_back(Amt.getOperand(i + j + 8));
29302 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
29303 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
29304 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
29306 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
29307 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
29308 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
29309 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
29310 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
29311 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
29312 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
29313 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
29314 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
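// Worked example (values chosen for illustration): an unsigned i8 lane
// x = 200 shifted right by 3 uses the per-lane factor 2^(8 - 3) = 32; the
// widened multiply gives 200 * 32 = 6400 and the final >> 8 yields 25, which
// is 200 >> 3. The SRA variant works the same way on sign-extended lanes.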
29317 if (VT == MVT::v16i8 ||
29318 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
29319 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
29320 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
29322 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29323 if (VT.is512BitVector()) {
29324 // On AVX512BW targets we make use of the fact that VSELECT lowers
29325 // to a masked blend which selects bytes based just on the sign bit
29326 // extracted to a mask.
29327 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
29328 V0 = DAG.getBitcast(VT, V0);
29329 V1 = DAG.getBitcast(VT, V1);
29330 Sel = DAG.getBitcast(VT, Sel);
29331 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
29332 ISD::SETGT);
29333 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
29334 } else if (Subtarget.hasSSE41()) {
29335 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29336 // on the sign bit.
29337 V0 = DAG.getBitcast(VT, V0);
29338 V1 = DAG.getBitcast(VT, V1);
29339 Sel = DAG.getBitcast(VT, Sel);
29340 return DAG.getBitcast(SelVT,
29341 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
29343 // On pre-SSE41 targets we test for the sign bit by comparing to
29344 // zero - a negative value will set all bits of the lanes to true
29345 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29346 SDValue Z = DAG.getConstant(0, dl, SelVT);
29347 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
29348 return DAG.getSelect(dl, SelVT, C, V0, V1);
29351 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29352 // We can safely do this using i16 shifts as we're only interested in
29353 // the 3 lower bits of each byte.
29354 Amt = DAG.getBitcast(ExtVT, Amt);
29355 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
29356 Amt = DAG.getBitcast(VT, Amt);
29358 if (Opc == ISD::SHL || Opc == ISD::SRL) {
29359 // r = VSELECT(r, shift(r, 4), a);
29360 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
29361 R = SignBitSelect(VT, Amt, M, R);
29363 // a += a
29364 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29366 // r = VSELECT(r, shift(r, 2), a);
29367 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
29368 R = SignBitSelect(VT, Amt, M, R);
29370 // a += a
29371 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29373 // return VSELECT(r, shift(r, 1), a);
29374 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
29375 R = SignBitSelect(VT, Amt, M, R);
29376 return R;
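// Worked example (values chosen for illustration): a lane with shift amount
// 5 (binary 101). After "a << 5" the amount's bit 2 sits in the byte's sign
// bit, so the first blend applies the shift by 4; "a += a" then exposes bit 1
// (clear, so the shift by 2 is skipped) and finally bit 0 (set, so the shift
// by 1 is applied), for a total shift of 4 + 1 = 5.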
29379 if (Opc == ISD::SRA) {
29380 // For SRA we need to unpack each byte to the higher byte of a i16 vector
29381 // so we can correctly sign extend. We don't care what happens to the
29382 // lower byte.
29383 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29384 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29385 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
29386 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
29387 ALo = DAG.getBitcast(ExtVT, ALo);
29388 AHi = DAG.getBitcast(ExtVT, AHi);
29389 RLo = DAG.getBitcast(ExtVT, RLo);
29390 RHi = DAG.getBitcast(ExtVT, RHi);
29392 // r = VSELECT(r, shift(r, 4), a);
29393 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
29394 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
29395 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29396 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29398 // a += a
29399 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29400 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29402 // r = VSELECT(r, shift(r, 2), a);
29403 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
29404 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
29405 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29406 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29408 // a += a
29409 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29410 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29412 // r = VSELECT(r, shift(r, 1), a);
29413 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
29414 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
29415 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29416 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29418 // Logical shift the result back to the lower byte, leaving a zero upper
29419 // byte meaning that we can safely pack with PACKUSWB.
29420 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
29421 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
29422 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
29426 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
29427 MVT ExtVT = MVT::v8i32;
29428 SDValue Z = DAG.getConstant(0, dl, VT);
29429 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
29430 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
29431 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
29432 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
29433 ALo = DAG.getBitcast(ExtVT, ALo);
29434 AHi = DAG.getBitcast(ExtVT, AHi);
29435 RLo = DAG.getBitcast(ExtVT, RLo);
29436 RHi = DAG.getBitcast(ExtVT, RHi);
29437 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
29438 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
29439 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
29440 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
29441 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29444 if (VT == MVT::v8i16) {
29445 // If we have a constant shift amount, the non-SSE41 path is best as
29446 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
29447 bool UseSSE41 = Subtarget.hasSSE41() &&
29448 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29450 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
29451 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
29452 // the sign bit.
29453 if (UseSSE41) {
29454 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
29455 V0 = DAG.getBitcast(ExtVT, V0);
29456 V1 = DAG.getBitcast(ExtVT, V1);
29457 Sel = DAG.getBitcast(ExtVT, Sel);
29458 return DAG.getBitcast(
29459 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
29461 // On pre-SSE41 targets we splat the sign bit - a negative value will
29462 // set all bits of the lanes to true and VSELECT uses that in
29463 // its OR(AND(V0,C),AND(V1,~C)) lowering.
29464 SDValue C =
29465 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
29466 return DAG.getSelect(dl, VT, C, V0, V1);
29469 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
29470 if (UseSSE41) {
29471 // On SSE41 targets we need to replicate the shift mask in both
29472 // bytes for PBLENDVB.
29473 Amt = DAG.getNode(
29474 ISD::OR, dl, VT,
29475 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
29476 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
29477 } else {
29478 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
29481 // r = VSELECT(r, shift(r, 8), a);
29482 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
29483 R = SignBitSelect(Amt, M, R);
29485 // a += a
29486 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29488 // r = VSELECT(r, shift(r, 4), a);
29489 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
29490 R = SignBitSelect(Amt, M, R);
29492 // a += a
29493 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29495 // r = VSELECT(r, shift(r, 2), a);
29496 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
29497 R = SignBitSelect(Amt, M, R);
29499 // a += a
29500 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29502 // return VSELECT(r, shift(r, 1), a);
29503 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
29504 R = SignBitSelect(Amt, M, R);
29505 return R;
29508 // Decompose 256-bit shifts into 128-bit shifts.
29509 if (VT.is256BitVector())
29510 return splitVectorIntBinary(Op, DAG);
29512 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29513 return splitVectorIntBinary(Op, DAG);
29515 return SDValue();
29518 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
29519 SelectionDAG &DAG) {
29520 MVT VT = Op.getSimpleValueType();
29521 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
29522 "Unexpected funnel shift opcode!");
29524 SDLoc DL(Op);
29525 SDValue Op0 = Op.getOperand(0);
29526 SDValue Op1 = Op.getOperand(1);
29527 SDValue Amt = Op.getOperand(2);
29528 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29529 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
29531 if (VT.isVector()) {
29532 APInt APIntShiftAmt;
29533 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
29535 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
29536 if (IsFSHR)
29537 std::swap(Op0, Op1);
29539 if (IsCstSplat) {
29540 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29541 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
29542 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
29543 {Op0, Op1, Imm}, DAG, Subtarget);
29545 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
29546 {Op0, Op1, Amt}, DAG, Subtarget);
29548 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
29549 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
29550 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
29551 "Unexpected funnel shift type!");
29553 // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
29554 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
29555 if (IsCstSplat) {
29556 // TODO: Can't use generic expansion as UNDEF amt elements can be
29557 // converted to other values when folded to shift amounts, losing the
29558 // splat.
29559 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29560 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
29561 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
29562 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
29563 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
29564 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
29565 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
29566 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
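// Worked example (values chosen for illustration): a vXi8 fshl by a splat of
// 3 becomes (Op0 << 3) | (Op1 >> 5): each result byte takes its upper 5 bits
// from the low 5 bits of Op0 and its lower 3 bits from the high 3 bits of
// Op1. An fshr by 3 swaps the roles, computing (Op0 << 5) | (Op1 >> 3).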
29569 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
29570 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
29571 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
29573 // Constant vXi16 funnel shifts can be efficiently handled by default.
29574 if (IsCst && EltSizeInBits == 16)
29575 return SDValue();
29577 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
29578 unsigned NumElts = VT.getVectorNumElements();
29579 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
29580 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
29582 // Split 256-bit integers on XOP/pre-AVX2 targets.
29583 // Split 512-bit integers on non 512-bit BWI targets.
29584 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
29585 !Subtarget.hasAVX2())) ||
29586 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
29587 EltSizeInBits < 32)) {
29588 // Pre-mask the amount modulo using the wider vector.
29589 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
29590 return splitVectorOp(Op, DAG);
29593 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
29594 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
29595 int ScalarAmtIdx = -1;
29596 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
29597 // Uniform vXi16 funnel shifts can be efficiently handled by default.
29598 if (EltSizeInBits == 16)
29599 return SDValue();
29601 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
29602 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
29603 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
29604 ScalarAmtIdx, Subtarget, DAG);
29605 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
29606 ScalarAmtIdx, Subtarget, DAG);
29607 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
29611 MVT WideSVT = MVT::getIntegerVT(
29612 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
29613 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
29615 // If per-element shifts are legal, fallback to generic expansion.
29616 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
29617 return SDValue();
29619 // Attempt to fold as:
29620 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
29621 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
29622 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
29623 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
29624 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
29625 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
29626 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
29627 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
29628 EltSizeInBits, DAG);
29629 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
29630 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
29631 if (!IsFSHR)
29632 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
29633 EltSizeInBits, DAG);
29634 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
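// Worked example (values chosen for illustration): i8 lanes with Op0 = 0xAB,
// Op1 = 0xCD and amount 3. The widened lane is 0xABCD; fshl shifts it left by
// 3 to 0x5E68 and the final >> 8 leaves 0x5E = (0xAB << 3) | (0xCD >> 5),
// while fshr shifts 0xABCD right by 3 and truncates to 0x79 =
// (0xCD >> 3) | (0xAB << 5).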
29637 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
29638 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
29639 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
29640 SDValue Z = DAG.getConstant(0, DL, VT);
29641 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
29642 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
29643 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
29644 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
29645 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
29646 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
29647 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
29650 // Fallback to generic expansion.
29651 return SDValue();
29653 assert(
29654 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
29655 "Unexpected funnel shift type!");
29657 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
29658 bool OptForSize = DAG.shouldOptForSize();
29659 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
29661 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
29662 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
29663 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
29664 !isa<ConstantSDNode>(Amt)) {
29665 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
29666 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
29667 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
29668 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
29669 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
29670 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
29671 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
29672 if (IsFSHR) {
29673 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
29674 } else {
29675 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
29676 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
29678 return DAG.getZExtOrTrunc(Res, DL, VT);
29681 if (VT == MVT::i8 || ExpandFunnel)
29682 return SDValue();
29684 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
29685 if (VT == MVT::i16) {
29686 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
29687 DAG.getConstant(15, DL, Amt.getValueType()));
29688 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
29689 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
29692 return Op;
29695 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
29696 SelectionDAG &DAG) {
29697 MVT VT = Op.getSimpleValueType();
29698 assert(VT.isVector() && "Custom lowering only for vector rotates!");
29700 SDLoc DL(Op);
29701 SDValue R = Op.getOperand(0);
29702 SDValue Amt = Op.getOperand(1);
29703 unsigned Opcode = Op.getOpcode();
29704 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29705 int NumElts = VT.getVectorNumElements();
29706 bool IsROTL = Opcode == ISD::ROTL;
29708 // Check for constant splat rotation amount.
29709 APInt CstSplatValue;
29710 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
29712 // Check for splat rotate by zero.
29713 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
29714 return R;
29716 // AVX512 implicitly uses modulo rotation amounts.
29717 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
29718 // Attempt to rotate by immediate.
29719 if (IsCstSplat) {
29720 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
29721 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
29722 return DAG.getNode(RotOpc, DL, VT, R,
29723 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
29726 // Else, fall-back on VPROLV/VPRORV.
29727 return Op;
29730 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
29731 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
29732 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
29733 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
29736 SDValue Z = DAG.getConstant(0, DL, VT);
29738 if (!IsROTL) {
29739 // If the ISD::ROTR amount is constant, we're always better converting to
29740 // ISD::ROTL.
29741 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
29742 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
29744 // XOP targets always prefer ISD::ROTL.
29745 if (Subtarget.hasXOP())
29746 return DAG.getNode(ISD::ROTL, DL, VT, R,
29747 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
29750 // Split 256-bit integers on XOP/pre-AVX2 targets.
29751 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
29752 return splitVectorIntBinary(Op, DAG);
29754 // XOP has 128-bit vector variable + immediate rotates.
29755 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
29756 // XOP implicitly uses modulo rotation amounts.
29757 if (Subtarget.hasXOP()) {
29758 assert(IsROTL && "Only ROTL expected");
29759 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
29761 // Attempt to rotate by immediate.
29762 if (IsCstSplat) {
29763 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
29764 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
29765 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
29768 // Use general rotate by variable (per-element).
29769 return Op;
29772 // Rotate by a uniform constant - expand back to shifts.
29773 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
29774 // to other values when folded to shift amounts, losing the splat.
29775 if (IsCstSplat) {
29776 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
29777 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
29778 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
29779 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
29780 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
29781 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
29782 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
29783 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
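// Worked example (values chosen for illustration): an i8 lane 0xB1 rotated
// left by 2 expands to (0xB1 << 2) | (0xB1 >> 6) = 0xC4 | 0x02 = 0xC6; a
// rotate right by 2 simply swaps the two shift amounts.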
29786 // Split 512-bit integers on non 512-bit BWI targets.
29787 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
29788 return splitVectorIntBinary(Op, DAG);
29790 assert(
29791 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
29792 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
29793 Subtarget.hasAVX2()) ||
29794 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
29795 "Only vXi32/vXi16/vXi8 vector rotates supported");
29797 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
29798 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
29800 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
29801 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
29803 // Attempt to fold as unpack(x,x) << zext(splat(y)):
29804 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
29805 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
29806 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
29807 int BaseRotAmtIdx = -1;
29808 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
29809 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
29810 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
29811 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
29813 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
29814 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
29815 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
29816 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
29817 BaseRotAmtIdx, Subtarget, DAG);
29818 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
29819 BaseRotAmtIdx, Subtarget, DAG);
29820 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
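// Worked example (values chosen for illustration): an i8 lane 0xAB rotated
// left by 3. Unpacking the lane with itself forms the 16-bit value 0xABAB;
// shifting that left by 3 gives 0x5D58, and its upper byte 0x5D is exactly
// rotl(0xAB, 3), which is what the pack step extracts.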
29824 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29825 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
29827 // Attempt to fold as unpack(x,x) << zext(y):
29828 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
29829 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
29830 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
29831 if (!(ConstantAmt && EltSizeInBits != 8) &&
29832 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
29833 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
29834 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
29835 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
29836 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
29837 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
29838 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
29839 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
29840 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
29843 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
29844 // the amount bit.
29845 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
29846 if (EltSizeInBits == 8) {
29847 MVT WideVT =
29848 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
29850 // Attempt to fold as:
29851 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
29852 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
29853 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
29854 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
29855 // If we're rotating by constant, just use default promotion.
29856 if (ConstantAmt)
29857 return SDValue();
29858 // See if we can perform this by widening to vXi16 or vXi32.
29859 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
29860 R = DAG.getNode(
29861 ISD::OR, DL, WideVT, R,
29862 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
29863 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
29864 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
29865 if (IsROTL)
29866 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
29867 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
29870 // We don't need ModuloAmt here as we just peek at individual bits.
29871 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29872 if (Subtarget.hasSSE41()) {
29873 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29874 // on the sign bit.
29875 V0 = DAG.getBitcast(VT, V0);
29876 V1 = DAG.getBitcast(VT, V1);
29877 Sel = DAG.getBitcast(VT, Sel);
29878 return DAG.getBitcast(SelVT,
29879 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
29881 // On pre-SSE41 targets we test for the sign bit by comparing to
29882 // zero - a negative value will set all bits of the lanes to true
29883 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29884 SDValue Z = DAG.getConstant(0, DL, SelVT);
29885 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
29886 return DAG.getSelect(DL, SelVT, C, V0, V1);
29889 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
29890 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
29891 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
29892 IsROTL = true;
29895 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
29896 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
29898 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29899 // We can safely do this using i16 shifts as we're only interested in
29900 // the 3 lower bits of each byte.
29901 Amt = DAG.getBitcast(ExtVT, Amt);
29902 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
29903 Amt = DAG.getBitcast(VT, Amt);
29905 // r = VSELECT(r, rot(r, 4), a);
29906 SDValue M;
29907 M = DAG.getNode(
29908 ISD::OR, DL, VT,
29909 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
29910 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
29911 R = SignBitSelect(VT, Amt, M, R);
29913 // a += a
29914 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29916 // r = VSELECT(r, rot(r, 2), a);
29917 M = DAG.getNode(
29918 ISD::OR, DL, VT,
29919 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
29920 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
29921 R = SignBitSelect(VT, Amt, M, R);
29923 // a += a
29924 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29926 // return VSELECT(r, rot(r, 1), a);
29927 M = DAG.getNode(
29928 ISD::OR, DL, VT,
29929 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
29930 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
29931 return SignBitSelect(VT, Amt, M, R);
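// Worked example (illustrative): for a byte rotate amount of 5 (0b101), the
// first select sees amount bit 2 in the sign position (after the << 5 above)
// and applies the rotate-by-4; doubling the mask exposes bit 1, which is clear,
// so the rotate-by-2 is skipped; doubling again exposes bit 0, which is set, so
// the rotate-by-1 is applied, for a total rotate of 4 + 1 = 5.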
29934 bool IsSplatAmt = DAG.isSplatValue(Amt);
29935 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
29936 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
29938 // Fallback for splats + all supported variable shifts.
29939 // Fallback for non-constant AVX2 vXi16 as well.
29940 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
29941 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
29942 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
29943 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
29944 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
29945 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
29946 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
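// Worked example (illustrative): for a v4i32 rotl with a masked splat amount
// of 7 this emits (R << 7) | (R >> 25), the usual rotate-as-two-shifts
// expansion; for rotr the two shift directions are swapped.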
29949 // Everything below assumes ISD::ROTL.
29950 if (!IsROTL) {
29951 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
29952 IsROTL = true;
29955 // ISD::ROT* uses modulo rotate amounts.
29956 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
29958 assert(IsROTL && "Only ROTL supported");
29960 // As with shifts, attempt to convert the rotation amount to a multiplication
29961 // factor; otherwise fall back to general expansion.
29962 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
29963 if (!Scale)
29964 return SDValue();
29966 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
29967 if (EltSizeInBits == 16) {
29968 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
29969 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
29970 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
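// Worked example (illustrative): rotating a v8i16 lane holding 0x8001 left by
// 1 uses Scale = 2: MUL yields the low half 0x0002, MULHU yields the wrapped
// high bit 0x0001, and the OR gives 0x0003 == rotl16(0x8001, 1).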
29973 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
29974 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
29975 // that can then be OR'd with the lower 32-bits.
29976 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
29977 static const int OddMask[] = {1, -1, 3, -1};
29978 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
29979 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
29981 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29982 DAG.getBitcast(MVT::v2i64, R),
29983 DAG.getBitcast(MVT::v2i64, Scale));
29984 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29985 DAG.getBitcast(MVT::v2i64, R13),
29986 DAG.getBitcast(MVT::v2i64, Scale13));
29987 Res02 = DAG.getBitcast(VT, Res02);
29988 Res13 = DAG.getBitcast(VT, Res13);
29990 return DAG.getNode(ISD::OR, DL, VT,
29991 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
29992 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
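// Worked example for the v4i32 lowering above (illustrative): rotating a lane
// holding 0x80000001 left by 1 (Scale = 2) makes PMULUDQ produce the 64-bit
// product 0x0000000100000002; the {0,4,2,6} shuffle gathers the low halves
// (x << 1), the {1,5,3,7} shuffle gathers the high halves (x >> 31), and the
// OR gives 0x00000003 == rotl32(0x80000001, 1).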
29995 /// Returns true if the operand type is exactly twice the native width, and
29996 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
29997 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
29998 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
29999 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
30000 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
30002 if (OpWidth == 64)
30003 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
30004 if (OpWidth == 128)
30005 return Subtarget.canUseCMPXCHG16B();
30007 return false;
30010 TargetLoweringBase::AtomicExpansionKind
30011 X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
30012 Type *MemType = SI->getValueOperand()->getType();
30014 bool NoImplicitFloatOps =
30015 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30016 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30017 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30018 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30019 return AtomicExpansionKind::None;
30021 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
30022 : AtomicExpansionKind::None;
30025 // Note: this turns large loads into lock cmpxchg8b/16b.
30026 // TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
30027 TargetLowering::AtomicExpansionKind
30028 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
30029 Type *MemType = LI->getType();
30031 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
30032 // can use movq to do the load. If we have X87 we can load into an 80-bit
30033 // X87 register and store it to a stack temporary.
30034 bool NoImplicitFloatOps =
30035 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30036 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30037 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30038 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30039 return AtomicExpansionKind::None;
30041 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30042 : AtomicExpansionKind::None;
30045 enum BitTestKind : unsigned {
30046 UndefBit,
30047 ConstantBit,
30048 NotConstantBit,
30049 ShiftBit,
30050 NotShiftBit
30053 static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
30054 using namespace llvm::PatternMatch;
30055 BitTestKind BTK = UndefBit;
30056 auto *C = dyn_cast<ConstantInt>(V);
30057 if (C) {
30058 // Check if V is a power of 2 or the NOT of a power of 2.
30059 if (isPowerOf2_64(C->getZExtValue()))
30060 BTK = ConstantBit;
30061 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
30062 BTK = NotConstantBit;
30063 return {V, BTK};
30066 // Check if V is some power of 2 pattern known to be non-zero
30067 auto *I = dyn_cast<Instruction>(V);
30068 if (I) {
30069 bool Not = false;
30070 // Check if we have a NOT
30071 Value *PeekI;
30072 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
30073 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
30074 Not = true;
30075 I = dyn_cast<Instruction>(PeekI);
30077 // If I is constant, it will fold and we can evaluate later. If it's an
30078 // argument or something of that nature, we can't analyze.
30079 if (I == nullptr)
30080 return {nullptr, UndefBit};
30082 // We can only use 1 << X without more sophisticated analysis. C << X where
30083 // C is a power of 2 but not 1 can result in zero which cannot be translated
30084 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
30085 if (I->getOpcode() == Instruction::Shl) {
30086 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
30087 // -X` and some other provable power of 2 patterns that we can use CTZ on
30088 // may be profitable.
30089 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
30090 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
30091 // be provably a non-zero power of 2.
30092 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
30093 // transformable to bittest.
30094 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
30095 if (!ShiftVal)
30096 return {nullptr, UndefBit};
30097 if (ShiftVal->equalsInt(1))
30098 BTK = Not ? NotShiftBit : ShiftBit;
30100 if (BTK == UndefBit)
30101 return {nullptr, UndefBit};
30103 Value *BitV = I->getOperand(1);
30105 Value *AndOp;
30106 const APInt *AndC;
30107 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
30108 // Read past a shift-mask instruction to find the count.
30109 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
30110 BitV = AndOp;
30112 return {BitV, BTK};
30115 return {nullptr, UndefBit};
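// Illustrative classifications returned by FindSingleBitChange above
// (hypothetical values, for exposition only):
//   16                  -> {16, ConstantBit}
//   ~16                 -> {~16, NotConstantBit}
//   shl 1, %n           -> {%n, ShiftBit}
//   xor (shl 1, %n), -1 -> {%n, NotShiftBit}
// A shift amount of the form "and %m, bitwidth-1" is looked through to %m.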
30118 TargetLowering::AtomicExpansionKind
30119 X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
30120 using namespace llvm::PatternMatch;
30121 // If the atomicrmw's result isn't actually used, we can just add a "lock"
30122 // prefix to a normal instruction for these operations.
30123 if (AI->use_empty())
30124 return AtomicExpansionKind::None;
30126 if (AI->getOperation() == AtomicRMWInst::Xor) {
30127 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
30128 // preferable to both `cmpxchg` and `btc`.
30129 if (match(AI->getOperand(1), m_SignMask()))
30130 return AtomicExpansionKind::None;
30133 // If the atomicrmw's result is used by a single bit AND, we may use
30134 // bts/btr/btc instruction for these operations.
30135 // Note: InstCombinePass can cause a de-optimization here. It replaces the
30136 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
30137 // (depending on CC). This pattern can only use bts/btr/btc but we don't
30138 // detect it.
30139 Instruction *I = AI->user_back();
30140 auto BitChange = FindSingleBitChange(AI->getValOperand());
30141 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
30142 I->getOpcode() != Instruction::And ||
30143 AI->getType()->getPrimitiveSizeInBits() == 8 ||
30144 AI->getParent() != I->getParent())
30145 return AtomicExpansionKind::CmpXChg;
30147 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
30149 // This is a redundant AND; it should get cleaned up elsewhere.
30150 if (AI == I->getOperand(OtherIdx))
30151 return AtomicExpansionKind::CmpXChg;
30153 // The following instruction must be an AND with a single bit.
30154 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
30155 auto *C1 = cast<ConstantInt>(AI->getValOperand());
30156 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
30157 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
30158 return AtomicExpansionKind::CmpXChg;
30160 if (AI->getOperation() == AtomicRMWInst::And) {
30161 return ~C1->getValue() == C2->getValue()
30162 ? AtomicExpansionKind::BitTestIntrinsic
30163 : AtomicExpansionKind::CmpXChg;
30165 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
30166 : AtomicExpansionKind::CmpXChg;
30169 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
30171 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
30172 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
30173 return AtomicExpansionKind::CmpXChg;
30175 assert(BitChange.first != nullptr && BitTested.first != nullptr);
30177 // If shift amounts are not the same we can't use BitTestIntrinsic.
30178 if (BitChange.first != BitTested.first)
30179 return AtomicExpansionKind::CmpXChg;
30181 // For atomic AND, the RMW mask must clear exactly one bit and the AND must
30182 // test the bit that is unset in that mask.
30183 if (AI->getOperation() == AtomicRMWInst::And)
30184 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
30185 ? AtomicExpansionKind::BitTestIntrinsic
30186 : AtomicExpansionKind::CmpXChg;
30188 // For atomic XOR/OR, the RMW and the AND must set and test the same bit.
30189 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
30190 ? AtomicExpansionKind::BitTestIntrinsic
30191 : AtomicExpansionKind::CmpXChg;
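// Illustrative IR shape (hypothetical, for exposition only) that reaches
// BitTestIntrinsic in the function above:
//   %mask = shl i32 1, %n
//   %old  = atomicrmw or ptr %p, i32 %mask seq_cst
//   %bit  = and i32 %old, %mask
// The changed and tested bits come from the same shift amount %n, so this can
// be emitted as a single locked bit-test-and-set below.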
30194 void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
30195 IRBuilder<> Builder(AI);
30196 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30197 Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
30198 Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
30199 switch (AI->getOperation()) {
30200 default:
30201 llvm_unreachable("Unknown atomic operation");
30202 case AtomicRMWInst::Or:
30203 IID_C = Intrinsic::x86_atomic_bts;
30204 IID_I = Intrinsic::x86_atomic_bts_rm;
30205 break;
30206 case AtomicRMWInst::Xor:
30207 IID_C = Intrinsic::x86_atomic_btc;
30208 IID_I = Intrinsic::x86_atomic_btc_rm;
30209 break;
30210 case AtomicRMWInst::And:
30211 IID_C = Intrinsic::x86_atomic_btr;
30212 IID_I = Intrinsic::x86_atomic_btr_rm;
30213 break;
30215 Instruction *I = AI->user_back();
30216 LLVMContext &Ctx = AI->getContext();
30217 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30218 PointerType::getUnqual(Ctx));
30219 Function *BitTest = nullptr;
30220 Value *Result = nullptr;
30221 auto BitTested = FindSingleBitChange(AI->getValOperand());
30222 assert(BitTested.first != nullptr);
30224 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
30225 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
30227 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
30229 unsigned Imm = llvm::countr_zero(C->getZExtValue());
30230 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
30231 } else {
30232 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
30234 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
30236 Value *SI = BitTested.first;
30237 assert(SI != nullptr);
30239 // BT{S|R|C} on a memory operand doesn't take the bit position modulo the
30240 // operand width, so we need to mask it.
30241 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
30242 Value *BitPos =
30243 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
30244 // Todo(1): In many cases it may be provable that SI is less than
30245 // ShiftBits, in which case this mask is unnecessary.
30246 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
30247 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
30248 // favor of just a raw BT{S|R|C}.
30250 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
30251 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
30253 // If the result is only used for zero/non-zero status then we don't need to
30254 // shift the value back. Otherwise do so.
30255 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
30256 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
30257 if (ICmp->isEquality()) {
30258 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
30259 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
30260 if (C0 || C1) {
30261 assert(C0 == nullptr || C1 == nullptr);
30262 if ((C0 ? C0 : C1)->isZero())
30263 continue;
30267 Result = Builder.CreateShl(Result, BitPos);
30268 break;
30272 I->replaceAllUsesWith(Result);
30273 I->eraseFromParent();
30274 AI->eraseFromParent();
30277 static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
30278 using namespace llvm::PatternMatch;
30279 if (!AI->hasOneUse())
30280 return false;
30282 Value *Op = AI->getOperand(1);
30283 ICmpInst::Predicate Pred;
30284 Instruction *I = AI->user_back();
30285 AtomicRMWInst::BinOp Opc = AI->getOperation();
30286 if (Opc == AtomicRMWInst::Add) {
30287 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
30288 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30289 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
30290 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30291 return Pred == CmpInst::ICMP_SLT;
30292 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30293 return Pred == CmpInst::ICMP_SGT;
30295 return false;
30297 if (Opc == AtomicRMWInst::Sub) {
30298 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30299 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30300 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
30301 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30302 return Pred == CmpInst::ICMP_SLT;
30303 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30304 return Pred == CmpInst::ICMP_SGT;
30306 return false;
30308 if ((Opc == AtomicRMWInst::Or &&
30309 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
30310 (Opc == AtomicRMWInst::And &&
30311 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
30312 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30313 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
30314 Pred == CmpInst::ICMP_SLT;
30315 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30316 return Pred == CmpInst::ICMP_SGT;
30317 return false;
30319 if (Opc == AtomicRMWInst::Xor) {
30320 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30321 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30322 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
30323 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30324 return Pred == CmpInst::ICMP_SLT;
30325 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30326 return Pred == CmpInst::ICMP_SGT;
30328 return false;
30331 return false;
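// Illustrative pattern (hypothetical, for exposition only) accepted above:
//   %old = atomicrmw sub ptr %p, i32 %v seq_cst
//   %cmp = icmp eq i32 %old, %v        ; i.e. the new value is zero
// The comparison only needs the flags of the locked sub, so it is lowered via
// the cmp-arith intrinsic path below rather than a cmpxchg loop.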
30334 void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
30335 AtomicRMWInst *AI) const {
30336 IRBuilder<> Builder(AI);
30337 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30338 Instruction *TempI = nullptr;
30339 LLVMContext &Ctx = AI->getContext();
30340 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
30341 if (!ICI) {
30342 TempI = AI->user_back();
30343 assert(TempI->hasOneUse() && "Must have one use");
30344 ICI = cast<ICmpInst>(TempI->user_back());
30346 X86::CondCode CC = X86::COND_INVALID;
30347 ICmpInst::Predicate Pred = ICI->getPredicate();
30348 switch (Pred) {
30349 default:
30350 llvm_unreachable("Not supported Pred");
30351 case CmpInst::ICMP_EQ:
30352 CC = X86::COND_E;
30353 break;
30354 case CmpInst::ICMP_NE:
30355 CC = X86::COND_NE;
30356 break;
30357 case CmpInst::ICMP_SLT:
30358 CC = X86::COND_S;
30359 break;
30360 case CmpInst::ICMP_SGT:
30361 CC = X86::COND_NS;
30362 break;
30364 Intrinsic::ID IID = Intrinsic::not_intrinsic;
30365 switch (AI->getOperation()) {
30366 default:
30367 llvm_unreachable("Unknown atomic operation");
30368 case AtomicRMWInst::Add:
30369 IID = Intrinsic::x86_atomic_add_cc;
30370 break;
30371 case AtomicRMWInst::Sub:
30372 IID = Intrinsic::x86_atomic_sub_cc;
30373 break;
30374 case AtomicRMWInst::Or:
30375 IID = Intrinsic::x86_atomic_or_cc;
30376 break;
30377 case AtomicRMWInst::And:
30378 IID = Intrinsic::x86_atomic_and_cc;
30379 break;
30380 case AtomicRMWInst::Xor:
30381 IID = Intrinsic::x86_atomic_xor_cc;
30382 break;
30384 Function *CmpArith =
30385 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
30386 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30387 PointerType::getUnqual(Ctx));
30388 Value *Call = Builder.CreateCall(
30389 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
30390 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
30391 ICI->replaceAllUsesWith(Result);
30392 ICI->eraseFromParent();
30393 if (TempI)
30394 TempI->eraseFromParent();
30395 AI->eraseFromParent();
30398 TargetLowering::AtomicExpansionKind
30399 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
30400 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30401 Type *MemType = AI->getType();
30403 // If the operand is too big, we must see if cmpxchg8/16b is available
30404 // and default to library calls otherwise.
30405 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
30406 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30407 : AtomicExpansionKind::None;
30410 AtomicRMWInst::BinOp Op = AI->getOperation();
30411 switch (Op) {
30412 case AtomicRMWInst::Xchg:
30413 return AtomicExpansionKind::None;
30414 case AtomicRMWInst::Add:
30415 case AtomicRMWInst::Sub:
30416 if (shouldExpandCmpArithRMWInIR(AI))
30417 return AtomicExpansionKind::CmpArithIntrinsic;
30418 // It's better to use xadd, xsub or xchg for these in other cases.
30419 return AtomicExpansionKind::None;
30420 case AtomicRMWInst::Or:
30421 case AtomicRMWInst::And:
30422 case AtomicRMWInst::Xor:
30423 if (shouldExpandCmpArithRMWInIR(AI))
30424 return AtomicExpansionKind::CmpArithIntrinsic;
30425 return shouldExpandLogicAtomicRMWInIR(AI);
30426 case AtomicRMWInst::Nand:
30427 case AtomicRMWInst::Max:
30428 case AtomicRMWInst::Min:
30429 case AtomicRMWInst::UMax:
30430 case AtomicRMWInst::UMin:
30431 case AtomicRMWInst::FAdd:
30432 case AtomicRMWInst::FSub:
30433 case AtomicRMWInst::FMax:
30434 case AtomicRMWInst::FMin:
30435 case AtomicRMWInst::UIncWrap:
30436 case AtomicRMWInst::UDecWrap:
30437 default:
30438 // These always require a non-trivial set of data operations on x86. We must
30439 // use a cmpxchg loop.
30440 return AtomicExpansionKind::CmpXChg;
30444 LoadInst *
30445 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
30446 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30447 Type *MemType = AI->getType();
30448 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
30449 // there is no benefit in turning such RMWs into loads, and it is actually
30450 // harmful as it introduces an mfence.
30451 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
30452 return nullptr;
30454 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
30455 // lowering available in lowerAtomicArith.
30456 // TODO: push more cases through this path.
30457 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
30458 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
30459 AI->use_empty())
30460 return nullptr;
30462 IRBuilder<> Builder(AI);
30463 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30464 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
30465 auto SSID = AI->getSyncScopeID();
30466 // We must restrict the ordering to avoid generating loads with Release or
30467 // ReleaseAcquire orderings.
30468 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
30470 // Before the load we need a fence. Here is an example lifted from
30471 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
30472 // is required:
30473 // Thread 0:
30474 // x.store(1, relaxed);
30475 // r1 = y.fetch_add(0, release);
30476 // Thread 1:
30477 // y.fetch_add(42, acquire);
30478 // r2 = x.load(relaxed);
30479 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
30480 // lowered to just a load without a fence. A mfence flushes the store buffer,
30481 // making the optimization clearly correct.
30482 // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not
30483 // clear whether it is needed otherwise; we might be able to be more
30484 // aggressive on relaxed idempotent rmw. In practice, such cases do not look
30485 // useful, so we don't try to be especially clever.
30486 if (SSID == SyncScope::SingleThread)
30487 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
30488 // the IR level, so we must wrap it in an intrinsic.
30489 return nullptr;
30491 if (!Subtarget.hasMFence())
30492 // FIXME: it might make sense to use a locked operation here but on a
30493 // different cache-line to prevent cache-line bouncing. In practice it
30494 // is probably a small win, and x86 processors without mfence are rare
30495 // enough that we do not bother.
30496 return nullptr;
30498 Function *MFence =
30499 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
30500 Builder.CreateCall(MFence, {});
30502 // Finally we can emit the atomic load.
30503 LoadInst *Loaded = Builder.CreateAlignedLoad(
30504 AI->getType(), AI->getPointerOperand(), AI->getAlign());
30505 Loaded->setAtomic(Order, SSID);
30506 AI->replaceAllUsesWith(Loaded);
30507 AI->eraseFromParent();
30508 return Loaded;
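// Illustrative result of the lowering above (hypothetical, for exposition
// only): an idempotent RMW whose value is used, e.g.
//   %old = atomicrmw add ptr %p, i32 0 acquire
// becomes an mfence followed by a plain atomic load of %p, avoiding a locked
// read-modify-write while keeping the required ordering.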
30511 bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
30512 if (!SI.isUnordered())
30513 return false;
30514 return ExperimentalUnorderedISEL;
30516 bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
30517 if (!LI.isUnordered())
30518 return false;
30519 return ExperimentalUnorderedISEL;
30523 /// Emit a locked operation on a stack location which does not change any
30524 /// memory location, but does involve a lock prefix. Location is chosen to be
30525 /// a) very likely accessed only by a single thread to minimize cache traffic,
30526 /// and b) definitely dereferenceable. Returns the new Chain result.
30527 static SDValue emitLockedStackOp(SelectionDAG &DAG,
30528 const X86Subtarget &Subtarget, SDValue Chain,
30529 const SDLoc &DL) {
30530 // Implementation notes:
30531 // 1) LOCK prefix creates a full read/write reordering barrier for memory
30532 // operations issued by the current processor. As such, the location
30533 // referenced is not relevant for the ordering properties of the instruction.
30534 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
30535 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
30536 // 2) Using an immediate operand appears to be the best encoding choice
30537 // here since it doesn't require an extra register.
30538 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
30539 // is small enough it might just be measurement noise.)
30540 // 4) When choosing offsets, there are several contributing factors:
30541 // a) If there's no redzone, we default to TOS. (We could allocate a cache
30542 // line aligned stack object to improve this case.)
30543 // b) To minimize our chances of introducing a false dependence, we prefer
30544 // to offset the stack usage from TOS slightly.
30545 // c) To minimize concerns about cross thread stack usage - in particular,
30546 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
30547 // captures state in the TOS frame and accesses it from many threads -
30548 // we want to use an offset such that the offset is in a distinct cache
30549 // line from the TOS frame.
30551 // For a general discussion of the tradeoffs and benchmark results, see:
30552 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
30554 auto &MF = DAG.getMachineFunction();
30555 auto &TFL = *Subtarget.getFrameLowering();
30556 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
30558 if (Subtarget.is64Bit()) {
30559 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30560 SDValue Ops[] = {
30561 DAG.getRegister(X86::RSP, MVT::i64), // Base
30562 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30563 DAG.getRegister(0, MVT::i64), // Index
30564 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30565 DAG.getRegister(0, MVT::i16), // Segment.
30566 Zero,
30567 Chain};
30568 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30569 MVT::Other, Ops);
30570 return SDValue(Res, 1);
30573 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30574 SDValue Ops[] = {
30575 DAG.getRegister(X86::ESP, MVT::i32), // Base
30576 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30577 DAG.getRegister(0, MVT::i32), // Index
30578 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30579 DAG.getRegister(0, MVT::i16), // Segment.
30580 Zero,
30581 Chain
30583 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30584 MVT::Other, Ops);
30585 return SDValue(Res, 1);
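// Illustratively (exact encoding depends on the target), the node built above
// corresponds to something like
//   lock orl $0, -64(%rsp)   ; 64-bit with a 128-byte red zone
//   lock orl $0, (%esp)      ; 32-bit, no red zone
// i.e. an idempotent locked OR used purely as a full memory fence.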
30588 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
30589 SelectionDAG &DAG) {
30590 SDLoc dl(Op);
30591 AtomicOrdering FenceOrdering =
30592 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
30593 SyncScope::ID FenceSSID =
30594 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
30596 // The only fence that needs an instruction is a sequentially-consistent
30597 // cross-thread fence.
30598 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
30599 FenceSSID == SyncScope::System) {
30600 if (Subtarget.hasMFence())
30601 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
30603 SDValue Chain = Op.getOperand(0);
30604 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
30607 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
30608 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
30611 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
30612 SelectionDAG &DAG) {
30613 MVT T = Op.getSimpleValueType();
30614 SDLoc DL(Op);
30615 unsigned Reg = 0;
30616 unsigned size = 0;
30617 switch(T.SimpleTy) {
30618 default: llvm_unreachable("Invalid value type!");
30619 case MVT::i8: Reg = X86::AL; size = 1; break;
30620 case MVT::i16: Reg = X86::AX; size = 2; break;
30621 case MVT::i32: Reg = X86::EAX; size = 4; break;
30622 case MVT::i64:
30623 assert(Subtarget.is64Bit() && "Node not type legal!");
30624 Reg = X86::RAX; size = 8;
30625 break;
30627 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
30628 Op.getOperand(2), SDValue());
30629 SDValue Ops[] = { cpIn.getValue(0),
30630 Op.getOperand(1),
30631 Op.getOperand(3),
30632 DAG.getTargetConstant(size, DL, MVT::i8),
30633 cpIn.getValue(1) };
30634 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
30635 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
30636 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
30637 Ops, T, MMO);
30639 SDValue cpOut =
30640 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
30641 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
30642 MVT::i32, cpOut.getValue(2));
30643 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
30645 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
30646 cpOut, Success, EFLAGS.getValue(1));
30649 // Create MOVMSKB, taking into account whether we need to split for AVX1.
30650 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
30651 const X86Subtarget &Subtarget) {
30652 MVT InVT = V.getSimpleValueType();
30654 if (InVT == MVT::v64i8) {
30655 SDValue Lo, Hi;
30656 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30657 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
30658 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
30659 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
30660 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
30661 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
30662 DAG.getConstant(32, DL, MVT::i8));
30663 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
30665 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
30666 SDValue Lo, Hi;
30667 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30668 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
30669 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
30670 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
30671 DAG.getConstant(16, DL, MVT::i8));
30672 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
30675 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
30678 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
30679 SelectionDAG &DAG) {
30680 SDValue Src = Op.getOperand(0);
30681 MVT SrcVT = Src.getSimpleValueType();
30682 MVT DstVT = Op.getSimpleValueType();
30684 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
30685 // half to v32i1 and concatenating the result.
30686 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
30687 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
30688 assert(Subtarget.hasBWI() && "Expected BWI target");
30689 SDLoc dl(Op);
30690 SDValue Lo, Hi;
30691 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
30692 Lo = DAG.getBitcast(MVT::v32i1, Lo);
30693 Hi = DAG.getBitcast(MVT::v32i1, Hi);
30694 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
30697 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
30698 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
30699 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
30700 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
30701 SDLoc DL(Op);
30702 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
30703 V = getPMOVMSKB(DL, V, DAG, Subtarget);
30704 return DAG.getZExtOrTrunc(V, DL, DstVT);
30707 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
30708 SrcVT == MVT::i64) && "Unexpected VT!");
30710 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30711 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
30712 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
30713 // This conversion needs to be expanded.
30714 return SDValue();
30716 SDLoc dl(Op);
30717 if (SrcVT.isVector()) {
30718 // Widen the input vector in the case of MVT::v2i32.
30719 // Example: from MVT::v2i32 to MVT::v4i32.
30720 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
30721 SrcVT.getVectorNumElements() * 2);
30722 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
30723 DAG.getUNDEF(SrcVT));
30724 } else {
30725 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
30726 "Unexpected source type in LowerBITCAST");
30727 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
30730 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
30731 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
30733 if (DstVT == MVT::x86mmx)
30734 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
30736 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
30737 DAG.getIntPtrConstant(0, dl));
30740 /// Compute the horizontal sum of bytes in V for the elements of VT.
30742 /// Requires V to be a byte vector and VT to be an integer vector type with
30743 /// wider elements than V's type. The width of the elements of VT determines
30744 /// how many bytes of V are summed horizontally to produce each element of the
30745 /// result.
30746 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
30747 const X86Subtarget &Subtarget,
30748 SelectionDAG &DAG) {
30749 SDLoc DL(V);
30750 MVT ByteVecVT = V.getSimpleValueType();
30751 MVT EltVT = VT.getVectorElementType();
30752 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
30753 "Expected value to have byte element type.");
30754 assert(EltVT != MVT::i8 &&
30755 "Horizontal byte sum only makes sense for wider elements!");
30756 unsigned VecSize = VT.getSizeInBits();
30757 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
30759 // The PSADBW instruction horizontally adds all bytes and leaves the result in
30760 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
30761 if (EltVT == MVT::i64) {
30762 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
30763 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30764 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
30765 return DAG.getBitcast(VT, V);
30768 if (EltVT == MVT::i32) {
30769 // We unpack the low half and high half into i32s interleaved with zeros so
30770 // that we can use PSADBW to horizontally sum them. The most useful part of
30771 // this is that it lines up the results of two PSADBW instructions to be
30772 // two v2i64 vectors which concatenated are the 4 population counts. We can
30773 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
30774 SDValue Zeros = DAG.getConstant(0, DL, VT);
30775 SDValue V32 = DAG.getBitcast(VT, V);
30776 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
30777 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
30779 // Do the horizontal sums into two v2i64s.
30780 Zeros = DAG.getConstant(0, DL, ByteVecVT);
30781 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30782 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30783 DAG.getBitcast(ByteVecVT, Low), Zeros);
30784 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30785 DAG.getBitcast(ByteVecVT, High), Zeros);
30787 // Merge them together.
30788 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
30789 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
30790 DAG.getBitcast(ShortVecVT, Low),
30791 DAG.getBitcast(ShortVecVT, High));
30793 return DAG.getBitcast(VT, V);
30796 // The only element type left is i16.
30797 assert(EltVT == MVT::i16 && "Unknown how to handle type");
30799 // To obtain pop count for each i16 element starting from the pop count for
30800 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
30801 // right by 8. It is important to shift as i16s as i8 vector shift isn't
30802 // directly supported.
30803 SDValue ShifterV = DAG.getConstant(8, DL, VT);
30804 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
30805 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
30806 DAG.getBitcast(ByteVecVT, V));
30807 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
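// Worked example for the i16 path above (illustrative): if a 16-bit lane holds
// the per-byte counts 0x0302 (3 bits set in the high byte, 2 in the low byte),
// the SHL by 8 gives 0x0200, the byte-wise ADD gives 0x0502, and the final SRL
// by 8 leaves 0x0005, the 5-bit total for that lane.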
30810 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
30811 const X86Subtarget &Subtarget,
30812 SelectionDAG &DAG) {
30813 MVT VT = Op.getSimpleValueType();
30814 MVT EltVT = VT.getVectorElementType();
30815 int NumElts = VT.getVectorNumElements();
30816 (void)EltVT;
30817 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
30819 // Implement a lookup table in register by using an algorithm based on:
30820 // http://wm.ite.pl/articles/sse-popcount.html
30822 // The general idea is that each nibble of every byte in the input vector is an
30823 // index into an in-register pre-computed pop count table. We then split up the
30824 // input vector into two new ones: (1) a vector with only the shifted-right
30825 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
30826 // masked out higher ones) for each byte. PSHUFB is used separately with both
30827 // to index the in-register table. Next, both are added and the result is an
30828 // i8 vector where each element contains the pop count for its input byte.
30829 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
30830 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
30831 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
30832 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
30834 SmallVector<SDValue, 64> LUTVec;
30835 for (int i = 0; i < NumElts; ++i)
30836 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
30837 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
30838 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
30840 // High nibbles
30841 SDValue FourV = DAG.getConstant(4, DL, VT);
30842 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
30844 // Low nibbles
30845 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
30847 // The input vector is used as the shuffle mask that index elements into the
30848 // LUT. After counting low and high nibbles, add the vector to obtain the
30849 // final pop count per i8 element.
30850 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
30851 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
30852 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
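// Worked example (illustrative): for the input byte 0xB4, the high nibble 0xB
// indexes LUT[11] = 3 and the low nibble 0x4 indexes LUT[4] = 1 via the two
// PSHUFBs, and the ADD yields 4 == popcount(0xB4).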
30855 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
30856 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
30857 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
30858 SelectionDAG &DAG) {
30859 MVT VT = Op.getSimpleValueType();
30860 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
30861 "Unknown CTPOP type to handle");
30862 SDLoc DL(Op.getNode());
30863 SDValue Op0 = Op.getOperand(0);
30865 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
30866 if (Subtarget.hasVPOPCNTDQ()) {
30867 unsigned NumElems = VT.getVectorNumElements();
30868 assert((VT.getVectorElementType() == MVT::i8 ||
30869 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
30870 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
30871 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
30872 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
30873 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
30874 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
30878 // Decompose 256-bit ops into smaller 128-bit ops.
30879 if (VT.is256BitVector() && !Subtarget.hasInt256())
30880 return splitVectorIntUnary(Op, DAG);
30882 // Decompose 512-bit ops into smaller 256-bit ops.
30883 if (VT.is512BitVector() && !Subtarget.hasBWI())
30884 return splitVectorIntUnary(Op, DAG);
30886 // For element types greater than i8, do vXi8 pop counts and a bytesum.
30887 if (VT.getScalarType() != MVT::i8) {
30888 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
30889 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
30890 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
30891 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
30894 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
30895 if (!Subtarget.hasSSSE3())
30896 return SDValue();
30898 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
30901 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
30902 SelectionDAG &DAG) {
30903 assert(Op.getSimpleValueType().isVector() &&
30904 "We only do custom lowering for vector population count.");
30905 return LowerVectorCTPOP(Op, Subtarget, DAG);
30908 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
30909 MVT VT = Op.getSimpleValueType();
30910 SDValue In = Op.getOperand(0);
30911 SDLoc DL(Op);
30913 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
30914 // perform the BITREVERSE.
30915 if (!VT.isVector()) {
30916 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
30917 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
30918 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
30919 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
30920 DAG.getIntPtrConstant(0, DL));
30923 int NumElts = VT.getVectorNumElements();
30924 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
30926 // Decompose 256-bit ops into smaller 128-bit ops.
30927 if (VT.is256BitVector())
30928 return splitVectorIntUnary(Op, DAG);
30930 assert(VT.is128BitVector() &&
30931 "Only 128-bit vector bitreverse lowering supported.");
30933 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
30934 // perform the BSWAP in the shuffle.
30935 // It's best to shuffle using the second operand as this will implicitly allow
30936 // memory folding for multiple vectors.
30937 SmallVector<SDValue, 16> MaskElts;
30938 for (int i = 0; i != NumElts; ++i) {
30939 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
30940 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
30941 int PermuteByte = SourceByte | (2 << 5);
30942 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
30946 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
30947 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
30948 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
30949 Res, Mask);
30950 return DAG.getBitcast(VT, Res);
30953 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
30954 SelectionDAG &DAG) {
30955 MVT VT = Op.getSimpleValueType();
30957 if (Subtarget.hasXOP() && !VT.is512BitVector())
30958 return LowerBITREVERSE_XOP(Op, DAG);
30960 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
30962 SDValue In = Op.getOperand(0);
30963 SDLoc DL(Op);
30965 assert(VT.getScalarType() == MVT::i8 &&
30966 "Only byte vector BITREVERSE supported");
30968 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
30969 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
30970 return splitVectorIntUnary(Op, DAG);
30972 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
30973 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
30974 return splitVectorIntUnary(Op, DAG);
30976 unsigned NumElts = VT.getVectorNumElements();
30978 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
30979 if (Subtarget.hasGFNI()) {
30980 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
30981 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
30982 Matrix = DAG.getBitcast(VT, Matrix);
30983 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
30984 DAG.getTargetConstant(0, DL, MVT::i8));
30987 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
30988 // two nibbles and a PSHUFB lookup to find the bitreverse of each
30989 // 0-15 value (moved to the other nibble).
30990 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
30991 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
30992 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
30994 const int LoLUT[16] = {
30995 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
30996 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
30997 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
30998 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
30999 const int HiLUT[16] = {
31000 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
31001 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
31002 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
31003 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
31005 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
31006 for (unsigned i = 0; i < NumElts; ++i) {
31007 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
31008 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
31011 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
31012 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
31013 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
31014 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
31015 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
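// Worked example (illustrative): for the input byte 0x1E, Lo = 0x0E and
// Hi = 0x01; the lookups give LoLUT[0xE] = 0x70 (low nibble reversed into the
// high nibble) and HiLUT[0x1] = 0x08 (high nibble reversed into the low
// nibble), and the OR yields 0x78 == bitreverse(0x1E).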
31018 static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
31019 SelectionDAG &DAG) {
31020 SDLoc DL(Op);
31021 SDValue X = Op.getOperand(0);
31022 MVT VT = Op.getSimpleValueType();
31024 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
31025 if (VT == MVT::i8 ||
31026 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
31027 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31028 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
31029 DAG.getConstant(0, DL, MVT::i8));
31030 // Copy the inverse of the parity flag into a register with setcc.
31031 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31032 // Extend to the original type.
31033 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31036 // If we have POPCNT, use the default expansion.
31037 if (Subtarget.hasPOPCNT())
31038 return SDValue();
31040 if (VT == MVT::i64) {
31041 // Xor the high and low 32-bits together using a 32-bit operation.
31042 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
31043 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
31044 DAG.getConstant(32, DL, MVT::i8)));
31045 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
31046 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
31049 if (VT != MVT::i16) {
31050 // Xor the high and low 16-bits together using a 32-bit operation.
31051 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
31052 DAG.getConstant(16, DL, MVT::i8));
31053 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
31054 } else {
31055 // If the input is 16-bits, we need to extend to use an i32 shift below.
31056 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
31059 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
31060 // This should allow an h-reg to be used to save a shift.
31061 SDValue Hi = DAG.getNode(
31062 ISD::TRUNCATE, DL, MVT::i8,
31063 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
31064 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31065 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
31066 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
31068 // Copy the inverse of the parity flag into a register with setcc.
31069 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31070 // Extend to the original type.
31071 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
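// Worked example for the non-POPCNT path above (illustrative): an i16 input
// 0x0F01 is any-extended, the 8-bit flag-setting XOR computes
// 0x0F ^ 0x01 = 0x0E (three bits set), so PF is clear and SETNP returns 1,
// matching the odd parity of 0x0F01 (five bits set).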
31074 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
31075 const X86Subtarget &Subtarget) {
31076 unsigned NewOpc = 0;
31077 switch (N->getOpcode()) {
31078 case ISD::ATOMIC_LOAD_ADD:
31079 NewOpc = X86ISD::LADD;
31080 break;
31081 case ISD::ATOMIC_LOAD_SUB:
31082 NewOpc = X86ISD::LSUB;
31083 break;
31084 case ISD::ATOMIC_LOAD_OR:
31085 NewOpc = X86ISD::LOR;
31086 break;
31087 case ISD::ATOMIC_LOAD_XOR:
31088 NewOpc = X86ISD::LXOR;
31089 break;
31090 case ISD::ATOMIC_LOAD_AND:
31091 NewOpc = X86ISD::LAND;
31092 break;
31093 default:
31094 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
31097 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
31099 return DAG.getMemIntrinsicNode(
31100 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
31101 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
31102 /*MemVT=*/N->getSimpleValueType(0), MMO);
31105 /// Lower atomic_load_ops into LOCK-prefixed operations.
31106 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
31107 const X86Subtarget &Subtarget) {
31108 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
31109 SDValue Chain = N->getOperand(0);
31110 SDValue LHS = N->getOperand(1);
31111 SDValue RHS = N->getOperand(2);
31112 unsigned Opc = N->getOpcode();
31113 MVT VT = N->getSimpleValueType(0);
31114 SDLoc DL(N);
31116 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
31117 // can only be lowered when the result is unused. They should have already
31118 // been transformed into a cmpxchg loop in AtomicExpand.
31119 if (N->hasAnyUseOfValue(0)) {
31120 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
31121 // select LXADD if LOCK_SUB can't be selected.
31122 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
31123 // can use LXADD as opposed to cmpxchg.
31124 if (Opc == ISD::ATOMIC_LOAD_SUB ||
31125 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS))) {
31126 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
31127 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS,
31128 AN->getMemOperand());
31130 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
31131 "Used AtomicRMW ops other than Add should have been expanded!");
31132 return N;
31135 // Specialized lowering for the canonical form of an idempotent atomicrmw.
31136 // The core idea here is that since the memory location isn't actually
31137 // changing, all we need is a lowering for the *ordering* impacts of the
31138 // atomicrmw. As such, we can choose a different operation and memory
31139 // location to minimize impact on other code.
31140 // The above holds unless the node is marked volatile in which
31141 // case it needs to be preserved according to the langref.
31142 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
31143 // On X86, the only ordering which actually requires an instruction is
31144 // seq_cst that isn't SingleThread; everything else just needs to be preserved
31145 // during codegen and then dropped. Note that we expect (but don't assume)
31146 // that orderings other than seq_cst and acq_rel have been canonicalized to
31147 // a store or load.
31148 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
31149 AN->getSyncScopeID() == SyncScope::System) {
31150 // Prefer a locked operation against a stack location to minimize cache
31151 // traffic. This assumes that stack locations are very likely to be
31152 // accessed only by the owning thread.
31153 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
31154 assert(!N->hasAnyUseOfValue(0));
31155 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31156 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31157 DAG.getUNDEF(VT), NewChain);
31159 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31160 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
31161 assert(!N->hasAnyUseOfValue(0));
31162 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31163 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31164 DAG.getUNDEF(VT), NewChain);
31167 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
31168 // RAUW the chain, but don't worry about the result, as it's unused.
31169 assert(!N->hasAnyUseOfValue(0));
31170 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31171 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31172 DAG.getUNDEF(VT), LockOp.getValue(1));
31175 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
31176 const X86Subtarget &Subtarget) {
31177 auto *Node = cast<AtomicSDNode>(Op.getNode());
31178 SDLoc dl(Node);
31179 EVT VT = Node->getMemoryVT();
31181 bool IsSeqCst =
31182 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
31183 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
31185 // If this store is not sequentially consistent and the type is legal
31186 // we can just keep it.
31187 if (!IsSeqCst && IsTypeLegal)
31188 return Op;
31190 if (VT == MVT::i64 && !IsTypeLegal) {
31191 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
31192 // is enabled.
31193 bool NoImplicitFloatOps =
31194 DAG.getMachineFunction().getFunction().hasFnAttribute(
31195 Attribute::NoImplicitFloat);
31196 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31197 SDValue Chain;
31198 if (Subtarget.hasSSE1()) {
31199 SDValue SclToVec =
31200 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
31201 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31202 SclToVec = DAG.getBitcast(StVT, SclToVec);
31203 SDVTList Tys = DAG.getVTList(MVT::Other);
31204 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
31205 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
31206 MVT::i64, Node->getMemOperand());
31207 } else if (Subtarget.hasX87()) {
31208 // First load this into an 80-bit X87 register using a stack temporary.
31209 // This will put the whole integer into the significand.
31210 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31211 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31212 MachinePointerInfo MPI =
31213 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31214 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
31215 MPI, MaybeAlign(), MachineMemOperand::MOStore);
31216 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31217 SDValue LdOps[] = {Chain, StackPtr};
31218 SDValue Value = DAG.getMemIntrinsicNode(
31219 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
31220 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
31221 Chain = Value.getValue(1);
31223 // Now use an FIST to do the atomic store.
31224 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
31225 Chain =
31226 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
31227 StoreOps, MVT::i64, Node->getMemOperand());
31230 if (Chain) {
31231 // If this is a sequentially consistent store, also emit an appropriate
31232 // barrier.
31233 if (IsSeqCst)
31234 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
31236 return Chain;
31241 // Convert seq_cst store -> xchg
31242 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
31243 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
31244 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
31245 Node->getOperand(0), Node->getOperand(2),
31246 Node->getOperand(1), Node->getMemOperand());
31247 return Swap.getValue(1);
31250 static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
31251 SDNode *N = Op.getNode();
31252 MVT VT = N->getSimpleValueType(0);
31253 unsigned Opc = Op.getOpcode();
31255 // Let legalize expand this if it isn't a legal type yet.
31256 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31257 return SDValue();
31259 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
31260 SDLoc DL(N);
31262 // Set the carry flag.
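// Adding all-ones (-1) to the incoming carry value sets EFLAGS.CF exactly
// when that value is nonzero, so the ADC/SBB below sees the right carry-in.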
31263 SDValue Carry = Op.getOperand(2);
31264 EVT CarryVT = Carry.getValueType();
31265 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
31266 Carry, DAG.getAllOnesConstant(DL, CarryVT));
31268 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
31269 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
31270 Op.getOperand(0), Op.getOperand(1),
31271 Carry.getValue(1));
31273 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
31274 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
31275 Sum.getValue(1), DL, DAG);
31276 if (N->getValueType(1) == MVT::i1)
31277 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
31279 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
31282 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
31283 SelectionDAG &DAG) {
31284 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
31286 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
31287 // which returns the values as { float, float } (in XMM0) or
31288 // { double, double } (which is returned in XMM0, XMM1).
31289 SDLoc dl(Op);
31290 SDValue Arg = Op.getOperand(0);
31291 EVT ArgVT = Arg.getValueType();
31292 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
31294 TargetLowering::ArgListTy Args;
31295 TargetLowering::ArgListEntry Entry;
31297 Entry.Node = Arg;
31298 Entry.Ty = ArgTy;
31299 Entry.IsSExt = false;
31300 Entry.IsZExt = false;
31301 Args.push_back(Entry);
31303 bool isF64 = ArgVT == MVT::f64;
31304 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
31305 // the small struct {f32, f32} is returned in (eax, edx). For f64,
31306 // the results are returned via SRet in memory.
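// Modeling the f32 return as a <4 x float> vector (see RetTy below) lets both
// results come back in xmm0, where they are extracted from lanes 0 and 1
// after the call.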
31307 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31308 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
31309 const char *LibcallName = TLI.getLibcallName(LC);
31310 SDValue Callee =
31311 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
31313 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
31314 : (Type *)FixedVectorType::get(ArgTy, 4);
31316 TargetLowering::CallLoweringInfo CLI(DAG);
31317 CLI.setDebugLoc(dl)
31318 .setChain(DAG.getEntryNode())
31319 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
31321 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
31323 if (isF64)
31324 // Returned in xmm0 and xmm1.
31325 return CallResult.first;
31327 // Returned in bits 0:31 and 32:63 of xmm0.
31328 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31329 CallResult.first, DAG.getIntPtrConstant(0, dl));
31330 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31331 CallResult.first, DAG.getIntPtrConstant(1, dl));
31332 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
31333 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
31336 /// Widen a vector input to a vector of NVT. The
31337 /// input vector must have the same element type as NVT.
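/// For example, widening a v4i32 value to v16i32 yields
/// (insert_subvector (undef|zero v16i32), v4i32 X, 0), with the fill value
/// chosen by \p FillWithZeroes.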
31338 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
31339 bool FillWithZeroes = false) {
31340 // Check if InOp already has the right width.
31341 MVT InVT = InOp.getSimpleValueType();
31342 if (InVT == NVT)
31343 return InOp;
31345 if (InOp.isUndef())
31346 return DAG.getUNDEF(NVT);
31348 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
31349 "input and widen element type must match");
31351 unsigned InNumElts = InVT.getVectorNumElements();
31352 unsigned WidenNumElts = NVT.getVectorNumElements();
31353 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
31354 "Unexpected request for vector widening");
31356 SDLoc dl(InOp);
31357 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
31358 InOp.getNumOperands() == 2) {
31359 SDValue N1 = InOp.getOperand(1);
31360 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
31361 N1.isUndef()) {
31362 InOp = InOp.getOperand(0);
31363 InVT = InOp.getSimpleValueType();
31364 InNumElts = InVT.getVectorNumElements();
31367 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
31368 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
31369 SmallVector<SDValue, 16> Ops;
31370 for (unsigned i = 0; i < InNumElts; ++i)
31371 Ops.push_back(InOp.getOperand(i));
31373 EVT EltVT = InOp.getOperand(0).getValueType();
31375 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
31376 DAG.getUNDEF(EltVT);
31377 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
31378 Ops.push_back(FillVal);
31379 return DAG.getBuildVector(NVT, dl, Ops);
31381 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
31382 DAG.getUNDEF(NVT);
31383 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
31384 InOp, DAG.getIntPtrConstant(0, dl));
31387 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
31388 SelectionDAG &DAG) {
31389 assert(Subtarget.hasAVX512() &&
31390 "MGATHER/MSCATTER are supported on AVX-512 arch only");
31392 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
31393 SDValue Src = N->getValue();
31394 MVT VT = Src.getSimpleValueType();
31395 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
31396 SDLoc dl(Op);
31398 SDValue Scale = N->getScale();
31399 SDValue Index = N->getIndex();
31400 SDValue Mask = N->getMask();
31401 SDValue Chain = N->getChain();
31402 SDValue BasePtr = N->getBasePtr();
31404 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
31405 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31406 // If the index is v2i64 and we have VLX we can use xmm for data and index.
31407 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
31408 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31409 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
31410 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
31411 SDVTList VTs = DAG.getVTList(MVT::Other);
31412 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31413 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31414 N->getMemoryVT(), N->getMemOperand());
31416 return SDValue();
31419 MVT IndexVT = Index.getSimpleValueType();
31421 // If the index is v2i32, we're being called by type legalization and we
31422 // should just let the default handling take care of it.
31423 if (IndexVT == MVT::v2i32)
31424 return SDValue();
31426 // If we don't have VLX and neither the data nor the index is 512 bits, we
31427 // need to widen until one is.
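// For example, a v4f32 scatter with a v4i64 index is widened by a factor of 2
// to v8f32 data with a 512-bit v8i64 index; the extra mask lanes are filled
// with zero so the new elements are never stored.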
31428 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
31429 !Index.getSimpleValueType().is512BitVector()) {
31430 // Determine how much we need to widen by to get a 512-bit type.
31431 unsigned Factor = std::min(512/VT.getSizeInBits(),
31432 512/IndexVT.getSizeInBits());
31433 unsigned NumElts = VT.getVectorNumElements() * Factor;
31435 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31436 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31437 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31439 Src = ExtendToType(Src, VT, DAG);
31440 Index = ExtendToType(Index, IndexVT, DAG);
31441 Mask = ExtendToType(Mask, MaskVT, DAG, true);
31444 SDVTList VTs = DAG.getVTList(MVT::Other);
31445 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31446 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31447 N->getMemoryVT(), N->getMemOperand());
31450 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
31451 SelectionDAG &DAG) {
31453 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
31454 MVT VT = Op.getSimpleValueType();
31455 MVT ScalarVT = VT.getScalarType();
31456 SDValue Mask = N->getMask();
31457 MVT MaskVT = Mask.getSimpleValueType();
31458 SDValue PassThru = N->getPassThru();
31459 SDLoc dl(Op);
31461 // Handle AVX masked loads which don't support passthru other than 0.
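// The AVX VMASKMOV/VPMASKMOV forms zero the masked-off lanes, so a nonzero
// passthru value has to be blended back in with a VSELECT after the load.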
31462 if (MaskVT.getVectorElementType() != MVT::i1) {
31463 // We also allow undef in the isel pattern.
31464 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
31465 return Op;
31467 SDValue NewLoad = DAG.getMaskedLoad(
31468 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31469 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
31470 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
31471 N->isExpandingLoad());
31472 // Emit a blend.
31473 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
31474 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
31477 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
31478 "Expanding masked load is supported on AVX-512 target only!");
31480 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
31481 "Expanding masked load is supported for 32 and 64-bit types only!");
31483 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31484 "Cannot lower masked load op.");
31486 assert((ScalarVT.getSizeInBits() >= 32 ||
31487 (Subtarget.hasBWI() &&
31488 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31489 "Unsupported masked load op.");
31491 // This operation is legal for targets with VLX, but without
31492 // VLX the vector should be widened to 512 bits.
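// For example, a v8i32 masked load without VLX becomes a v16i32 load with a
// v16i1 mask whose upper eight bits are zero; the original v8i32 value is
// re-extracted from the wide result below.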
31493 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
31494 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31495 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
31497 // Mask element has to be i1.
31498 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31499 "Unexpected mask type");
31501 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31503 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31504 SDValue NewLoad = DAG.getMaskedLoad(
31505 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31506 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
31507 N->getExtensionType(), N->isExpandingLoad());
31509 SDValue Extract =
31510 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
31511 DAG.getIntPtrConstant(0, dl));
31512 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
31513 return DAG.getMergeValues(RetOps, dl);
31516 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
31517 SelectionDAG &DAG) {
31518 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
31519 SDValue DataToStore = N->getValue();
31520 MVT VT = DataToStore.getSimpleValueType();
31521 MVT ScalarVT = VT.getScalarType();
31522 SDValue Mask = N->getMask();
31523 SDLoc dl(Op);
31525 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
31526 "Expanding masked load is supported on AVX-512 target only!");
31528 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
31529 "Expanding masked load is supported for 32 and 64-bit types only!");
31531 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31532 "Cannot lower masked store op.");
31534 assert((ScalarVT.getSizeInBits() >= 32 ||
31535 (Subtarget.hasBWI() &&
31536 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31537 "Unsupported masked store op.");
31539 // This operation is legal for targets with VLX, but without
31540 // VLX the vector should be widened to 512 bits.
31541 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
31542 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31544 // Mask element has to be i1.
31545 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31546 "Unexpected mask type");
31548 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31550 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
31551 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31552 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
31553 N->getOffset(), Mask, N->getMemoryVT(),
31554 N->getMemOperand(), N->getAddressingMode(),
31555 N->isTruncatingStore(), N->isCompressingStore());
31558 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
31559 SelectionDAG &DAG) {
31560 assert(Subtarget.hasAVX2() &&
31561 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
31563 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
31564 SDLoc dl(Op);
31565 MVT VT = Op.getSimpleValueType();
31566 SDValue Index = N->getIndex();
31567 SDValue Mask = N->getMask();
31568 SDValue PassThru = N->getPassThru();
31569 MVT IndexVT = Index.getSimpleValueType();
31571 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
31573 // If the index is v2i32, we're being called by type legalization.
31574 if (IndexVT == MVT::v2i32)
31575 return SDValue();
31577 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
31578 // need to widen until one is.
31579 MVT OrigVT = VT;
31580 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31581 !IndexVT.is512BitVector()) {
31582 // Determine how much we need to widen by to get a 512-bit type.
31583 unsigned Factor = std::min(512/VT.getSizeInBits(),
31584 512/IndexVT.getSizeInBits());
31586 unsigned NumElts = VT.getVectorNumElements() * Factor;
31588 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31589 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31590 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31592 PassThru = ExtendToType(PassThru, VT, DAG);
31593 Index = ExtendToType(Index, IndexVT, DAG);
31594 Mask = ExtendToType(Mask, MaskVT, DAG, true);
31597 // Break dependency on the data register.
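// Gathers merge into the destination register, so an undef passthru would
// leave a false dependency on whatever previously occupied that register;
// an all-zeros passthru breaks that dependency.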
31598 if (PassThru.isUndef())
31599 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
31601 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
31602 N->getScale() };
31603 SDValue NewGather = DAG.getMemIntrinsicNode(
31604 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
31605 N->getMemOperand());
31606 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
31607 NewGather, DAG.getIntPtrConstant(0, dl));
31608 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
31611 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
31612 SDLoc dl(Op);
31613 SDValue Src = Op.getOperand(0);
31614 MVT DstVT = Op.getSimpleValueType();
31616 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
31617 unsigned SrcAS = N->getSrcAddressSpace();
31619 assert(SrcAS != N->getDestAddressSpace() &&
31620 "addrspacecast must be between different address spaces");
31622 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
31623 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
31624 } else if (DstVT == MVT::i64) {
31625 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
31626 } else if (DstVT == MVT::i32) {
31627 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
31628 } else {
31629 report_fatal_error("Bad address space in addrspacecast");
31631 return Op;
31634 SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
31635 SelectionDAG &DAG) const {
31636 // TODO: Eventually, the lowering of these nodes should be informed by or
31637 // deferred to the GC strategy for the function in which they appear. For
31638 // now, however, they must be lowered to something. Since they are logically
31639 // no-ops in the case of a null GC strategy (or a GC strategy which does not
31640 // require special handling for these nodes), lower them as literal NOOPs for
31641 // the time being.
31642 SmallVector<SDValue, 2> Ops;
31643 Ops.push_back(Op.getOperand(0));
31644 if (Op->getGluedNode())
31645 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
31647 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
31648 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
31651 // Custom split CVTPS2PH with wide types.
31652 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
31653 SDLoc dl(Op);
31654 EVT VT = Op.getValueType();
31655 SDValue Lo, Hi;
31656 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
31657 EVT LoVT, HiVT;
31658 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31659 SDValue RC = Op.getOperand(1);
31660 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
31661 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
31662 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31665 static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
31666 SelectionDAG &DAG) {
31667 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
31669 // We don't support non-data prefetch without PREFETCHI.
31670 // Just preserve the chain.
31671 if (!IsData && !Subtarget.hasPREFETCHI())
31672 return Op.getOperand(0);
31674 return Op;
31677 static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
31678 unsigned OpNo) {
31679 const APInt Operand(32, OpNo);
31680 std::string OpNoStr = llvm::toString(Operand, 10, false);
31681 std::string Str(" $");
31683 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
31684 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
31686 auto I = StringRef::npos;
31687 for (auto &AsmStr : AsmStrs) {
31688 // Match the OpNo string. We must match exactly so that we do not match a
31689 // sub-string, e.g. "$12" contains "$1".
31690 if (AsmStr.ends_with(OpNoStr1))
31691 I = AsmStr.size() - OpNoStr1.size();
31693 // Get the index of operand in AsmStr.
31694 if (I == StringRef::npos)
31695 I = AsmStr.find(OpNoStr1 + ",");
31696 if (I == StringRef::npos)
31697 I = AsmStr.find(OpNoStr2);
31699 if (I == StringRef::npos)
31700 continue;
31702 assert(I > 0 && "Unexpected inline asm string!");
31703 // Remove the operand string and label (if it exists).
31704 // For example:
31705 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
31706 // ==>
31707 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
31708 // ==>
31709 // "call dword ptr "
31710 auto TmpStr = AsmStr.substr(0, I);
31711 I = TmpStr.rfind(':');
31712 if (I != StringRef::npos)
31713 TmpStr = TmpStr.substr(I + 1);
31714 return TmpStr.take_while(llvm::isAlpha);
31717 return StringRef();
31720 bool X86TargetLowering::isInlineAsmTargetBranch(
31721 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
31722 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
31723 // changed from indirect TargetLowering::C_Memory to direct
31724 // TargetLowering::C_Address.
31725 // We don't need to special case LOOP* and Jcc, which cannot target a memory
31726 // location.
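// For example, with MS-style inline asm such as '__asm call dword ptr [fn]',
// the asm string for the operand ends in something like "call dword ptr ${0:P}",
// so getInstrStrFromOpNo returns "call" and the operand is treated as a direct
// branch target.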
31727 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
31728 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
31731 /// Provide custom lowering hooks for some operations.
31732 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
31733 switch (Op.getOpcode()) {
31734 default: llvm_unreachable("Should not custom lower this!");
31735 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
31736 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
31737 return LowerCMP_SWAP(Op, Subtarget, DAG);
31738 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
31739 case ISD::ATOMIC_LOAD_ADD:
31740 case ISD::ATOMIC_LOAD_SUB:
31741 case ISD::ATOMIC_LOAD_OR:
31742 case ISD::ATOMIC_LOAD_XOR:
31743 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
31744 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
31745 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
31746 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
31747 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
31748 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
31749 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
31750 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
31751 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
31752 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
31753 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
31754 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
31755 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
31756 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
31757 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
31758 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
31759 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
31760 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
31761 case ISD::SHL_PARTS:
31762 case ISD::SRA_PARTS:
31763 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
31764 case ISD::FSHL:
31765 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
31766 case ISD::STRICT_SINT_TO_FP:
31767 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
31768 case ISD::STRICT_UINT_TO_FP:
31769 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
31770 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
31771 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
31772 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
31773 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
31774 case ISD::ZERO_EXTEND_VECTOR_INREG:
31775 case ISD::SIGN_EXTEND_VECTOR_INREG:
31776 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
31777 case ISD::FP_TO_SINT:
31778 case ISD::STRICT_FP_TO_SINT:
31779 case ISD::FP_TO_UINT:
31780 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
31781 case ISD::FP_TO_SINT_SAT:
31782 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
31783 case ISD::FP_EXTEND:
31784 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
31785 case ISD::FP_ROUND:
31786 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
31787 case ISD::FP16_TO_FP:
31788 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
31789 case ISD::FP_TO_FP16:
31790 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
31791 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
31792 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
31793 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
31794 case ISD::FADD:
31795 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
31796 case ISD::FROUND: return LowerFROUND(Op, DAG);
31797 case ISD::FABS:
31798 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
31799 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
31800 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
31801 case ISD::LRINT:
31802 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
31803 case ISD::SETCC:
31804 case ISD::STRICT_FSETCC:
31805 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
31806 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
31807 case ISD::SELECT: return LowerSELECT(Op, DAG);
31808 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
31809 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
31810 case ISD::VASTART: return LowerVASTART(Op, DAG);
31811 case ISD::VAARG: return LowerVAARG(Op, DAG);
31812 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
31813 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
31814 case ISD::INTRINSIC_VOID:
31815 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
31816 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
31817 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
31818 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
31819 case ISD::FRAME_TO_ARGS_OFFSET:
31820 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
31821 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
31822 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
31823 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
31824 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
31825 case ISD::EH_SJLJ_SETUP_DISPATCH:
31826 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
31827 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
31828 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
31829 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
31830 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
31831 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
31832 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
31833 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
31834 case ISD::CTLZ:
31835 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
31836 case ISD::CTTZ:
31837 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
31838 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
31839 case ISD::MULHS:
31840 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
31841 case ISD::ROTL:
31842 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
31843 case ISD::SRA:
31844 case ISD::SRL:
31845 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
31846 case ISD::SADDO:
31847 case ISD::UADDO:
31848 case ISD::SSUBO:
31849 case ISD::USUBO: return LowerXALUO(Op, DAG);
31850 case ISD::SMULO:
31851 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
31852 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
31853 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
31854 case ISD::SADDO_CARRY:
31855 case ISD::SSUBO_CARRY:
31856 case ISD::UADDO_CARRY:
31857 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
31858 case ISD::ADD:
31859 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
31860 case ISD::UADDSAT:
31861 case ISD::SADDSAT:
31862 case ISD::USUBSAT:
31863 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
31864 case ISD::SMAX:
31865 case ISD::SMIN:
31866 case ISD::UMAX:
31867 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
31868 case ISD::FMINIMUM:
31869 case ISD::FMAXIMUM:
31870 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
31871 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
31872 case ISD::ABDS:
31873 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
31874 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
31875 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
31876 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
31877 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
31878 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
31879 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
31880 case ISD::GC_TRANSITION_START:
31881 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
31882 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
31883 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
31884 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
31888 /// Replace a node with an illegal result type with a new node built out of
31889 /// custom code.
31890 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
31891 SmallVectorImpl<SDValue>&Results,
31892 SelectionDAG &DAG) const {
31893 SDLoc dl(N);
31894 switch (N->getOpcode()) {
31895 default:
31896 #ifndef NDEBUG
31897 dbgs() << "ReplaceNodeResults: ";
31898 N->dump(&DAG);
31899 #endif
31900 llvm_unreachable("Do not know how to custom type legalize this operation!");
31901 case X86ISD::CVTPH2PS: {
31902 EVT VT = N->getValueType(0);
31903 SDValue Lo, Hi;
31904 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31905 EVT LoVT, HiVT;
31906 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31907 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
31908 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
31909 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31910 Results.push_back(Res);
31911 return;
31913 case X86ISD::STRICT_CVTPH2PS: {
31914 EVT VT = N->getValueType(0);
31915 SDValue Lo, Hi;
31916 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
31917 EVT LoVT, HiVT;
31918 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31919 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
31920 {N->getOperand(0), Lo});
31921 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
31922 {N->getOperand(0), Hi});
31923 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31924 Lo.getValue(1), Hi.getValue(1));
31925 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31926 Results.push_back(Res);
31927 Results.push_back(Chain);
31928 return;
31930 case X86ISD::CVTPS2PH:
31931 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
31932 return;
31933 case ISD::CTPOP: {
31934 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31935 // Use a v2i64 if possible.
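// Moving the i64 into an XMM register and reusing the vector CTPOP lowering
// avoids splitting the count into two 32-bit scalar popcount expansions on
// 32-bit targets, where i64 is illegal.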
31936 bool NoImplicitFloatOps =
31937 DAG.getMachineFunction().getFunction().hasFnAttribute(
31938 Attribute::NoImplicitFloat);
31939 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
31940 SDValue Wide =
31941 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
31942 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
31943 // Bit count should fit in 32 bits; extract it as an i32 and then zero
31944 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
31945 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
31946 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
31947 DAG.getIntPtrConstant(0, dl));
31948 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
31949 Results.push_back(Wide);
31951 return;
31953 case ISD::MUL: {
31954 EVT VT = N->getValueType(0);
31955 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31956 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
31957 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
31958 // elements are needed.
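// For example, a v2i8 multiply is performed as v2i16, truncated back to v2i8,
// and then padded out to v16i8 with undef so the result type is legal.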
31959 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
31960 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
31961 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
31962 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
31963 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
31964 unsigned NumConcats = 16 / VT.getVectorNumElements();
31965 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
31966 ConcatOps[0] = Res;
31967 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
31968 Results.push_back(Res);
31969 return;
31971 case ISD::SMULO:
31972 case ISD::UMULO: {
31973 EVT VT = N->getValueType(0);
31974 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31975 VT == MVT::v2i32 && "Unexpected VT!");
31976 bool IsSigned = N->getOpcode() == ISD::SMULO;
31977 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
31978 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
31979 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
31980 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
31981 // Extract the high 32 bits from each result using PSHUFD.
31982 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
31983 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
31984 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
31985 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
31986 DAG.getIntPtrConstant(0, dl));
31988 // Truncate the low bits of the result. This will become PSHUFD.
31989 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
31991 SDValue HiCmp;
31992 if (IsSigned) {
31993 // SMULO overflows if the high bits don't match the sign of the low.
31994 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
31995 } else {
31996 // UMULO overflows if the high bits are non-zero.
31997 HiCmp = DAG.getConstant(0, dl, VT);
31999 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
32001 // Widen the result by padding with undef.
32002 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32003 DAG.getUNDEF(VT));
32004 Results.push_back(Res);
32005 Results.push_back(Ovf);
32006 return;
32008 case X86ISD::VPMADDWD: {
32009 // Legalize types for X86ISD::VPMADDWD by widening.
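// For example, a v2i32 VPMADDWD with v4i16 inputs is widened to a v4i32
// result with v8i16 inputs by padding the unused input lanes with undef.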
32010 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32012 EVT VT = N->getValueType(0);
32013 EVT InVT = N->getOperand(0).getValueType();
32014 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
32015 "Expected a VT that divides into 128 bits.");
32016 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32017 "Unexpected type action!");
32018 unsigned NumConcat = 128 / InVT.getSizeInBits();
32020 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
32021 InVT.getVectorElementType(),
32022 NumConcat * InVT.getVectorNumElements());
32023 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
32024 VT.getVectorElementType(),
32025 NumConcat * VT.getVectorNumElements());
32027 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
32028 Ops[0] = N->getOperand(0);
32029 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32030 Ops[0] = N->getOperand(1);
32031 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32033 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
32034 Results.push_back(Res);
32035 return;
32037 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
32038 case X86ISD::FMINC:
32039 case X86ISD::FMIN:
32040 case X86ISD::FMAXC:
32041 case X86ISD::FMAX: {
32042 EVT VT = N->getValueType(0);
32043 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
32044 SDValue UNDEF = DAG.getUNDEF(VT);
32045 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32046 N->getOperand(0), UNDEF);
32047 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32048 N->getOperand(1), UNDEF);
32049 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
32050 return;
32052 case ISD::SDIV:
32053 case ISD::UDIV:
32054 case ISD::SREM:
32055 case ISD::UREM: {
32056 EVT VT = N->getValueType(0);
32057 if (VT.isVector()) {
32058 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32059 "Unexpected type action!");
32060 // If the RHS is a constant splat vector we can widen this and let
32061 // division/remainder by constant optimize it.
32062 // TODO: Can we do something for non-splat?
32063 APInt SplatVal;
32064 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
32065 unsigned NumConcats = 128 / VT.getSizeInBits();
32066 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
32067 Ops0[0] = N->getOperand(0);
32068 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
32069 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
32070 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
32071 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
32072 Results.push_back(Res);
32074 return;
32077 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
32078 Results.push_back(V);
32079 return;
32081 case ISD::TRUNCATE: {
32082 MVT VT = N->getSimpleValueType(0);
32083 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
32084 return;
32086 // The generic legalizer will try to widen the input type to the same
32087 // number of elements as the widened result type. But this isn't always
32088 // the best thing so do some custom legalization to avoid some cases.
32089 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
32090 SDValue In = N->getOperand(0);
32091 EVT InVT = In.getValueType();
32092 EVT InEltVT = InVT.getVectorElementType();
32093 EVT EltVT = VT.getVectorElementType();
32094 unsigned MinElts = VT.getVectorNumElements();
32095 unsigned WidenNumElts = WidenVT.getVectorNumElements();
32096 unsigned InBits = InVT.getSizeInBits();
32098 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
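// For example, a v8i32->v8i16 truncate whose source is already sign- or
// zero-extended from 16 bits can be emitted as a single PACKSSDW/PACKUSDW.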
32099 unsigned PackOpcode;
32100 if (SDValue Src =
32101 matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG, Subtarget)) {
32102 if (SDValue Res = truncateVectorWithPACK(PackOpcode, VT, Src,
32103 dl, DAG, Subtarget)) {
32104 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
32105 Results.push_back(Res);
32106 return;
32110 if (128 % InBits == 0) {
32111 // 128-bit and smaller inputs should avoid the truncate altogether and
32112 // just use a build_vector that will become a shuffle.
32113 // TODO: Widen and use a shuffle directly?
32114 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
32115 // Use the original element count so we don't do more scalar opts than
32116 // necessary.
32117 for (unsigned i=0; i < MinElts; ++i) {
32118 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
32119 DAG.getIntPtrConstant(i, dl));
32120 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
32122 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
32123 return;
32126 // With AVX512 there are some cases that can use a target specific
32127 // truncate node to go from 256/512 to less than 128 with zeros in the
32128 // upper elements of the 128 bit result.
32129 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
32130 // We can use VTRUNC directly for 256-bit inputs with VLX or for any 512-bit input.
32131 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
32132 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32133 return;
32135 // There's one case we can widen to 512 bits and use VTRUNC.
32136 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
32137 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
32138 DAG.getUNDEF(MVT::v4i64));
32139 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32140 return;
32143 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
32144 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
32145 isTypeLegal(MVT::v4i64)) {
32146 // Input needs to be split and output needs to be widened. Let's use two
32147 // VTRUNCs, and shuffle their results together into the wider type.
32148 SDValue Lo, Hi;
32149 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
32151 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
32152 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
32153 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
32154 { 0, 1, 2, 3, 16, 17, 18, 19,
32155 -1, -1, -1, -1, -1, -1, -1, -1 });
32156 Results.push_back(Res);
32157 return;
32160 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
32161 // this via type legalization.
32162 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
32163 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
32164 (!Subtarget.hasSSSE3() ||
32165 (!isTypeLegal(InVT) &&
32166 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
32167 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
32168 InEltVT.getSizeInBits() * WidenNumElts);
32169 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
32170 return;
32173 return;
32175 case ISD::ANY_EXTEND:
32176 // Right now, only MVT::v8i8 has Custom action for an illegal type.
32177 // It's intended to custom handle the input type.
32178 assert(N->getValueType(0) == MVT::v8i8 &&
32179 "Do not know how to legalize this Node");
32180 return;
32181 case ISD::SIGN_EXTEND:
32182 case ISD::ZERO_EXTEND: {
32183 EVT VT = N->getValueType(0);
32184 SDValue In = N->getOperand(0);
32185 EVT InVT = In.getValueType();
32186 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
32187 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
32188 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
32189 "Unexpected type action!");
32190 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
32191 // Custom split this so we can extend i8/i16->i32 invec. This is better
32192 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
32193 // sra, and then an extend from i32 to i64 using pcmpgt. By custom splitting
32194 // we allow the sra from the extend to i32 to be shared by the split.
32195 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
32197 // Fill a vector with sign bits for each element.
32198 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
32199 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
32201 // Create an unpackl and unpackh to interleave the sign bits then bitcast
32202 // to v2i64.
32203 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32204 {0, 4, 1, 5});
32205 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
32206 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32207 {2, 6, 3, 7});
32208 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
32210 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32211 Results.push_back(Res);
32212 return;
32215 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
32216 if (!InVT.is128BitVector()) {
32217 // Not a 128 bit vector, but maybe type legalization will promote
32218 // it to 128 bits.
32219 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
32220 return;
32221 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
32222 if (!InVT.is128BitVector())
32223 return;
32225 // Promote the input to 128 bits. Type legalization will turn this into
32226 // zext_inreg/sext_inreg.
32227 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
32230 // Perform custom splitting instead of the two stage extend we would get
32231 // by default.
32232 EVT LoVT, HiVT;
32233 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
32234 assert(isTypeLegal(LoVT) && "Split VT not legal?");
32236 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
32238 // We need to shift the input over by half the number of elements.
32239 unsigned NumElts = InVT.getVectorNumElements();
32240 unsigned HalfNumElts = NumElts / 2;
32241 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
32242 for (unsigned i = 0; i != HalfNumElts; ++i)
32243 ShufMask[i] = i + HalfNumElts;
32245 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
32246 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
32248 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32249 Results.push_back(Res);
32251 return;
32253 case ISD::FP_TO_SINT:
32254 case ISD::STRICT_FP_TO_SINT:
32255 case ISD::FP_TO_UINT:
32256 case ISD::STRICT_FP_TO_UINT: {
32257 bool IsStrict = N->isStrictFPOpcode();
32258 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
32259 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
32260 EVT VT = N->getValueType(0);
32261 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32262 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
32263 EVT SrcVT = Src.getValueType();
32265 SDValue Res;
32266 if (isSoftF16(SrcVT, Subtarget)) {
32267 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
32268 if (IsStrict) {
32269 Res =
32270 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
32271 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
32272 {NVT, MVT::Other}, {Chain, Src})});
32273 Chain = Res.getValue(1);
32274 } else {
32275 Res = DAG.getNode(N->getOpcode(), dl, VT,
32276 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
32278 Results.push_back(Res);
32279 if (IsStrict)
32280 Results.push_back(Chain);
32282 return;
32285 if (VT.isVector() && Subtarget.hasFP16() &&
32286 SrcVT.getVectorElementType() == MVT::f16) {
32287 EVT EleVT = VT.getVectorElementType();
32288 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
32290 if (SrcVT != MVT::v8f16) {
32291 SDValue Tmp =
32292 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
32293 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
32294 Ops[0] = Src;
32295 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
32298 if (IsStrict) {
32299 unsigned Opc =
32300 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32301 Res =
32302 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
32303 Chain = Res.getValue(1);
32304 } else {
32305 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32306 Res = DAG.getNode(Opc, dl, ResVT, Src);
32309 // TODO: Need to add exception check code for strict FP.
32310 if (EleVT.getSizeInBits() < 16) {
32311 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
32312 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
32314 // Now widen to 128 bits.
32315 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
32316 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
32317 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
32318 ConcatOps[0] = Res;
32319 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32322 Results.push_back(Res);
32323 if (IsStrict)
32324 Results.push_back(Chain);
32326 return;
32329 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
32330 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32331 "Unexpected type action!");
32333 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
32334 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
32335 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
32336 VT.getVectorNumElements());
32337 SDValue Res;
32338 SDValue Chain;
32339 if (IsStrict) {
32340 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
32341 {N->getOperand(0), Src});
32342 Chain = Res.getValue(1);
32343 } else
32344 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
32346 // Preserve what we know about the size of the original result. If the
32347 // result is v2i32, we have to manually widen the assert.
32348 if (PromoteVT == MVT::v2i32)
32349 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32350 DAG.getUNDEF(MVT::v2i32));
32352 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
32353 Res.getValueType(), Res,
32354 DAG.getValueType(VT.getVectorElementType()));
32356 if (PromoteVT == MVT::v2i32)
32357 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
32358 DAG.getIntPtrConstant(0, dl));
32360 // Truncate back to the original width.
32361 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32363 // Now widen to 128 bits.
32364 unsigned NumConcats = 128 / VT.getSizeInBits();
32365 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
32366 VT.getVectorNumElements() * NumConcats);
32367 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32368 ConcatOps[0] = Res;
32369 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32370 Results.push_back(Res);
32371 if (IsStrict)
32372 Results.push_back(Chain);
32373 return;
32377 if (VT == MVT::v2i32) {
32378 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
32379 "Strict unsigned conversion requires AVX512");
32380 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32381 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32382 "Unexpected type action!");
32383 if (Src.getValueType() == MVT::v2f64) {
32384 if (!IsSigned && !Subtarget.hasAVX512()) {
32385 SDValue Res =
32386 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
32387 Results.push_back(Res);
32388 return;
32391 unsigned Opc;
32392 if (IsStrict)
32393 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32394 else
32395 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32397 // If we have VLX we can emit a target specific FP_TO_UINT node.
32398 if (!IsSigned && !Subtarget.hasVLX()) {
32399 // Otherwise we can defer to the generic legalizer which will widen
32400 // the input as well. This will be further widened during op
32401 // legalization to v8i32<-v8f64.
32402 // For strict nodes we'll need to widen ourselves.
32403 // FIXME: Fix the type legalizer to safely widen strict nodes?
32404 if (!IsStrict)
32405 return;
32406 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
32407 DAG.getConstantFP(0.0, dl, MVT::v2f64));
32408 Opc = N->getOpcode();
32410 SDValue Res;
32411 SDValue Chain;
32412 if (IsStrict) {
32413 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
32414 {N->getOperand(0), Src});
32415 Chain = Res.getValue(1);
32416 } else {
32417 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
32419 Results.push_back(Res);
32420 if (IsStrict)
32421 Results.push_back(Chain);
32422 return;
32425 // Custom widen strict v2f32->v2i32 by padding with zeros.
32426 // FIXME: Should generic type legalizer do this?
32427 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
32428 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
32429 DAG.getConstantFP(0.0, dl, MVT::v2f32));
32430 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
32431 {N->getOperand(0), Src});
32432 Results.push_back(Res);
32433 Results.push_back(Res.getValue(1));
32434 return;
32437 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
32438 // so early out here.
32439 return;
32442 assert(!VT.isVector() && "Vectors should have been handled above!");
32444 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
32445 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
32446 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
32447 assert(!Subtarget.is64Bit() && "i64 should be legal");
32448 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
32449 // If we use a 128-bit result we might need to use a target specific node.
32450 unsigned SrcElts =
32451 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
32452 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
32453 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
32454 unsigned Opc = N->getOpcode();
32455 if (NumElts != SrcElts) {
32456 if (IsStrict)
32457 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32458 else
32459 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32462 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
32463 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
32464 DAG.getConstantFP(0.0, dl, VecInVT), Src,
32465 ZeroIdx);
32466 SDValue Chain;
32467 if (IsStrict) {
32468 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
32469 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
32470 Chain = Res.getValue(1);
32471 } else
32472 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
32473 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
32474 Results.push_back(Res);
32475 if (IsStrict)
32476 Results.push_back(Chain);
32477 return;
32480 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
32481 SDValue Chain;
32482 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
32483 Results.push_back(V);
32484 if (IsStrict)
32485 Results.push_back(Chain);
32486 return;
32489 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
32490 Results.push_back(V);
32491 if (IsStrict)
32492 Results.push_back(Chain);
32494 return;
32496 case ISD::LRINT:
32497 case ISD::LLRINT: {
32498 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
32499 Results.push_back(V);
32500 return;
32503 case ISD::SINT_TO_FP:
32504 case ISD::STRICT_SINT_TO_FP:
32505 case ISD::UINT_TO_FP:
32506 case ISD::STRICT_UINT_TO_FP: {
32507 bool IsStrict = N->isStrictFPOpcode();
32508 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
32509 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
32510 EVT VT = N->getValueType(0);
32511 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32512 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
32513 Subtarget.hasVLX()) {
32514 if (Src.getValueType().getVectorElementType() == MVT::i16)
32515 return;
32517 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
32518 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
32519 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
32520 : DAG.getUNDEF(MVT::v2i32));
32521 if (IsStrict) {
32522 unsigned Opc =
32523 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
32524 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
32525 {N->getOperand(0), Src});
32526 Results.push_back(Res);
32527 Results.push_back(Res.getValue(1));
32528 } else {
32529 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32530 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
32532 return;
32534 if (VT != MVT::v2f32)
32535 return;
32536 EVT SrcVT = Src.getValueType();
32537 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
32538 if (IsStrict) {
32539 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
32540 : X86ISD::STRICT_CVTUI2P;
32541 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
32542 {N->getOperand(0), Src});
32543 Results.push_back(Res);
32544 Results.push_back(Res.getValue(1));
32545 } else {
32546 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32547 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
32549 return;
32551 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
32552 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
32553 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
32554 SDValue One = DAG.getConstant(1, dl, SrcVT);
32555 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
32556 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
32557 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
32558 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
32559 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
32560 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
32561 for (int i = 0; i != 2; ++i) {
32562 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
32563 SignSrc, DAG.getIntPtrConstant(i, dl));
32564 if (IsStrict)
32565 SignCvts[i] =
32566 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
32567 {N->getOperand(0), Elt});
32568 else
32569 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
32571 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
32572 SDValue Slow, Chain;
32573 if (IsStrict) {
32574 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32575 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
32576 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
32577 {Chain, SignCvt, SignCvt});
32578 Chain = Slow.getValue(1);
32579 } else {
32580 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
32582 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
32583 IsNeg =
32584 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
32585 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
32586 Results.push_back(Cvt);
32587 if (IsStrict)
32588 Results.push_back(Chain);
32589 return;
32592 if (SrcVT != MVT::v2i32)
32593 return;
32595 if (IsSigned || Subtarget.hasAVX512()) {
32596 if (!IsStrict)
32597 return;
32599 // Custom widen strict v2i32->v2f32 to avoid scalarization.
32600 // FIXME: Should generic type legalizer do this?
32601 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
32602 DAG.getConstant(0, dl, MVT::v2i32));
32603 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
32604 {N->getOperand(0), Src});
32605 Results.push_back(Res);
32606 Results.push_back(Res.getValue(1));
32607 return;
32610 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32611 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
32612 SDValue VBias = DAG.getConstantFP(
32613 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
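// 0x4330000000000000 is the double 2^52. OR-ing the zero-extended 32-bit
// value into its mantissa produces exactly 2^52 + x, so the FSUB below
// recovers x as a double without any rounding.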
32614 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
32615 DAG.getBitcast(MVT::v2i64, VBias));
32616 Or = DAG.getBitcast(MVT::v2f64, Or);
32617 if (IsStrict) {
32618 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
32619 {N->getOperand(0), Or, VBias});
32620 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
32621 {MVT::v4f32, MVT::Other},
32622 {Sub.getValue(1), Sub});
32623 Results.push_back(Res);
32624 Results.push_back(Res.getValue(1));
32625 } else {
32626 // TODO: Are there any fast-math-flags to propagate here?
32627 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
32628 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
32630 return;
32632 case ISD::STRICT_FP_ROUND:
32633 case ISD::FP_ROUND: {
32634 bool IsStrict = N->isStrictFPOpcode();
32635 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
32636 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32637 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
32638 EVT SrcVT = Src.getValueType();
32639 EVT VT = N->getValueType(0);
32640 SDValue V;
32641 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
32642 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
32643 : DAG.getUNDEF(MVT::v2f32);
32644 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
32646 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
32647 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
32648 if (SrcVT.getVectorElementType() != MVT::f32)
32649 return;
32651 if (IsStrict)
32652 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
32653 {Chain, Src, Rnd});
32654 else
32655 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
32657 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
32658 if (IsStrict)
32659 Results.push_back(V.getValue(1));
32660 return;
32662 if (!isTypeLegal(Src.getValueType()))
32663 return;
32664 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
32665 if (IsStrict)
32666 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
32667 {Chain, Src});
32668 else
32669 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
32670 Results.push_back(V);
32671 if (IsStrict)
32672 Results.push_back(V.getValue(1));
32673 return;
32675 case ISD::FP_EXTEND:
32676 case ISD::STRICT_FP_EXTEND: {
32677 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
32678 // No other ValueType for FP_EXTEND should reach this point.
32679 assert(N->getValueType(0) == MVT::v2f32 &&
32680 "Do not know how to legalize this Node");
32681 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
32682 return;
32683 bool IsStrict = N->isStrictFPOpcode();
32684 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32685 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
32686 : DAG.getUNDEF(MVT::v2f16);
32687 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
32688 if (IsStrict)
32689 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
32690 {N->getOperand(0), V});
32691 else
32692 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
32693 Results.push_back(V);
32694 if (IsStrict)
32695 Results.push_back(V.getValue(1));
32696 return;
32698 case ISD::INTRINSIC_W_CHAIN: {
32699 unsigned IntNo = N->getConstantOperandVal(1);
32700 switch (IntNo) {
32701 default : llvm_unreachable("Do not know how to custom type "
32702 "legalize this intrinsic operation!");
32703 case Intrinsic::x86_rdtsc:
32704 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
32705 Results);
32706 case Intrinsic::x86_rdtscp:
32707 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
32708 Results);
32709 case Intrinsic::x86_rdpmc:
32710 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
32711 Results);
32712 return;
32713 case Intrinsic::x86_rdpru:
32714 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
32715 Results);
32716 return;
32717 case Intrinsic::x86_xgetbv:
32718 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
32719 Results);
32720 return;
32723 case ISD::READCYCLECOUNTER: {
32724 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
32726 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
32727 EVT T = N->getValueType(0);
32728 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
32729 bool Regs64bit = T == MVT::i128;
32730 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
32731 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
32732 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
32733 SDValue cpInL, cpInH;
32734 std::tie(cpInL, cpInH) =
32735 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
32736 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
32737 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
32738 cpInH =
32739 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
32740 cpInH, cpInL.getValue(1));
32741 SDValue swapInL, swapInH;
32742 std::tie(swapInL, swapInH) =
32743 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
32744 swapInH =
32745 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
32746 swapInH, cpInH.getValue(1));
32748 // In 64-bit mode we might need the base pointer in RBX, but we can't know
32749 // until later. So we keep the RBX input in a vreg and use a custom
32750 // inserter.
32751 // Since RBX will be a reserved register, the register allocator will not
32752 // ensure that its value is properly saved and restored around this
32753 // live range.
32754 SDValue Result;
32755 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32756 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
32757 if (Regs64bit) {
32758 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
32759 swapInH.getValue(1)};
32760 Result =
32761 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
32762 } else {
32763 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
32764 swapInH.getValue(1));
32765 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
32766 swapInL.getValue(1)};
32767 Result =
32768 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
32771 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
32772 Regs64bit ? X86::RAX : X86::EAX,
32773 HalfT, Result.getValue(1));
32774 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
32775 Regs64bit ? X86::RDX : X86::EDX,
32776 HalfT, cpOutL.getValue(2));
32777 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
32779 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
32780 MVT::i32, cpOutH.getValue(2));
32781 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
32782 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
32784 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
32785 Results.push_back(Success);
32786 Results.push_back(EFLAGS.getValue(1));
32787 return;
32789 case ISD::ATOMIC_LOAD: {
32790 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32791 bool NoImplicitFloatOps =
32792 DAG.getMachineFunction().getFunction().hasFnAttribute(
32793 Attribute::NoImplicitFloat);
32794 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
32795 auto *Node = cast<AtomicSDNode>(N);
32796 if (Subtarget.hasSSE1()) {
32797 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
32798 // Then extract the lower 64-bits.
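        // (This lowering relies on a naturally aligned 8-byte SSE load being
        //  performed as a single atomic access on x86, which is what makes a
        //  64-bit atomic load legal on 32-bit targets here.)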
32799 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
32800 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
32801 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
32802 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
32803 MVT::i64, Node->getMemOperand());
32804 if (Subtarget.hasSSE2()) {
32805 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
32806 DAG.getIntPtrConstant(0, dl));
32807 Results.push_back(Res);
32808 Results.push_back(Ld.getValue(1));
32809 return;
32811 // We use an alternative sequence for SSE1 that extracts as v2f32 and
32812 // then casts to i64. This avoids a 128-bit stack temporary being
32813 // created by type legalization if we were to cast v4f32->v2i64.
32814 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
32815 DAG.getIntPtrConstant(0, dl));
32816 Res = DAG.getBitcast(MVT::i64, Res);
32817 Results.push_back(Res);
32818 Results.push_back(Ld.getValue(1));
32819 return;
32821 if (Subtarget.hasX87()) {
32822 // First load this into an 80-bit X87 register. This will put the whole
32823 // integer into the significand.
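        // (An f80 value has a 64-bit significand, so an i64 round-trips
        //  through FILD/FIST without losing precision.)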
32824 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
32825 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
32826 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
32827 dl, Tys, Ops, MVT::i64,
32828 Node->getMemOperand());
32829 SDValue Chain = Result.getValue(1);
32831 // Now store the X87 register to a stack temporary and convert to i64.
32832 // This store is not atomic and doesn't need to be.
32833 // FIXME: We don't need a stack temporary if the result of the load
32834 // is already being stored. We could just directly store there.
32835 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
32836 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
32837 MachinePointerInfo MPI =
32838 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
32839 SDValue StoreOps[] = { Chain, Result, StackPtr };
32840 Chain = DAG.getMemIntrinsicNode(
32841 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
32842 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
32844 // Finally load the value back from the stack temporary and return it.
32845 // This load is not atomic and doesn't need to be.
32846 // This load will be further type legalized.
32847 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
32848 Results.push_back(Result);
32849 Results.push_back(Result.getValue(1));
32850 return;
32853 // TODO: Use MOVLPS when SSE1 is available?
32854 // Delegate to generic TypeLegalization. Situations we can really handle
32855 // should have already been dealt with by AtomicExpandPass.cpp.
32856 break;
32858 case ISD::ATOMIC_SWAP:
32859 case ISD::ATOMIC_LOAD_ADD:
32860 case ISD::ATOMIC_LOAD_SUB:
32861 case ISD::ATOMIC_LOAD_AND:
32862 case ISD::ATOMIC_LOAD_OR:
32863 case ISD::ATOMIC_LOAD_XOR:
32864 case ISD::ATOMIC_LOAD_NAND:
32865 case ISD::ATOMIC_LOAD_MIN:
32866 case ISD::ATOMIC_LOAD_MAX:
32867 case ISD::ATOMIC_LOAD_UMIN:
32868 case ISD::ATOMIC_LOAD_UMAX:
32869 // Delegate to generic TypeLegalization. Situations we can really handle
32870 // should have already been dealt with by AtomicExpandPass.cpp.
32871 break;
32873 case ISD::BITCAST: {
32874 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32875 EVT DstVT = N->getValueType(0);
32876 EVT SrcVT = N->getOperand(0).getValueType();
32878 // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target,
32879 // we can split using the k-register rather than going through memory.
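    // (Each v32i1 half stays in a k-register and is bitcast to an i32; the
    //  i64 result is then rebuilt with a BUILD_PAIR of the two halves.)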
32880 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
32881 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32882 SDValue Lo, Hi;
32883 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
32884 Lo = DAG.getBitcast(MVT::i32, Lo);
32885 Hi = DAG.getBitcast(MVT::i32, Hi);
32886 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
32887 Results.push_back(Res);
32888 return;
32891 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
32892 // FIXME: Use v4f32 for SSE1?
32893 assert(Subtarget.hasSSE2() && "Requires SSE2");
32894 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
32895 "Unexpected type action!");
32896 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
32897 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
32898 N->getOperand(0));
32899 Res = DAG.getBitcast(WideVT, Res);
32900 Results.push_back(Res);
32901 return;
32904 return;
32906 case ISD::MGATHER: {
32907 EVT VT = N->getValueType(0);
32908 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
32909 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
32910 auto *Gather = cast<MaskedGatherSDNode>(N);
32911 SDValue Index = Gather->getIndex();
32912 if (Index.getValueType() != MVT::v2i64)
32913 return;
32914 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32915 "Unexpected type action!");
32916 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32917 SDValue Mask = Gather->getMask();
32918 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
32919 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
32920 Gather->getPassThru(),
32921 DAG.getUNDEF(VT));
32922 if (!Subtarget.hasVLX()) {
32923 // We need to widen the mask, but the instruction will only use 2
32924 // of its elements. So we can use undef.
32925 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
32926 DAG.getUNDEF(MVT::v2i1));
32927 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
32929 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
32930 Gather->getBasePtr(), Index, Gather->getScale() };
32931 SDValue Res = DAG.getMemIntrinsicNode(
32932 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
32933 Gather->getMemoryVT(), Gather->getMemOperand());
32934 Results.push_back(Res);
32935 Results.push_back(Res.getValue(1));
32936 return;
32938 return;
32940 case ISD::LOAD: {
32941 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
32942 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
32943 // cast since type legalization will try to use an i64 load.
32944 MVT VT = N->getSimpleValueType(0);
32945 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
32946 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32947 "Unexpected type action!");
32948 if (!ISD::isNON_EXTLoad(N))
32949 return;
32950 auto *Ld = cast<LoadSDNode>(N);
32951 if (Subtarget.hasSSE2()) {
32952 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
32953 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
32954 Ld->getPointerInfo(), Ld->getOriginalAlign(),
32955 Ld->getMemOperand()->getFlags());
32956 SDValue Chain = Res.getValue(1);
32957 MVT VecVT = MVT::getVectorVT(LdVT, 2);
32958 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
32959 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32960 Res = DAG.getBitcast(WideVT, Res);
32961 Results.push_back(Res);
32962 Results.push_back(Chain);
32963 return;
32965 assert(Subtarget.hasSSE1() && "Expected SSE");
32966 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
32967 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
32968 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
32969 MVT::i64, Ld->getMemOperand());
32970 Results.push_back(Res);
32971 Results.push_back(Res.getValue(1));
32972 return;
32974 case ISD::ADDRSPACECAST: {
32975 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
32976 Results.push_back(V);
32977 return;
32979 case ISD::BITREVERSE: {
32980 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32981 assert(Subtarget.hasXOP() && "Expected XOP");
32982 // We can use VPPERM by copying to a vector register and back. We'll need
32983 // to move the scalar in two i32 pieces.
32984 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
32985 return;
32987 case ISD::EXTRACT_VECTOR_ELT: {
32988 // f16 = extract vXf16 %vec, i64 %idx
32989 assert(N->getSimpleValueType(0) == MVT::f16 &&
32990 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
32991 assert(Subtarget.hasFP16() && "Expected FP16");
32992 SDValue VecOp = N->getOperand(0);
32993 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
32994 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
32995 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
32996 N->getOperand(1));
32997 Split = DAG.getBitcast(MVT::f16, Split);
32998 Results.push_back(Split);
32999 return;
33004 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
33005 switch ((X86ISD::NodeType)Opcode) {
33006 case X86ISD::FIRST_NUMBER: break;
33007 #define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
33008 NODE_NAME_CASE(BSF)
33009 NODE_NAME_CASE(BSR)
33010 NODE_NAME_CASE(FSHL)
33011 NODE_NAME_CASE(FSHR)
33012 NODE_NAME_CASE(FAND)
33013 NODE_NAME_CASE(FANDN)
33014 NODE_NAME_CASE(FOR)
33015 NODE_NAME_CASE(FXOR)
33016 NODE_NAME_CASE(FILD)
33017 NODE_NAME_CASE(FIST)
33018 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
33019 NODE_NAME_CASE(FLD)
33020 NODE_NAME_CASE(FST)
33021 NODE_NAME_CASE(CALL)
33022 NODE_NAME_CASE(CALL_RVMARKER)
33023 NODE_NAME_CASE(BT)
33024 NODE_NAME_CASE(CMP)
33025 NODE_NAME_CASE(FCMP)
33026 NODE_NAME_CASE(STRICT_FCMP)
33027 NODE_NAME_CASE(STRICT_FCMPS)
33028 NODE_NAME_CASE(COMI)
33029 NODE_NAME_CASE(UCOMI)
33030 NODE_NAME_CASE(CMPM)
33031 NODE_NAME_CASE(CMPMM)
33032 NODE_NAME_CASE(STRICT_CMPM)
33033 NODE_NAME_CASE(CMPMM_SAE)
33034 NODE_NAME_CASE(SETCC)
33035 NODE_NAME_CASE(SETCC_CARRY)
33036 NODE_NAME_CASE(FSETCC)
33037 NODE_NAME_CASE(FSETCCM)
33038 NODE_NAME_CASE(FSETCCM_SAE)
33039 NODE_NAME_CASE(CMOV)
33040 NODE_NAME_CASE(BRCOND)
33041 NODE_NAME_CASE(RET_GLUE)
33042 NODE_NAME_CASE(IRET)
33043 NODE_NAME_CASE(REP_STOS)
33044 NODE_NAME_CASE(REP_MOVS)
33045 NODE_NAME_CASE(GlobalBaseReg)
33046 NODE_NAME_CASE(Wrapper)
33047 NODE_NAME_CASE(WrapperRIP)
33048 NODE_NAME_CASE(MOVQ2DQ)
33049 NODE_NAME_CASE(MOVDQ2Q)
33050 NODE_NAME_CASE(MMX_MOVD2W)
33051 NODE_NAME_CASE(MMX_MOVW2D)
33052 NODE_NAME_CASE(PEXTRB)
33053 NODE_NAME_CASE(PEXTRW)
33054 NODE_NAME_CASE(INSERTPS)
33055 NODE_NAME_CASE(PINSRB)
33056 NODE_NAME_CASE(PINSRW)
33057 NODE_NAME_CASE(PSHUFB)
33058 NODE_NAME_CASE(ANDNP)
33059 NODE_NAME_CASE(BLENDI)
33060 NODE_NAME_CASE(BLENDV)
33061 NODE_NAME_CASE(HADD)
33062 NODE_NAME_CASE(HSUB)
33063 NODE_NAME_CASE(FHADD)
33064 NODE_NAME_CASE(FHSUB)
33065 NODE_NAME_CASE(CONFLICT)
33066 NODE_NAME_CASE(FMAX)
33067 NODE_NAME_CASE(FMAXS)
33068 NODE_NAME_CASE(FMAX_SAE)
33069 NODE_NAME_CASE(FMAXS_SAE)
33070 NODE_NAME_CASE(FMIN)
33071 NODE_NAME_CASE(FMINS)
33072 NODE_NAME_CASE(FMIN_SAE)
33073 NODE_NAME_CASE(FMINS_SAE)
33074 NODE_NAME_CASE(FMAXC)
33075 NODE_NAME_CASE(FMINC)
33076 NODE_NAME_CASE(FRSQRT)
33077 NODE_NAME_CASE(FRCP)
33078 NODE_NAME_CASE(EXTRQI)
33079 NODE_NAME_CASE(INSERTQI)
33080 NODE_NAME_CASE(TLSADDR)
33081 NODE_NAME_CASE(TLSBASEADDR)
33082 NODE_NAME_CASE(TLSCALL)
33083 NODE_NAME_CASE(EH_SJLJ_SETJMP)
33084 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
33085 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
33086 NODE_NAME_CASE(EH_RETURN)
33087 NODE_NAME_CASE(TC_RETURN)
33088 NODE_NAME_CASE(FNSTCW16m)
33089 NODE_NAME_CASE(FLDCW16m)
33090 NODE_NAME_CASE(FNSTENVm)
33091 NODE_NAME_CASE(FLDENVm)
33092 NODE_NAME_CASE(LCMPXCHG_DAG)
33093 NODE_NAME_CASE(LCMPXCHG8_DAG)
33094 NODE_NAME_CASE(LCMPXCHG16_DAG)
33095 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
33096 NODE_NAME_CASE(LADD)
33097 NODE_NAME_CASE(LSUB)
33098 NODE_NAME_CASE(LOR)
33099 NODE_NAME_CASE(LXOR)
33100 NODE_NAME_CASE(LAND)
33101 NODE_NAME_CASE(LBTS)
33102 NODE_NAME_CASE(LBTC)
33103 NODE_NAME_CASE(LBTR)
33104 NODE_NAME_CASE(LBTS_RM)
33105 NODE_NAME_CASE(LBTC_RM)
33106 NODE_NAME_CASE(LBTR_RM)
33107 NODE_NAME_CASE(AADD)
33108 NODE_NAME_CASE(AOR)
33109 NODE_NAME_CASE(AXOR)
33110 NODE_NAME_CASE(AAND)
33111 NODE_NAME_CASE(VZEXT_MOVL)
33112 NODE_NAME_CASE(VZEXT_LOAD)
33113 NODE_NAME_CASE(VEXTRACT_STORE)
33114 NODE_NAME_CASE(VTRUNC)
33115 NODE_NAME_CASE(VTRUNCS)
33116 NODE_NAME_CASE(VTRUNCUS)
33117 NODE_NAME_CASE(VMTRUNC)
33118 NODE_NAME_CASE(VMTRUNCS)
33119 NODE_NAME_CASE(VMTRUNCUS)
33120 NODE_NAME_CASE(VTRUNCSTORES)
33121 NODE_NAME_CASE(VTRUNCSTOREUS)
33122 NODE_NAME_CASE(VMTRUNCSTORES)
33123 NODE_NAME_CASE(VMTRUNCSTOREUS)
33124 NODE_NAME_CASE(VFPEXT)
33125 NODE_NAME_CASE(STRICT_VFPEXT)
33126 NODE_NAME_CASE(VFPEXT_SAE)
33127 NODE_NAME_CASE(VFPEXTS)
33128 NODE_NAME_CASE(VFPEXTS_SAE)
33129 NODE_NAME_CASE(VFPROUND)
33130 NODE_NAME_CASE(STRICT_VFPROUND)
33131 NODE_NAME_CASE(VMFPROUND)
33132 NODE_NAME_CASE(VFPROUND_RND)
33133 NODE_NAME_CASE(VFPROUNDS)
33134 NODE_NAME_CASE(VFPROUNDS_RND)
33135 NODE_NAME_CASE(VSHLDQ)
33136 NODE_NAME_CASE(VSRLDQ)
33137 NODE_NAME_CASE(VSHL)
33138 NODE_NAME_CASE(VSRL)
33139 NODE_NAME_CASE(VSRA)
33140 NODE_NAME_CASE(VSHLI)
33141 NODE_NAME_CASE(VSRLI)
33142 NODE_NAME_CASE(VSRAI)
33143 NODE_NAME_CASE(VSHLV)
33144 NODE_NAME_CASE(VSRLV)
33145 NODE_NAME_CASE(VSRAV)
33146 NODE_NAME_CASE(VROTLI)
33147 NODE_NAME_CASE(VROTRI)
33148 NODE_NAME_CASE(VPPERM)
33149 NODE_NAME_CASE(CMPP)
33150 NODE_NAME_CASE(STRICT_CMPP)
33151 NODE_NAME_CASE(PCMPEQ)
33152 NODE_NAME_CASE(PCMPGT)
33153 NODE_NAME_CASE(PHMINPOS)
33154 NODE_NAME_CASE(ADD)
33155 NODE_NAME_CASE(SUB)
33156 NODE_NAME_CASE(ADC)
33157 NODE_NAME_CASE(SBB)
33158 NODE_NAME_CASE(SMUL)
33159 NODE_NAME_CASE(UMUL)
33160 NODE_NAME_CASE(OR)
33161 NODE_NAME_CASE(XOR)
33162 NODE_NAME_CASE(AND)
33163 NODE_NAME_CASE(BEXTR)
33164 NODE_NAME_CASE(BEXTRI)
33165 NODE_NAME_CASE(BZHI)
33166 NODE_NAME_CASE(PDEP)
33167 NODE_NAME_CASE(PEXT)
33168 NODE_NAME_CASE(MUL_IMM)
33169 NODE_NAME_CASE(MOVMSK)
33170 NODE_NAME_CASE(PTEST)
33171 NODE_NAME_CASE(TESTP)
33172 NODE_NAME_CASE(KORTEST)
33173 NODE_NAME_CASE(KTEST)
33174 NODE_NAME_CASE(KADD)
33175 NODE_NAME_CASE(KSHIFTL)
33176 NODE_NAME_CASE(KSHIFTR)
33177 NODE_NAME_CASE(PACKSS)
33178 NODE_NAME_CASE(PACKUS)
33179 NODE_NAME_CASE(PALIGNR)
33180 NODE_NAME_CASE(VALIGN)
33181 NODE_NAME_CASE(VSHLD)
33182 NODE_NAME_CASE(VSHRD)
33183 NODE_NAME_CASE(VSHLDV)
33184 NODE_NAME_CASE(VSHRDV)
33185 NODE_NAME_CASE(PSHUFD)
33186 NODE_NAME_CASE(PSHUFHW)
33187 NODE_NAME_CASE(PSHUFLW)
33188 NODE_NAME_CASE(SHUFP)
33189 NODE_NAME_CASE(SHUF128)
33190 NODE_NAME_CASE(MOVLHPS)
33191 NODE_NAME_CASE(MOVHLPS)
33192 NODE_NAME_CASE(MOVDDUP)
33193 NODE_NAME_CASE(MOVSHDUP)
33194 NODE_NAME_CASE(MOVSLDUP)
33195 NODE_NAME_CASE(MOVSD)
33196 NODE_NAME_CASE(MOVSS)
33197 NODE_NAME_CASE(MOVSH)
33198 NODE_NAME_CASE(UNPCKL)
33199 NODE_NAME_CASE(UNPCKH)
33200 NODE_NAME_CASE(VBROADCAST)
33201 NODE_NAME_CASE(VBROADCAST_LOAD)
33202 NODE_NAME_CASE(VBROADCASTM)
33203 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
33204 NODE_NAME_CASE(VPERMILPV)
33205 NODE_NAME_CASE(VPERMILPI)
33206 NODE_NAME_CASE(VPERM2X128)
33207 NODE_NAME_CASE(VPERMV)
33208 NODE_NAME_CASE(VPERMV3)
33209 NODE_NAME_CASE(VPERMI)
33210 NODE_NAME_CASE(VPTERNLOG)
33211 NODE_NAME_CASE(VFIXUPIMM)
33212 NODE_NAME_CASE(VFIXUPIMM_SAE)
33213 NODE_NAME_CASE(VFIXUPIMMS)
33214 NODE_NAME_CASE(VFIXUPIMMS_SAE)
33215 NODE_NAME_CASE(VRANGE)
33216 NODE_NAME_CASE(VRANGE_SAE)
33217 NODE_NAME_CASE(VRANGES)
33218 NODE_NAME_CASE(VRANGES_SAE)
33219 NODE_NAME_CASE(PMULUDQ)
33220 NODE_NAME_CASE(PMULDQ)
33221 NODE_NAME_CASE(PSADBW)
33222 NODE_NAME_CASE(DBPSADBW)
33223 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
33224 NODE_NAME_CASE(VAARG_64)
33225 NODE_NAME_CASE(VAARG_X32)
33226 NODE_NAME_CASE(DYN_ALLOCA)
33227 NODE_NAME_CASE(MFENCE)
33228 NODE_NAME_CASE(SEG_ALLOCA)
33229 NODE_NAME_CASE(PROBED_ALLOCA)
33230 NODE_NAME_CASE(RDRAND)
33231 NODE_NAME_CASE(RDSEED)
33232 NODE_NAME_CASE(RDPKRU)
33233 NODE_NAME_CASE(WRPKRU)
33234 NODE_NAME_CASE(VPMADDUBSW)
33235 NODE_NAME_CASE(VPMADDWD)
33236 NODE_NAME_CASE(VPSHA)
33237 NODE_NAME_CASE(VPSHL)
33238 NODE_NAME_CASE(VPCOM)
33239 NODE_NAME_CASE(VPCOMU)
33240 NODE_NAME_CASE(VPERMIL2)
33241 NODE_NAME_CASE(FMSUB)
33242 NODE_NAME_CASE(STRICT_FMSUB)
33243 NODE_NAME_CASE(FNMADD)
33244 NODE_NAME_CASE(STRICT_FNMADD)
33245 NODE_NAME_CASE(FNMSUB)
33246 NODE_NAME_CASE(STRICT_FNMSUB)
33247 NODE_NAME_CASE(FMADDSUB)
33248 NODE_NAME_CASE(FMSUBADD)
33249 NODE_NAME_CASE(FMADD_RND)
33250 NODE_NAME_CASE(FNMADD_RND)
33251 NODE_NAME_CASE(FMSUB_RND)
33252 NODE_NAME_CASE(FNMSUB_RND)
33253 NODE_NAME_CASE(FMADDSUB_RND)
33254 NODE_NAME_CASE(FMSUBADD_RND)
33255 NODE_NAME_CASE(VFMADDC)
33256 NODE_NAME_CASE(VFMADDC_RND)
33257 NODE_NAME_CASE(VFCMADDC)
33258 NODE_NAME_CASE(VFCMADDC_RND)
33259 NODE_NAME_CASE(VFMULC)
33260 NODE_NAME_CASE(VFMULC_RND)
33261 NODE_NAME_CASE(VFCMULC)
33262 NODE_NAME_CASE(VFCMULC_RND)
33263 NODE_NAME_CASE(VFMULCSH)
33264 NODE_NAME_CASE(VFMULCSH_RND)
33265 NODE_NAME_CASE(VFCMULCSH)
33266 NODE_NAME_CASE(VFCMULCSH_RND)
33267 NODE_NAME_CASE(VFMADDCSH)
33268 NODE_NAME_CASE(VFMADDCSH_RND)
33269 NODE_NAME_CASE(VFCMADDCSH)
33270 NODE_NAME_CASE(VFCMADDCSH_RND)
33271 NODE_NAME_CASE(VPMADD52H)
33272 NODE_NAME_CASE(VPMADD52L)
33273 NODE_NAME_CASE(VRNDSCALE)
33274 NODE_NAME_CASE(STRICT_VRNDSCALE)
33275 NODE_NAME_CASE(VRNDSCALE_SAE)
33276 NODE_NAME_CASE(VRNDSCALES)
33277 NODE_NAME_CASE(VRNDSCALES_SAE)
33278 NODE_NAME_CASE(VREDUCE)
33279 NODE_NAME_CASE(VREDUCE_SAE)
33280 NODE_NAME_CASE(VREDUCES)
33281 NODE_NAME_CASE(VREDUCES_SAE)
33282 NODE_NAME_CASE(VGETMANT)
33283 NODE_NAME_CASE(VGETMANT_SAE)
33284 NODE_NAME_CASE(VGETMANTS)
33285 NODE_NAME_CASE(VGETMANTS_SAE)
33286 NODE_NAME_CASE(PCMPESTR)
33287 NODE_NAME_CASE(PCMPISTR)
33288 NODE_NAME_CASE(XTEST)
33289 NODE_NAME_CASE(COMPRESS)
33290 NODE_NAME_CASE(EXPAND)
33291 NODE_NAME_CASE(SELECTS)
33292 NODE_NAME_CASE(ADDSUB)
33293 NODE_NAME_CASE(RCP14)
33294 NODE_NAME_CASE(RCP14S)
33295 NODE_NAME_CASE(RCP28)
33296 NODE_NAME_CASE(RCP28_SAE)
33297 NODE_NAME_CASE(RCP28S)
33298 NODE_NAME_CASE(RCP28S_SAE)
33299 NODE_NAME_CASE(EXP2)
33300 NODE_NAME_CASE(EXP2_SAE)
33301 NODE_NAME_CASE(RSQRT14)
33302 NODE_NAME_CASE(RSQRT14S)
33303 NODE_NAME_CASE(RSQRT28)
33304 NODE_NAME_CASE(RSQRT28_SAE)
33305 NODE_NAME_CASE(RSQRT28S)
33306 NODE_NAME_CASE(RSQRT28S_SAE)
33307 NODE_NAME_CASE(FADD_RND)
33308 NODE_NAME_CASE(FADDS)
33309 NODE_NAME_CASE(FADDS_RND)
33310 NODE_NAME_CASE(FSUB_RND)
33311 NODE_NAME_CASE(FSUBS)
33312 NODE_NAME_CASE(FSUBS_RND)
33313 NODE_NAME_CASE(FMUL_RND)
33314 NODE_NAME_CASE(FMULS)
33315 NODE_NAME_CASE(FMULS_RND)
33316 NODE_NAME_CASE(FDIV_RND)
33317 NODE_NAME_CASE(FDIVS)
33318 NODE_NAME_CASE(FDIVS_RND)
33319 NODE_NAME_CASE(FSQRT_RND)
33320 NODE_NAME_CASE(FSQRTS)
33321 NODE_NAME_CASE(FSQRTS_RND)
33322 NODE_NAME_CASE(FGETEXP)
33323 NODE_NAME_CASE(FGETEXP_SAE)
33324 NODE_NAME_CASE(FGETEXPS)
33325 NODE_NAME_CASE(FGETEXPS_SAE)
33326 NODE_NAME_CASE(SCALEF)
33327 NODE_NAME_CASE(SCALEF_RND)
33328 NODE_NAME_CASE(SCALEFS)
33329 NODE_NAME_CASE(SCALEFS_RND)
33330 NODE_NAME_CASE(MULHRS)
33331 NODE_NAME_CASE(SINT_TO_FP_RND)
33332 NODE_NAME_CASE(UINT_TO_FP_RND)
33333 NODE_NAME_CASE(CVTTP2SI)
33334 NODE_NAME_CASE(CVTTP2UI)
33335 NODE_NAME_CASE(STRICT_CVTTP2SI)
33336 NODE_NAME_CASE(STRICT_CVTTP2UI)
33337 NODE_NAME_CASE(MCVTTP2SI)
33338 NODE_NAME_CASE(MCVTTP2UI)
33339 NODE_NAME_CASE(CVTTP2SI_SAE)
33340 NODE_NAME_CASE(CVTTP2UI_SAE)
33341 NODE_NAME_CASE(CVTTS2SI)
33342 NODE_NAME_CASE(CVTTS2UI)
33343 NODE_NAME_CASE(CVTTS2SI_SAE)
33344 NODE_NAME_CASE(CVTTS2UI_SAE)
33345 NODE_NAME_CASE(CVTSI2P)
33346 NODE_NAME_CASE(CVTUI2P)
33347 NODE_NAME_CASE(STRICT_CVTSI2P)
33348 NODE_NAME_CASE(STRICT_CVTUI2P)
33349 NODE_NAME_CASE(MCVTSI2P)
33350 NODE_NAME_CASE(MCVTUI2P)
33351 NODE_NAME_CASE(VFPCLASS)
33352 NODE_NAME_CASE(VFPCLASSS)
33353 NODE_NAME_CASE(MULTISHIFT)
33354 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
33355 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
33356 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
33357 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
33358 NODE_NAME_CASE(CVTPS2PH)
33359 NODE_NAME_CASE(STRICT_CVTPS2PH)
33360 NODE_NAME_CASE(CVTPS2PH_SAE)
33361 NODE_NAME_CASE(MCVTPS2PH)
33362 NODE_NAME_CASE(MCVTPS2PH_SAE)
33363 NODE_NAME_CASE(CVTPH2PS)
33364 NODE_NAME_CASE(STRICT_CVTPH2PS)
33365 NODE_NAME_CASE(CVTPH2PS_SAE)
33366 NODE_NAME_CASE(CVTP2SI)
33367 NODE_NAME_CASE(CVTP2UI)
33368 NODE_NAME_CASE(MCVTP2SI)
33369 NODE_NAME_CASE(MCVTP2UI)
33370 NODE_NAME_CASE(CVTP2SI_RND)
33371 NODE_NAME_CASE(CVTP2UI_RND)
33372 NODE_NAME_CASE(CVTS2SI)
33373 NODE_NAME_CASE(CVTS2UI)
33374 NODE_NAME_CASE(CVTS2SI_RND)
33375 NODE_NAME_CASE(CVTS2UI_RND)
33376 NODE_NAME_CASE(CVTNE2PS2BF16)
33377 NODE_NAME_CASE(CVTNEPS2BF16)
33378 NODE_NAME_CASE(MCVTNEPS2BF16)
33379 NODE_NAME_CASE(DPBF16PS)
33380 NODE_NAME_CASE(LWPINS)
33381 NODE_NAME_CASE(MGATHER)
33382 NODE_NAME_CASE(MSCATTER)
33383 NODE_NAME_CASE(VPDPBUSD)
33384 NODE_NAME_CASE(VPDPBUSDS)
33385 NODE_NAME_CASE(VPDPWSSD)
33386 NODE_NAME_CASE(VPDPWSSDS)
33387 NODE_NAME_CASE(VPSHUFBITQMB)
33388 NODE_NAME_CASE(GF2P8MULB)
33389 NODE_NAME_CASE(GF2P8AFFINEQB)
33390 NODE_NAME_CASE(GF2P8AFFINEINVQB)
33391 NODE_NAME_CASE(NT_CALL)
33392 NODE_NAME_CASE(NT_BRIND)
33393 NODE_NAME_CASE(UMWAIT)
33394 NODE_NAME_CASE(TPAUSE)
33395 NODE_NAME_CASE(ENQCMD)
33396 NODE_NAME_CASE(ENQCMDS)
33397 NODE_NAME_CASE(VP2INTERSECT)
33398 NODE_NAME_CASE(VPDPBSUD)
33399 NODE_NAME_CASE(VPDPBSUDS)
33400 NODE_NAME_CASE(VPDPBUUD)
33401 NODE_NAME_CASE(VPDPBUUDS)
33402 NODE_NAME_CASE(VPDPBSSD)
33403 NODE_NAME_CASE(VPDPBSSDS)
33404 NODE_NAME_CASE(AESENC128KL)
33405 NODE_NAME_CASE(AESDEC128KL)
33406 NODE_NAME_CASE(AESENC256KL)
33407 NODE_NAME_CASE(AESDEC256KL)
33408 NODE_NAME_CASE(AESENCWIDE128KL)
33409 NODE_NAME_CASE(AESDECWIDE128KL)
33410 NODE_NAME_CASE(AESENCWIDE256KL)
33411 NODE_NAME_CASE(AESDECWIDE256KL)
33412 NODE_NAME_CASE(CMPCCXADD)
33413 NODE_NAME_CASE(TESTUI)
33414 NODE_NAME_CASE(FP80_ADD)
33415 NODE_NAME_CASE(STRICT_FP80_ADD)
33417 return nullptr;
33418 #undef NODE_NAME_CASE
33421 /// Return true if the addressing mode represented by AM is legal for this
33422 /// target, for a load/store of the specified type.
33423 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
33424 const AddrMode &AM, Type *Ty,
33425 unsigned AS,
33426 Instruction *I) const {
33427 // X86 supports extremely general addressing modes.
33428 CodeModel::Model M = getTargetMachine().getCodeModel();
33430 // X86 allows a sign-extended 32-bit immediate field as a displacement.
33431 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
33432 return false;
33434 if (AM.BaseGV) {
33435 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
33437 // If a reference to this global requires an extra load, we can't fold it.
33438 if (isGlobalStubReference(GVFlags))
33439 return false;
33441 // If BaseGV requires a register for the PIC base, we cannot also have a
33442 // BaseReg specified.
33443 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
33444 return false;
33446 // If lower 4G is not available, then we must use rip-relative addressing.
33447 if ((M != CodeModel::Small || isPositionIndependent()) &&
33448 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
33449 return false;
33452 switch (AM.Scale) {
33453 case 0:
33454 case 1:
33455 case 2:
33456 case 4:
33457 case 8:
33458 // These scales always work.
33459 break;
33460 case 3:
33461 case 5:
33462 case 9:
33463 // These scales are formed with basereg+scalereg. Only accept if there is
33464 // no basereg yet.
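  // (For example, a scale of 3 is materialized as index + index*2, so the
  //  index register also occupies the base-register slot of the address.)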
33465 if (AM.HasBaseReg)
33466 return false;
33467 break;
33468 default: // Other stuff never works.
33469 return false;
33472 return true;
33475 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
33476 unsigned Bits = Ty->getScalarSizeInBits();
33478 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
33479 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
33480 if (Subtarget.hasXOP() &&
33481 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
33482 return false;
33484 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
33485 // shifts just as cheap as scalar ones.
33486 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
33487 return false;
33489 // AVX512BW has shifts such as vpsllvw.
33490 if (Subtarget.hasBWI() && Bits == 16)
33491 return false;
33493 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
33494 // fully general vector.
33495 return true;
33498 bool X86TargetLowering::isBinOp(unsigned Opcode) const {
33499 switch (Opcode) {
33500 // These are non-commutative binops.
33501 // TODO: Add more X86ISD opcodes once we have test coverage.
33502 case X86ISD::ANDNP:
33503 case X86ISD::PCMPGT:
33504 case X86ISD::FMAX:
33505 case X86ISD::FMIN:
33506 case X86ISD::FANDN:
33507 case X86ISD::VPSHA:
33508 case X86ISD::VPSHL:
33509 case X86ISD::VSHLV:
33510 case X86ISD::VSRLV:
33511 case X86ISD::VSRAV:
33512 return true;
33515 return TargetLoweringBase::isBinOp(Opcode);
33518 bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
33519 switch (Opcode) {
33520 // TODO: Add more X86ISD opcodes once we have test coverage.
33521 case X86ISD::PCMPEQ:
33522 case X86ISD::PMULDQ:
33523 case X86ISD::PMULUDQ:
33524 case X86ISD::FMAXC:
33525 case X86ISD::FMINC:
33526 case X86ISD::FAND:
33527 case X86ISD::FOR:
33528 case X86ISD::FXOR:
33529 return true;
33532 return TargetLoweringBase::isCommutativeBinOp(Opcode);
33535 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
33536 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33537 return false;
33538 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
33539 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
33540 return NumBits1 > NumBits2;
33543 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
33544 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33545 return false;
33547 if (!isTypeLegal(EVT::getEVT(Ty1)))
33548 return false;
33550 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
33552 // Assuming the caller doesn't have a zeroext or signext return parameter,
33553 // truncation all the way down to i1 is valid.
33554 return true;
33557 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
33558 return isInt<32>(Imm);
33561 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
33562 // Can also use sub to handle negated immediates.
33563 return isInt<32>(Imm);
33566 bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
33567 return isInt<32>(Imm);
33570 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
33571 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
33572 return false;
33573 unsigned NumBits1 = VT1.getSizeInBits();
33574 unsigned NumBits2 = VT2.getSizeInBits();
33575 return NumBits1 > NumBits2;
33578 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
33579 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
33580 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
33583 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
33584 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
33585 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
33588 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
33589 EVT VT1 = Val.getValueType();
33590 if (isZExtFree(VT1, VT2))
33591 return true;
33593 if (Val.getOpcode() != ISD::LOAD)
33594 return false;
33596 if (!VT1.isSimple() || !VT1.isInteger() ||
33597 !VT2.isSimple() || !VT2.isInteger())
33598 return false;
33600 switch (VT1.getSimpleVT().SimpleTy) {
33601 default: break;
33602 case MVT::i8:
33603 case MVT::i16:
33604 case MVT::i32:
33605 // X86 has 8, 16, and 32-bit zero-extending loads.
33606 return true;
33609 return false;
33612 bool X86TargetLowering::shouldSinkOperands(Instruction *I,
33613 SmallVectorImpl<Use *> &Ops) const {
33614 using namespace llvm::PatternMatch;
33616 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
33617 if (!VTy)
33618 return false;
33620 if (I->getOpcode() == Instruction::Mul &&
33621 VTy->getElementType()->isIntegerTy(64)) {
33622 for (auto &Op : I->operands()) {
33623 // Make sure we are not already sinking this operand
33624 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
33625 continue;
33627 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
33628 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
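      // Illustrative shape of the operands being matched (vector splat
      // constants written as scalars):
      //   sext_inreg: ashr (shl X, 32), 32
      //   zext_inreg: and X, 0xffffffff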
33629 if (Subtarget.hasSSE41() &&
33630 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
33631 m_SpecificInt(32)))) {
33632 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
33633 Ops.push_back(&Op);
33634 } else if (Subtarget.hasSSE2() &&
33635 match(Op.get(),
33636 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
33637 Ops.push_back(&Op);
33641 return !Ops.empty();
33644 // A uniform shift amount in a vector shift or funnel shift may be much
33645 // cheaper than a generic variable vector shift, so make that pattern visible
33646 // to SDAG by sinking the shuffle instruction next to the shift.
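  // For example (illustrative IR), given
  //   %amt = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> zeroinitializer
  //   %r   = shl <4 x i32> %x, %amt
  // sinking the shuffle next to the shift lets ISel select a single
  // shift-by-scalar (e.g. PSLLD) instead of a general variable vector shift.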
33647 int ShiftAmountOpNum = -1;
33648 if (I->isShift())
33649 ShiftAmountOpNum = 1;
33650 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
33651 if (II->getIntrinsicID() == Intrinsic::fshl ||
33652 II->getIntrinsicID() == Intrinsic::fshr)
33653 ShiftAmountOpNum = 2;
33656 if (ShiftAmountOpNum == -1)
33657 return false;
33659 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
33660 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
33661 isVectorShiftByScalarCheap(I->getType())) {
33662 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
33663 return true;
33666 return false;
33669 bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
33670 if (!Subtarget.is64Bit())
33671 return false;
33672 return TargetLowering::shouldConvertPhiType(From, To);
33675 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
33676 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
33677 return false;
33679 EVT SrcVT = ExtVal.getOperand(0).getValueType();
33681 // There is no extending load for vXi1.
33682 if (SrcVT.getScalarType() == MVT::i1)
33683 return false;
33685 return true;
33688 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
33689 EVT VT) const {
33690 if (!Subtarget.hasAnyFMA())
33691 return false;
33693 VT = VT.getScalarType();
33695 if (!VT.isSimple())
33696 return false;
33698 switch (VT.getSimpleVT().SimpleTy) {
33699 case MVT::f16:
33700 return Subtarget.hasFP16();
33701 case MVT::f32:
33702 case MVT::f64:
33703 return true;
33704 default:
33705 break;
33708 return false;
33711 bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
33712 // i16 instructions are longer (0x66 prefix) and potentially slower.
33713 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
33716 bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
33717 EVT VT) const {
33718 // TODO: This is too general. There are cases where pre-AVX512 codegen would
33719 // benefit. The transform may also be profitable for scalar code.
33720 if (!Subtarget.hasAVX512())
33721 return false;
33722 if (!Subtarget.hasVLX() && !VT.is512BitVector())
33723 return false;
33724 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
33725 return false;
33727 return true;
33730 /// Targets can use this to indicate that they only support *some*
33731 /// VECTOR_SHUFFLE operations, those with specific masks.
33732 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
33733 /// are assumed to be legal.
33734 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
33735 if (!VT.isSimple())
33736 return false;
33738 // Not for i1 vectors
33739 if (VT.getSimpleVT().getScalarType() == MVT::i1)
33740 return false;
33742 // Very little shuffling can be done for 64-bit vectors right now.
33743 if (VT.getSimpleVT().getSizeInBits() == 64)
33744 return false;
33746 // We only care that the types being shuffled are legal. The lowering can
33747 // handle any possible shuffle mask that results.
33748 return isTypeLegal(VT.getSimpleVT());
33751 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
33752 EVT VT) const {
33753 // Don't convert an 'and' into a shuffle that we don't directly support.
33754 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
33755 if (!Subtarget.hasAVX2())
33756 if (VT == MVT::v32i8 || VT == MVT::v16i16)
33757 return false;
33759 // Just delegate to the generic legality, clear masks aren't special.
33760 return isShuffleMaskLegal(Mask, VT);
33763 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
33764 // If the subtarget is using thunks, we must not generate jump tables.
33765 if (Subtarget.useIndirectThunkBranches())
33766 return false;
33768 // Otherwise, fallback on the generic logic.
33769 return TargetLowering::areJTsAllowed(Fn);
33772 MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
33773 EVT ConditionVT) const {
33774 // Avoid 8- and 16-bit types because they increase the chance of unnecessary
33775 // zero-extensions.
33776 if (ConditionVT.getSizeInBits() < 32)
33777 return MVT::i32;
33778 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
33779 ConditionVT);
33782 //===----------------------------------------------------------------------===//
33783 // X86 Scheduler Hooks
33784 //===----------------------------------------------------------------------===//
33786 // Returns true if EFLAGS is consumed after this iterator in the rest of the
33787 // basic block or any successors of the basic block.
33788 static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
33789 MachineBasicBlock *BB) {
33790 // Scan forward through BB for a use/def of EFLAGS.
33791 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
33792 if (mi.readsRegister(X86::EFLAGS))
33793 return true;
33794 // If we found a def, we can stop searching.
33795 if (mi.definesRegister(X86::EFLAGS))
33796 return false;
33799 // If we hit the end of the block, check whether EFLAGS is live into a
33800 // successor.
33801 for (MachineBasicBlock *Succ : BB->successors())
33802 if (Succ->isLiveIn(X86::EFLAGS))
33803 return true;
33805 return false;
33808 /// Utility function to emit xbegin specifying the start of an RTM region.
33809 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
33810 const TargetInstrInfo *TII) {
33811 const MIMetadata MIMD(MI);
33813 const BasicBlock *BB = MBB->getBasicBlock();
33814 MachineFunction::iterator I = ++MBB->getIterator();
33816 // For the v = xbegin(), we generate
33818 // thisMBB:
33819 // xbegin sinkMBB
33821 // mainMBB:
33822 // s0 = -1
33824 // fallBB:
33825 // eax = # XABORT_DEF
33826 // s1 = eax
33828 // sinkMBB:
33829 // v = phi(s0/mainBB, s1/fallBB)
33831 MachineBasicBlock *thisMBB = MBB;
33832 MachineFunction *MF = MBB->getParent();
33833 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
33834 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
33835 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
33836 MF->insert(I, mainMBB);
33837 MF->insert(I, fallMBB);
33838 MF->insert(I, sinkMBB);
33840 if (isEFLAGSLiveAfter(MI, MBB)) {
33841 mainMBB->addLiveIn(X86::EFLAGS);
33842 fallMBB->addLiveIn(X86::EFLAGS);
33843 sinkMBB->addLiveIn(X86::EFLAGS);
33846 // Transfer the remainder of BB and its successor edges to sinkMBB.
33847 sinkMBB->splice(sinkMBB->begin(), MBB,
33848 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33849 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
33851 MachineRegisterInfo &MRI = MF->getRegInfo();
33852 Register DstReg = MI.getOperand(0).getReg();
33853 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
33854 Register mainDstReg = MRI.createVirtualRegister(RC);
33855 Register fallDstReg = MRI.createVirtualRegister(RC);
33857 // thisMBB:
33858 // xbegin fallMBB
33859 // # fallthrough to mainMBB
33860 // # abort goes to fallMBB
33861 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
33862 thisMBB->addSuccessor(mainMBB);
33863 thisMBB->addSuccessor(fallMBB);
33865 // mainMBB:
33866 // mainDstReg := -1
33867 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
33868 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
33869 mainMBB->addSuccessor(sinkMBB);
33871 // fallMBB:
33872 // ; pseudo instruction to model hardware's definition from XABORT
33873 // EAX := XABORT_DEF
33874 // fallDstReg := EAX
33875 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
33876 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
33877 .addReg(X86::EAX);
33878 fallMBB->addSuccessor(sinkMBB);
33880 // sinkMBB:
33881 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
33882 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
33883 .addReg(mainDstReg).addMBB(mainMBB)
33884 .addReg(fallDstReg).addMBB(fallMBB);
33886 MI.eraseFromParent();
33887 return sinkMBB;
33890 MachineBasicBlock *
33891 X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
33892 MachineBasicBlock *MBB) const {
33893 // Emit va_arg instruction on X86-64.
33895 // Operands to this pseudo-instruction:
33896 // 0 ) Output : destination address (reg)
33897 // 1-5) Input : va_list address (addr, i64mem)
33898 // 6 ) ArgSize : Size (in bytes) of vararg type
33899 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
33900 // 8 ) Align : Alignment of type
33901 // 9 ) EFLAGS (implicit-def)
33903 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
33904 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
33906 Register DestReg = MI.getOperand(0).getReg();
33907 MachineOperand &Base = MI.getOperand(1);
33908 MachineOperand &Scale = MI.getOperand(2);
33909 MachineOperand &Index = MI.getOperand(3);
33910 MachineOperand &Disp = MI.getOperand(4);
33911 MachineOperand &Segment = MI.getOperand(5);
33912 unsigned ArgSize = MI.getOperand(6).getImm();
33913 unsigned ArgMode = MI.getOperand(7).getImm();
33914 Align Alignment = Align(MI.getOperand(8).getImm());
33916 MachineFunction *MF = MBB->getParent();
33918 // Memory Reference
33919 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
33921 MachineMemOperand *OldMMO = MI.memoperands().front();
33923 // Clone the MMO into two separate MMOs for loading and storing
33924 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
33925 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
33926 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
33927 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
33929 // Machine Information
33930 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33931 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
33932 const TargetRegisterClass *AddrRegClass =
33933 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
33934 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
33935 const MIMetadata MIMD(MI);
33937 // struct va_list {
33938 // i32 gp_offset
33939 // i32 fp_offset
33940 // i64 overflow_area (address)
33941 // i64 reg_save_area (address)
33942 // }
33943 // sizeof(va_list) = 24
33944 // alignment(va_list) = 8
33946 unsigned TotalNumIntRegs = 6;
33947 unsigned TotalNumXMMRegs = 8;
33948 bool UseGPOffset = (ArgMode == 1);
33949 bool UseFPOffset = (ArgMode == 2);
33950 unsigned MaxOffset = TotalNumIntRegs * 8 +
33951 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
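  // (Under the SysV x86-64 calling convention this is 6*8 = 48 for gp_offset
  //  and 48 + 8*16 = 176 for fp_offset.)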
33953 /* Align ArgSize to a multiple of 8 */
33954 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
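  // (e.g. an ArgSize of 12 is rounded up to ArgSizeA8 = 16)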
33955 bool NeedsAlign = (Alignment > 8);
33957 MachineBasicBlock *thisMBB = MBB;
33958 MachineBasicBlock *overflowMBB;
33959 MachineBasicBlock *offsetMBB;
33960 MachineBasicBlock *endMBB;
33962 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
33963 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
33964 unsigned OffsetReg = 0;
33966 if (!UseGPOffset && !UseFPOffset) {
33967 // If we only pull from the overflow region, we don't create a branch.
33968 // We don't need to alter control flow.
33969 OffsetDestReg = 0; // unused
33970 OverflowDestReg = DestReg;
33972 offsetMBB = nullptr;
33973 overflowMBB = thisMBB;
33974 endMBB = thisMBB;
33975 } else {
33976 // First emit code to check if gp_offset (or fp_offset) is below the bound.
33977 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
33978 // If not, pull from overflow_area. (branch to overflowMBB)
33980 // thisMBB
33981 // | .
33982 // | .
33983 // offsetMBB overflowMBB
33984 // | .
33985 // | .
33986 // endMBB
33988 // Registers for the PHI in endMBB
33989 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
33990 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
33992 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
33993 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33994 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33995 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33997 MachineFunction::iterator MBBIter = ++MBB->getIterator();
33999 // Insert the new basic blocks
34000 MF->insert(MBBIter, offsetMBB);
34001 MF->insert(MBBIter, overflowMBB);
34002 MF->insert(MBBIter, endMBB);
34004 // Transfer the remainder of MBB and its successor edges to endMBB.
34005 endMBB->splice(endMBB->begin(), thisMBB,
34006 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
34007 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
34009 // Make offsetMBB and overflowMBB successors of thisMBB
34010 thisMBB->addSuccessor(offsetMBB);
34011 thisMBB->addSuccessor(overflowMBB);
34013 // endMBB is a successor of both offsetMBB and overflowMBB
34014 offsetMBB->addSuccessor(endMBB);
34015 overflowMBB->addSuccessor(endMBB);
34017 // Load the offset value into a register
34018 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34019 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
34020 .add(Base)
34021 .add(Scale)
34022 .add(Index)
34023 .addDisp(Disp, UseFPOffset ? 4 : 0)
34024 .add(Segment)
34025 .setMemRefs(LoadOnlyMMO);
34027 // Check if there is enough room left to pull this argument.
34028 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
34029 .addReg(OffsetReg)
34030 .addImm(MaxOffset + 8 - ArgSizeA8);
34032 // Branch to "overflowMBB" if offset >= max
34033 // Fall through to "offsetMBB" otherwise
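    // (Since OffsetReg, ArgSizeA8 and MaxOffset are all multiples of 8, this
    //  takes the overflow path exactly when OffsetReg + ArgSizeA8 would
    //  exceed MaxOffset, i.e. when the argument no longer fits in the
    //  register save area.)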
34034 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
34035 .addMBB(overflowMBB).addImm(X86::COND_AE);
34038 // In offsetMBB, emit code to use the reg_save_area.
34039 if (offsetMBB) {
34040 assert(OffsetReg != 0);
34042 // Read the reg_save_area address.
34043 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
34044 BuildMI(
34045 offsetMBB, MIMD,
34046 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34047 RegSaveReg)
34048 .add(Base)
34049 .add(Scale)
34050 .add(Index)
34051 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
34052 .add(Segment)
34053 .setMemRefs(LoadOnlyMMO);
34055 if (Subtarget.isTarget64BitLP64()) {
34056 // Zero-extend the offset
34057 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
34058 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
34059 .addImm(0)
34060 .addReg(OffsetReg)
34061 .addImm(X86::sub_32bit);
34063 // Add the offset to the reg_save_area to get the final address.
34064 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
34065 .addReg(OffsetReg64)
34066 .addReg(RegSaveReg);
34067 } else {
34068 // Add the offset to the reg_save_area to get the final address.
34069 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
34070 .addReg(OffsetReg)
34071 .addReg(RegSaveReg);
34074 // Compute the offset for the next argument
34075 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34076 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
34077 .addReg(OffsetReg)
34078 .addImm(UseFPOffset ? 16 : 8);
34080 // Store it back into the va_list.
34081 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
34082 .add(Base)
34083 .add(Scale)
34084 .add(Index)
34085 .addDisp(Disp, UseFPOffset ? 4 : 0)
34086 .add(Segment)
34087 .addReg(NextOffsetReg)
34088 .setMemRefs(StoreOnlyMMO);
34090 // Jump to endMBB
34091 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
34092 .addMBB(endMBB);
34096 // Emit code to use overflow area
34099 // Load the overflow_area address into a register.
34100 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
34101 BuildMI(overflowMBB, MIMD,
34102 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34103 OverflowAddrReg)
34104 .add(Base)
34105 .add(Scale)
34106 .add(Index)
34107 .addDisp(Disp, 8)
34108 .add(Segment)
34109 .setMemRefs(LoadOnlyMMO);
34111 // If we need to align it, do so. Otherwise, just copy the address
34112 // to OverflowDestReg.
34113 if (NeedsAlign) {
34114 // Align the overflow address
34115 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
34117 // aligned_addr = (addr + (align-1)) & ~(align-1)
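    // (e.g. addr = 0x1007, align = 16: 0x1007 + 15 = 0x1016, masked to 0x1010)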
34118 BuildMI(
34119 overflowMBB, MIMD,
34120 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34121 TmpReg)
34122 .addReg(OverflowAddrReg)
34123 .addImm(Alignment.value() - 1);
34125 BuildMI(
34126 overflowMBB, MIMD,
34127 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
34128 OverflowDestReg)
34129 .addReg(TmpReg)
34130 .addImm(~(uint64_t)(Alignment.value() - 1));
34131 } else {
34132 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
34133 .addReg(OverflowAddrReg);
34136 // Compute the next overflow address after this argument.
34137 // (the overflow address should be kept 8-byte aligned)
34138 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
34139 BuildMI(
34140 overflowMBB, MIMD,
34141 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34142 NextAddrReg)
34143 .addReg(OverflowDestReg)
34144 .addImm(ArgSizeA8);
34146 // Store the new overflow address.
34147 BuildMI(overflowMBB, MIMD,
34148 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
34149 .add(Base)
34150 .add(Scale)
34151 .add(Index)
34152 .addDisp(Disp, 8)
34153 .add(Segment)
34154 .addReg(NextAddrReg)
34155 .setMemRefs(StoreOnlyMMO);
34157 // If we branched, emit the PHI to the front of endMBB.
34158 if (offsetMBB) {
34159 BuildMI(*endMBB, endMBB->begin(), MIMD,
34160 TII->get(X86::PHI), DestReg)
34161 .addReg(OffsetDestReg).addMBB(offsetMBB)
34162 .addReg(OverflowDestReg).addMBB(overflowMBB);
34165 // Erase the pseudo instruction
34166 MI.eraseFromParent();
34168 return endMBB;
34171 // The EFLAGS operand of SelectItr might be missing a kill marker
34172 // because there were multiple uses of EFLAGS, and ISel didn't know
34173 // which to mark. Figure out whether SelectItr should have had a
34174 // kill marker, and set it if it should. Returns the correct kill
34175 // marker value.
34176 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
34177 MachineBasicBlock* BB,
34178 const TargetRegisterInfo* TRI) {
34179 if (isEFLAGSLiveAfter(SelectItr, BB))
34180 return false;
34182 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
34183 // out. SelectMI should have a kill flag on EFLAGS.
34184 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
34185 return true;
34188 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
34189 // together with other CMOV pseudo-opcodes into a single basic-block with
34190 // a conditional jump around it.
34191 static bool isCMOVPseudo(MachineInstr &MI) {
34192 switch (MI.getOpcode()) {
34193 case X86::CMOV_FR16:
34194 case X86::CMOV_FR16X:
34195 case X86::CMOV_FR32:
34196 case X86::CMOV_FR32X:
34197 case X86::CMOV_FR64:
34198 case X86::CMOV_FR64X:
34199 case X86::CMOV_GR8:
34200 case X86::CMOV_GR16:
34201 case X86::CMOV_GR32:
34202 case X86::CMOV_RFP32:
34203 case X86::CMOV_RFP64:
34204 case X86::CMOV_RFP80:
34205 case X86::CMOV_VR64:
34206 case X86::CMOV_VR128:
34207 case X86::CMOV_VR128X:
34208 case X86::CMOV_VR256:
34209 case X86::CMOV_VR256X:
34210 case X86::CMOV_VR512:
34211 case X86::CMOV_VK1:
34212 case X86::CMOV_VK2:
34213 case X86::CMOV_VK4:
34214 case X86::CMOV_VK8:
34215 case X86::CMOV_VK16:
34216 case X86::CMOV_VK32:
34217 case X86::CMOV_VK64:
34218 return true;
34220 default:
34221 return false;
34225 // Helper function, which inserts PHI functions into SinkMBB:
34226 // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
34227 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
34228 // in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
34229 // the last PHI function inserted.
34230 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
34231 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
34232 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
34233 MachineBasicBlock *SinkMBB) {
34234 MachineFunction *MF = TrueMBB->getParent();
34235 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
34236 const MIMetadata MIMD(*MIItBegin);
34238 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
34239 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34241 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
34243 // As we are creating the PHIs, we have to be careful if there is more than
34244 // one. Later CMOVs may reference the results of earlier CMOVs, but later
34245 // PHIs have to reference the individual true/false inputs from earlier PHIs.
34246 // That also means that PHI construction must work forward from earlier to
34247 // later, and that the code must maintain a mapping from earlier PHI's
34248 // destination registers, and the registers that went into the PHI.
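  // For example, if a later CMOV consumes the result of an earlier CMOV, its
  // PHI cannot use that result directly; on the FalseMBB edge it must use the
  // value the earlier CMOV receives from FalseMBB, and on the TrueMBB edge
  // the value it receives from TrueMBB, which is exactly the pair recorded in
  // RegRewriteTable below.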
34249 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
34250 MachineInstrBuilder MIB;
34252 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
34253 Register DestReg = MIIt->getOperand(0).getReg();
34254 Register Op1Reg = MIIt->getOperand(1).getReg();
34255 Register Op2Reg = MIIt->getOperand(2).getReg();
34257 // If this CMOV we are generating is the opposite condition from
34258 // the jump we generated, then we have to swap the operands for the
34259 // PHI that is going to be generated.
34260 if (MIIt->getOperand(3).getImm() == OppCC)
34261 std::swap(Op1Reg, Op2Reg);
34263 if (RegRewriteTable.contains(Op1Reg))
34264 Op1Reg = RegRewriteTable[Op1Reg].first;
34266 if (RegRewriteTable.contains(Op2Reg))
34267 Op2Reg = RegRewriteTable[Op2Reg].second;
34269 MIB =
34270 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
34271 .addReg(Op1Reg)
34272 .addMBB(FalseMBB)
34273 .addReg(Op2Reg)
34274 .addMBB(TrueMBB);
34276 // Add this PHI to the rewrite table.
34277 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
34280 return MIB;
34283 // Lower cascaded selects of the form (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
34284 MachineBasicBlock *
34285 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
34286 MachineInstr &SecondCascadedCMOV,
34287 MachineBasicBlock *ThisMBB) const {
34288 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34289 const MIMetadata MIMD(FirstCMOV);
34291 // We lower cascaded CMOVs such as
34293 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
34295 // to two successive branches.
34297 // Without this, we would add a PHI between the two jumps, which ends up
34298 // creating a few copies all around. For instance, for
34300 // (sitofp (zext (fcmp une)))
34302 // we would generate:
34304 // ucomiss %xmm1, %xmm0
34305 // movss <1.0f>, %xmm0
34306 // movaps %xmm0, %xmm1
34307 // jne .LBB5_2
34308 // xorps %xmm1, %xmm1
34309 // .LBB5_2:
34310 // jp .LBB5_4
34311 // movaps %xmm1, %xmm0
34312 // .LBB5_4:
34313 // retq
34315 // because this custom-inserter would have generated:
34317 // A
34318 // | \
34319 // | B
34320 // | /
34321 // C
34322 // | \
34323 // | D
34324 // | /
34325 // E
34327 // A: X = ...; Y = ...
34328 // B: empty
34329 // C: Z = PHI [X, A], [Y, B]
34330 // D: empty
34331 // E: PHI [X, C], [Z, D]
34333 // If we lower both CMOVs in a single step, we can instead generate:
34335 // A
34336 // | \
34337 // | C
34338 // | /|
34339 // |/ |
34340 // | |
34341 // | D
34342 // | /
34343 // E
34345 // A: X = ...; Y = ...
34346 // D: empty
34347 // E: PHI [X, A], [X, C], [Y, D]
34349 // Which, in our sitofp/fcmp example, gives us something like:
34351 // ucomiss %xmm1, %xmm0
34352 // movss <1.0f>, %xmm0
34353 // jne .LBB5_4
34354 // jp .LBB5_4
34355 // xorps %xmm0, %xmm0
34356 // .LBB5_4:
34357 // retq
34360 // We lower cascaded CMOV into two successive branches to the same block.
34361 // EFLAGS is used by both, so mark it as live in the second.
34362 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34363 MachineFunction *F = ThisMBB->getParent();
34364 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34365 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34366 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34368 MachineFunction::iterator It = ++ThisMBB->getIterator();
34369 F->insert(It, FirstInsertedMBB);
34370 F->insert(It, SecondInsertedMBB);
34371 F->insert(It, SinkMBB);
34373 // For a cascaded CMOV, we lower it to two successive branches to
34374 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
34375 // the FirstInsertedMBB.
34376 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
34378 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34379 // live into the sink and copy blocks.
34380 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34381 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
34382 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
34383 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
34384 SinkMBB->addLiveIn(X86::EFLAGS);
34387 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34388 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
34389 std::next(MachineBasicBlock::iterator(FirstCMOV)),
34390 ThisMBB->end());
34391 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34393 // Fallthrough block for ThisMBB.
34394 ThisMBB->addSuccessor(FirstInsertedMBB);
34395 // The true block target of the first branch is always SinkMBB.
34396 ThisMBB->addSuccessor(SinkMBB);
34397 // Fallthrough block for FirstInsertedMBB.
34398 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
34399 // The true block for the branch of FirstInsertedMBB.
34400 FirstInsertedMBB->addSuccessor(SinkMBB);
34402 // SecondInsertedMBB falls through to SinkMBB.
34402 SecondInsertedMBB->addSuccessor(SinkMBB);
34404 // Create the conditional branch instructions.
34405 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
34406 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
34408 X86::CondCode SecondCC =
34409 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
34410 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
34411 .addMBB(SinkMBB)
34412 .addImm(SecondCC);
34414 // SinkMBB:
34415 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
34416 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
34417 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
34418 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
34419 MachineInstrBuilder MIB =
34420 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
34421 .addReg(Op1Reg)
34422 .addMBB(SecondInsertedMBB)
34423 .addReg(Op2Reg)
34424 .addMBB(ThisMBB);
34426 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
34427 // (the True operand of the SELECT_CC/CMOV nodes).
34428 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
34430 // Now remove the CMOVs.
34431 FirstCMOV.eraseFromParent();
34432 SecondCascadedCMOV.eraseFromParent();
34434 return SinkMBB;
34437 MachineBasicBlock *
34438 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
34439 MachineBasicBlock *ThisMBB) const {
34440 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34441 const MIMetadata MIMD(MI);
34443 // To "insert" a SELECT_CC instruction, we actually have to insert the
34444 // diamond control-flow pattern. The incoming instruction knows the
34445 // destination vreg to set, the condition code register to branch on, the
34446 // true/false values to select between and a branch opcode to use.
34448 // ThisMBB:
34449 // ...
34450 // TrueVal = ...
34451 // cmpTY ccX, r1, r2
34452 // bCC copy1MBB
34453 // fallthrough --> FalseMBB
34455 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
34456 // as described above, by inserting a BB, and then making a PHI at the join
34457 // point to select the true and false operands of the CMOV in the PHI.
34459 // The code also handles two different cases of multiple CMOV opcodes
34460 // in a row.
34462 // Case 1:
34463 // In this case, there are multiple CMOVs in a row, all which are based on
34464 // the same condition setting (or the exact opposite condition setting).
34465 // In this case we can lower all the CMOVs using a single inserted BB, and
34466 // then make a number of PHIs at the join point to model the CMOVs. The only
34467 // trickiness here is that in a case like:
34469 // t2 = CMOV cond1 t1, f1
34470 // t3 = CMOV cond1 t2, f2
34472 // when rewriting this into PHIs, we have to perform some renaming on the
34473 // temps since you cannot have a PHI operand refer to a PHI result earlier
34474 // in the same block. The "simple" but wrong lowering would be:
34476 // t2 = PHI t1(BB1), f1(BB2)
34477 // t3 = PHI t2(BB1), f2(BB2)
34479 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
34480 // renaming is to note that on the path through BB1, t2 is really just a
34481 // copy of t1, and do that renaming, properly generating:
34483 // t2 = PHI t1(BB1), f1(BB2)
34484 // t3 = PHI t1(BB1), f2(BB2)
34486 // Case 2:
34487 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
34488 // function - EmitLoweredCascadedSelect.
34490 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
34491 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34492 MachineInstr *LastCMOV = &MI;
34493 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
34495 // Check for case 1, where there are multiple CMOVs with the same condition
34496 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
34497 // number of jumps the most.
34499 if (isCMOVPseudo(MI)) {
34500 // See if we have a string of CMOVS with the same condition. Skip over
34501 // intervening debug insts.
34502 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
34503 (NextMIIt->getOperand(3).getImm() == CC ||
34504 NextMIIt->getOperand(3).getImm() == OppCC)) {
34505 LastCMOV = &*NextMIIt;
34506 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
34510 // Check for case 2, but only if we didn't already find case 1, as indicated
34511 // by LastCMOV still pointing at MI.
34512 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
34513 NextMIIt->getOpcode() == MI.getOpcode() &&
34514 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
34515 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
34516 NextMIIt->getOperand(1).isKill()) {
34517 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
34520 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34521 MachineFunction *F = ThisMBB->getParent();
34522 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
34523 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34525 MachineFunction::iterator It = ++ThisMBB->getIterator();
34526 F->insert(It, FalseMBB);
34527 F->insert(It, SinkMBB);
34529 // Set the call frame size on entry to the new basic blocks.
34530 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
34531 FalseMBB->setCallFrameSize(CallFrameSize);
34532 SinkMBB->setCallFrameSize(CallFrameSize);
34534 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34535 // live into the sink and copy blocks.
34536 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34537 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
34538 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
34539 FalseMBB->addLiveIn(X86::EFLAGS);
34540 SinkMBB->addLiveIn(X86::EFLAGS);
34543 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
34544 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
34545 MachineBasicBlock::iterator(LastCMOV));
34546 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
34547 if (MI.isDebugInstr())
34548 SinkMBB->push_back(MI.removeFromParent());
34550 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34551 SinkMBB->splice(SinkMBB->end(), ThisMBB,
34552 std::next(MachineBasicBlock::iterator(LastCMOV)),
34553 ThisMBB->end());
34554 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34556 // Fallthrough block for ThisMBB.
34557 ThisMBB->addSuccessor(FalseMBB);
34558 // The true block target of the first (or only) branch is always SinkMBB.
34559 ThisMBB->addSuccessor(SinkMBB);
34560 // Fallthrough block for FalseMBB.
34561 FalseMBB->addSuccessor(SinkMBB);
34563 // Create the conditional branch instruction.
34564 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
34566 // SinkMBB:
34567 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
34568 // ...
34569 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
34570 MachineBasicBlock::iterator MIItEnd =
34571 std::next(MachineBasicBlock::iterator(LastCMOV));
34572 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
34574 // Now remove the CMOV(s).
34575 ThisMBB->erase(MIItBegin, MIItEnd);
34577 return SinkMBB;
34580 static unsigned getSUBriOpcode(bool IsLP64) {
34581 if (IsLP64)
34582 return X86::SUB64ri32;
34583 else
34584 return X86::SUB32ri;
34587 MachineBasicBlock *
34588 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
34589 MachineBasicBlock *MBB) const {
34590 MachineFunction *MF = MBB->getParent();
34591 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34592 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
34593 const MIMetadata MIMD(MI);
34594 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
34596 const unsigned ProbeSize = getStackProbeSize(*MF);
34598 MachineRegisterInfo &MRI = MF->getRegInfo();
34599 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34600 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34601 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34603 MachineFunction::iterator MBBIter = ++MBB->getIterator();
34604 MF->insert(MBBIter, testMBB);
34605 MF->insert(MBBIter, blockMBB);
34606 MF->insert(MBBIter, tailMBB);
34608 Register sizeVReg = MI.getOperand(1).getReg();
34610 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
34612 Register TmpStackPtr = MRI.createVirtualRegister(
34613 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
34614 Register FinalStackPtr = MRI.createVirtualRegister(
34615 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
34617 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
34618 .addReg(physSPReg);
34620 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
34621 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
34622 .addReg(TmpStackPtr)
34623 .addReg(sizeVReg);
34626 // test rsp size
34628 BuildMI(testMBB, MIMD,
34629 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
34630 .addReg(FinalStackPtr)
34631 .addReg(physSPReg);
34633 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
34634 .addMBB(tailMBB)
34635 .addImm(X86::COND_GE);
34636 testMBB->addSuccessor(blockMBB);
34637 testMBB->addSuccessor(tailMBB);
34639 // Touch the block then extend it. This is the opposite order from a static
34640 // probe, where we allocate then touch; doing it this way avoids having to
34641 // probe the tail of the static alloca. Possible scenarios are:
34643 // + ---- <- ------------ <- ------------- <- ------------ +
34644 // | |
34645 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
34646 // | |
34647 // + <- ----------- <- ------------ <- ----------- <- ------------ +
34649 // The property we want to enforce is to never have more than [page alloc] between two probes.
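//
// Schematically, the code emitted below behaves roughly like:
//
//   testMBB:
//     if (FinalStackPtr >= SP) goto tailMBB   // allocation finished
//   blockMBB:
//     xor  $0, (SP)                           // touch the current page
//     sub  $ProbeSize, SP                     // extend by one probe interval
//     jmp  testMBB
//   tailMBB:
//     <result of the pseudo> = FinalStackPtr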
34651 const unsigned XORMIOpc =
34652 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
34653 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
34654 .addImm(0);
34656 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
34657 physSPReg)
34658 .addReg(physSPReg)
34659 .addImm(ProbeSize);
34661 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
34662 blockMBB->addSuccessor(testMBB);
34665 // Define the result of the original pseudo to be the expected final stack pointer.
34665 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
34666 MI.getOperand(0).getReg())
34667 .addReg(FinalStackPtr);
34669 tailMBB->splice(tailMBB->end(), MBB,
34670 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34671 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
34672 MBB->addSuccessor(testMBB);
34674 // Delete the original pseudo instruction.
34675 MI.eraseFromParent();
34677 // And we're done.
34678 return tailMBB;
34681 MachineBasicBlock *
34682 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
34683 MachineBasicBlock *BB) const {
34684 MachineFunction *MF = BB->getParent();
34685 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34686 const MIMetadata MIMD(MI);
34687 const BasicBlock *LLVM_BB = BB->getBasicBlock();
34689 assert(MF->shouldSplitStack());
34691 const bool Is64Bit = Subtarget.is64Bit();
34692 const bool IsLP64 = Subtarget.isTarget64BitLP64();
34694 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
34695 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
34697 // BB:
34698 // ... [Till the alloca]
34699 // If stacklet is not large enough, jump to mallocMBB
34701 // bumpMBB:
34702 // Allocate by subtracting from RSP
34703 // Jump to continueMBB
34705 // mallocMBB:
34706 // Allocate by call to runtime
34708 // continueMBB:
34709 // ...
34710 // [rest of original BB]
34713 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34714 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34715 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34717 MachineRegisterInfo &MRI = MF->getRegInfo();
34718 const TargetRegisterClass *AddrRegClass =
34719 getRegClassFor(getPointerTy(MF->getDataLayout()));
34721 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
34722 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
34723 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
34724 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
34725 sizeVReg = MI.getOperand(1).getReg(),
34726 physSPReg =
34727 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
34729 MachineFunction::iterator MBBIter = ++BB->getIterator();
34731 MF->insert(MBBIter, bumpMBB);
34732 MF->insert(MBBIter, mallocMBB);
34733 MF->insert(MBBIter, continueMBB);
34735 continueMBB->splice(continueMBB->begin(), BB,
34736 std::next(MachineBasicBlock::iterator(MI)), BB->end());
34737 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
34739 // Add code to the main basic block to check if the stack limit has been hit,
34740 // and if so, jump to mallocMBB otherwise to bumpMBB.
34741 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
34742 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
34743 .addReg(tmpSPVReg).addReg(sizeVReg);
34744 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
34745 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
34746 .addReg(SPLimitVReg);
34747 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
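// That is, SPLimitVReg = SP - size is the stack pointer we would end up with;
// it is compared against the stacklet limit kept in TLS (%fs:0x70 for LP64,
// %fs:0x40 for x32, %gs:0x30 for 32-bit), and if the limit is greater the
// stacklet is too small, so we take the mallocMBB path instead.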
34749 // bumpMBB simply decreases the stack pointer, since we know the current
34750 // stacklet has enough space.
34751 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
34752 .addReg(SPLimitVReg);
34753 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
34754 .addReg(SPLimitVReg);
34755 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
34757 // Calls into a routine in libgcc to allocate more space from the heap.
34758 const uint32_t *RegMask =
34759 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
34760 if (IsLP64) {
34761 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
34762 .addReg(sizeVReg);
34763 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
34764 .addExternalSymbol("__morestack_allocate_stack_space")
34765 .addRegMask(RegMask)
34766 .addReg(X86::RDI, RegState::Implicit)
34767 .addReg(X86::RAX, RegState::ImplicitDefine);
34768 } else if (Is64Bit) {
34769 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
34770 .addReg(sizeVReg);
34771 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
34772 .addExternalSymbol("__morestack_allocate_stack_space")
34773 .addRegMask(RegMask)
34774 .addReg(X86::EDI, RegState::Implicit)
34775 .addReg(X86::EAX, RegState::ImplicitDefine);
34776 } else {
34777 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
34778 .addImm(12);
34779 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
34780 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
34781 .addExternalSymbol("__morestack_allocate_stack_space")
34782 .addRegMask(RegMask)
34783 .addReg(X86::EAX, RegState::ImplicitDefine);
34786 if (!Is64Bit)
34787 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
34788 .addImm(16);
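// The SUB 12 / PUSH / ADD 16 pattern above keeps the overall adjustment a
// multiple of 16 bytes around the 32-bit call (12 bytes of padding plus the
// 4-byte size argument), and both are popped together by the ADD above.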
34790 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
34791 .addReg(IsLP64 ? X86::RAX : X86::EAX);
34792 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
34794 // Set up the CFG correctly.
34795 BB->addSuccessor(bumpMBB);
34796 BB->addSuccessor(mallocMBB);
34797 mallocMBB->addSuccessor(continueMBB);
34798 bumpMBB->addSuccessor(continueMBB);
34800 // Take care of the PHI nodes.
34801 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
34802 MI.getOperand(0).getReg())
34803 .addReg(mallocPtrVReg)
34804 .addMBB(mallocMBB)
34805 .addReg(bumpSPPtrVReg)
34806 .addMBB(bumpMBB);
34808 // Delete the original pseudo instruction.
34809 MI.eraseFromParent();
34811 // And we're done.
34812 return continueMBB;
34815 MachineBasicBlock *
34816 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
34817 MachineBasicBlock *BB) const {
34818 MachineFunction *MF = BB->getParent();
34819 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
34820 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
34821 const MIMetadata MIMD(MI);
34823 assert(!isAsynchronousEHPersonality(
34824 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
34825 "SEH does not use catchret!");
34827 // Only 32-bit EH needs to worry about manually restoring stack pointers.
34828 if (!Subtarget.is32Bit())
34829 return BB;
34831 // C++ EH creates a new target block to hold the restore code, and wires up
34832 // the new block to the return destination with a normal JMP_4.
34833 MachineBasicBlock *RestoreMBB =
34834 MF->CreateMachineBasicBlock(BB->getBasicBlock());
34835 assert(BB->succ_size() == 1);
34836 MF->insert(std::next(BB->getIterator()), RestoreMBB);
34837 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
34838 BB->addSuccessor(RestoreMBB);
34839 MI.getOperand(0).setMBB(RestoreMBB);
34841 // Marking this as an EH pad but not a funclet entry block causes PEI to
34842 // restore stack pointers in the block.
34843 RestoreMBB->setIsEHPad(true);
34845 auto RestoreMBBI = RestoreMBB->begin();
34846 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
34847 return BB;
34850 MachineBasicBlock *
34851 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
34852 MachineBasicBlock *BB) const {
34853 // So, here we replace TLSADDR with the sequence:
34854 // adjust_stackdown -> TLSADDR -> adjust_stackup.
34855 // We need this because TLSADDR is lowered into calls
34856 // inside MC; therefore, without the two markers, shrink-wrapping
34857 // may push the prologue/epilogue past them.
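// That is, the block ends up containing roughly:
//   ADJCALLSTACKDOWN 0, 0, 0   (CALLSEQ_START)
//   TLS_addr / TLS_base_addr pseudo, expanded into the actual call in MC
//   ADJCALLSTACKUP 0, 0        (CALLSEQ_END)
// The zero operands mean no real SP adjustment is requested; the markers only
// bracket the call for the benefit of shrink-wrapping and frame lowering.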
34858 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
34859 const MIMetadata MIMD(MI);
34860 MachineFunction &MF = *BB->getParent();
34862 // Emit CALLSEQ_START right before the instruction.
34863 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
34864 MachineInstrBuilder CallseqStart =
34865 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
34866 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
34868 // Emit CALLSEQ_END right after the instruction.
34869 // We don't call erase from parent because we want to keep the
34870 // original instruction around.
34871 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
34872 MachineInstrBuilder CallseqEnd =
34873 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
34874 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
34876 return BB;
34879 MachineBasicBlock *
34880 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
34881 MachineBasicBlock *BB) const {
34882 // This is pretty easy. We're taking the value that we received from
34883 // our load from the relocation, sticking it in either RDI (x86-64)
34884 // or EAX and doing an indirect call. The return value will then
34885 // be in the normal return register.
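// Schematically, the 64-bit sequence emitted below is:
//   movq _var@TLVP(%rip), %rdi   ; load the TLV descriptor address
//   callq *(%rdi)                ; call through the descriptor; result in %rax
// The 32-bit variants do the same through EAX, loading the descriptor either
// absolutely or relative to the PIC base register.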
34886 MachineFunction *F = BB->getParent();
34887 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34888 const MIMetadata MIMD(MI);
34890 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
34891 assert(MI.getOperand(3).isGlobal() && "This should be a global");
34893 // Get a register mask for the lowered call.
34894 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
34895 // proper register mask.
34896 const uint32_t *RegMask =
34897 Subtarget.is64Bit() ?
34898 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
34899 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
34900 if (Subtarget.is64Bit()) {
34901 MachineInstrBuilder MIB =
34902 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
34903 .addReg(X86::RIP)
34904 .addImm(0)
34905 .addReg(0)
34906 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34907 MI.getOperand(3).getTargetFlags())
34908 .addReg(0);
34909 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
34910 addDirectMem(MIB, X86::RDI);
34911 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
34912 } else if (!isPositionIndependent()) {
34913 MachineInstrBuilder MIB =
34914 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
34915 .addReg(0)
34916 .addImm(0)
34917 .addReg(0)
34918 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34919 MI.getOperand(3).getTargetFlags())
34920 .addReg(0);
34921 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
34922 addDirectMem(MIB, X86::EAX);
34923 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
34924 } else {
34925 MachineInstrBuilder MIB =
34926 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
34927 .addReg(TII->getGlobalBaseReg(F))
34928 .addImm(0)
34929 .addReg(0)
34930 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
34931 MI.getOperand(3).getTargetFlags())
34932 .addReg(0);
34933 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
34934 addDirectMem(MIB, X86::EAX);
34935 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
34938 MI.eraseFromParent(); // The pseudo instruction is gone now.
34939 return BB;
34942 static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
34943 switch (RPOpc) {
34944 case X86::INDIRECT_THUNK_CALL32:
34945 return X86::CALLpcrel32;
34946 case X86::INDIRECT_THUNK_CALL64:
34947 return X86::CALL64pcrel32;
34948 case X86::INDIRECT_THUNK_TCRETURN32:
34949 return X86::TCRETURNdi;
34950 case X86::INDIRECT_THUNK_TCRETURN64:
34951 return X86::TCRETURNdi64;
34953 llvm_unreachable("not indirect thunk opcode");
34956 static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
34957 unsigned Reg) {
34958 if (Subtarget.useRetpolineExternalThunk()) {
34959 // When using an external thunk for retpolines, we pick names that match the
34960 // names GCC happens to use as well. This helps simplify the implementation
34961 // of the thunks for kernels where they have no easy ability to create
34962 // aliases and are doing non-trivial configuration of the thunk's body. For
34963 // example, the Linux kernel will do boot-time hot patching of the thunk
34964 // bodies and cannot easily export aliases of these to loaded modules.
34966 // Note that at any point in the future, we may need to change the semantics
34967 // of how we implement retpolines and at that time will likely change the
34968 // name of the called thunk. Essentially, there is no hard guarantee that
34969 // LLVM will generate calls to specific thunks; we merely make a best-effort
34970 // attempt to help out kernels and other systems where duplicating the
34971 // thunks is costly.
34972 switch (Reg) {
34973 case X86::EAX:
34974 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34975 return "__x86_indirect_thunk_eax";
34976 case X86::ECX:
34977 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34978 return "__x86_indirect_thunk_ecx";
34979 case X86::EDX:
34980 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34981 return "__x86_indirect_thunk_edx";
34982 case X86::EDI:
34983 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34984 return "__x86_indirect_thunk_edi";
34985 case X86::R11:
34986 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34987 return "__x86_indirect_thunk_r11";
34989 llvm_unreachable("unexpected reg for external indirect thunk");
34992 if (Subtarget.useRetpolineIndirectCalls() ||
34993 Subtarget.useRetpolineIndirectBranches()) {
34994 // When targeting an internal COMDAT thunk use an LLVM-specific name.
34995 switch (Reg) {
34996 case X86::EAX:
34997 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34998 return "__llvm_retpoline_eax";
34999 case X86::ECX:
35000 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35001 return "__llvm_retpoline_ecx";
35002 case X86::EDX:
35003 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35004 return "__llvm_retpoline_edx";
35005 case X86::EDI:
35006 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35007 return "__llvm_retpoline_edi";
35008 case X86::R11:
35009 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35010 return "__llvm_retpoline_r11";
35012 llvm_unreachable("unexpected reg for retpoline");
35015 if (Subtarget.useLVIControlFlowIntegrity()) {
35016 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35017 return "__llvm_lvi_thunk_r11";
35019 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
35022 MachineBasicBlock *
35023 X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
35024 MachineBasicBlock *BB) const {
35025 // Copy the virtual register into the R11 physical register and
35026 // call the retpoline thunk.
35027 const MIMetadata MIMD(MI);
35028 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35029 Register CalleeVReg = MI.getOperand(0).getReg();
35030 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
35032 // Find an available scratch register to hold the callee. On 64-bit, we can
35033 // just use R11, but we scan for uses anyway to ensure we don't generate
35034 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
35035 // already a register use operand to the call to hold the callee. If none
35036 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
35037 // register and ESI is the base pointer to realigned stack frames with VLAs.
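// The net effect is to rewrite, e.g.,
//   INDIRECT_THUNK_CALL64 %callee
// into
//   $r11 = COPY %callee
//   CALL64pcrel32 &__llvm_retpoline_r11, implicit killed $r11
// (or the external/LVI thunk name), so the indirect target is only ever
// reached through the thunk.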
35038 SmallVector<unsigned, 3> AvailableRegs;
35039 if (Subtarget.is64Bit())
35040 AvailableRegs.push_back(X86::R11);
35041 else
35042 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
35044 // Zero out any registers that are already used.
35045 for (const auto &MO : MI.operands()) {
35046 if (MO.isReg() && MO.isUse())
35047 for (unsigned &Reg : AvailableRegs)
35048 if (Reg == MO.getReg())
35049 Reg = 0;
35052 // Choose the first remaining non-zero available register.
35053 unsigned AvailableReg = 0;
35054 for (unsigned MaybeReg : AvailableRegs) {
35055 if (MaybeReg) {
35056 AvailableReg = MaybeReg;
35057 break;
35060 if (!AvailableReg)
35061 report_fatal_error("calling convention incompatible with retpoline, no "
35062 "available registers");
35064 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
35066 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
35067 .addReg(CalleeVReg);
35068 MI.getOperand(0).ChangeToES(Symbol);
35069 MI.setDesc(TII->get(Opc));
35070 MachineInstrBuilder(*BB->getParent(), &MI)
35071 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
35072 return BB;
35075 /// A SetJmp implies a future change of control flow when the corresponding
35076 /// LongJmp is called.
35077 /// Instead of using the 'return' instruction, the long jump fixes the stack and
35078 /// performs an indirect branch. To do so it uses the registers that were stored
35079 /// in the jump buffer (when calling SetJmp).
35080 /// If the shadow stack is enabled, we need to fix it as well, because some
35081 /// return addresses will be skipped.
35082 /// This function saves the SSP so that it can be fixed up later by
35083 /// emitLongJmpShadowStackFix.
35084 /// \sa emitLongJmpShadowStackFix
35085 /// \param [in] MI The temporary Machine Instruction for the builtin.
35086 /// \param [in] MBB The Machine Basic Block that will be modified.
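/// The pointer-sized slots of the buffer, as used by the SjLj lowering here,
/// are: slot 0 - frame pointer, slot 1 - resume address (label), slot 2 -
/// stack pointer, slot 3 - shadow stack pointer (written only when CET's
/// "cf-protection-return" is enabled).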
35087 void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
35088 MachineBasicBlock *MBB) const {
35089 const MIMetadata MIMD(MI);
35090 MachineFunction *MF = MBB->getParent();
35091 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35092 MachineRegisterInfo &MRI = MF->getRegInfo();
35093 MachineInstrBuilder MIB;
35095 // Memory Reference.
35096 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35097 MI.memoperands_end());
35099 // Initialize a register with zero.
35100 MVT PVT = getPointerTy(MF->getDataLayout());
35101 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35102 Register ZReg = MRI.createVirtualRegister(PtrRC);
35103 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
35104 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
35105 .addDef(ZReg)
35106 .addReg(ZReg, RegState::Undef)
35107 .addReg(ZReg, RegState::Undef);
35109 // Read the current SSP Register value to the zeroed register.
35110 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35111 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35112 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35114 // Write the SSP register value to slot 3 (offset 3 * PtrSize) of the input memory buffer.
35115 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35116 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
35117 const int64_t SSPOffset = 3 * PVT.getStoreSize();
35118 const unsigned MemOpndSlot = 1;
35119 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35120 if (i == X86::AddrDisp)
35121 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
35122 else
35123 MIB.add(MI.getOperand(MemOpndSlot + i));
35125 MIB.addReg(SSPCopyReg);
35126 MIB.setMemRefs(MMOs);
35129 MachineBasicBlock *
35130 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
35131 MachineBasicBlock *MBB) const {
35132 const MIMetadata MIMD(MI);
35133 MachineFunction *MF = MBB->getParent();
35134 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35135 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35136 MachineRegisterInfo &MRI = MF->getRegInfo();
35138 const BasicBlock *BB = MBB->getBasicBlock();
35139 MachineFunction::iterator I = ++MBB->getIterator();
35141 // Memory Reference
35142 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35143 MI.memoperands_end());
35145 unsigned DstReg;
35146 unsigned MemOpndSlot = 0;
35148 unsigned CurOp = 0;
35150 DstReg = MI.getOperand(CurOp++).getReg();
35151 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35152 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
35153 (void)TRI;
35154 Register mainDstReg = MRI.createVirtualRegister(RC);
35155 Register restoreDstReg = MRI.createVirtualRegister(RC);
35157 MemOpndSlot = CurOp;
35159 MVT PVT = getPointerTy(MF->getDataLayout());
35160 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35161 "Invalid Pointer Size!");
35163 // For v = setjmp(buf), we generate
35165 // thisMBB:
35166 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
35167 // SjLjSetup restoreMBB
35169 // mainMBB:
35170 // v_main = 0
35172 // sinkMBB:
35173 // v = phi(main, restore)
35175 // restoreMBB:
35176 // if base pointer being used, load it from frame
35177 // v_restore = 1
35179 MachineBasicBlock *thisMBB = MBB;
35180 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35181 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35182 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
35183 MF->insert(I, mainMBB);
35184 MF->insert(I, sinkMBB);
35185 MF->push_back(restoreMBB);
35186 restoreMBB->setMachineBlockAddressTaken();
35188 MachineInstrBuilder MIB;
35190 // Transfer the remainder of BB and its successor edges to sinkMBB.
35191 sinkMBB->splice(sinkMBB->begin(), MBB,
35192 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35193 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35195 // thisMBB:
35196 unsigned PtrStoreOpc = 0;
35197 unsigned LabelReg = 0;
35198 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35199 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35200 !isPositionIndependent();
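// With the small code model and no PIC, the address of restoreMBB fits in a
// 32-bit immediate and can be stored into the buffer directly; otherwise it
// is first materialized into a register with an LEA below.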
35202 // Prepare IP either in reg or imm.
35203 if (!UseImmLabel) {
35204 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35205 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35206 LabelReg = MRI.createVirtualRegister(PtrRC);
35207 if (Subtarget.is64Bit()) {
35208 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
35209 .addReg(X86::RIP)
35210 .addImm(0)
35211 .addReg(0)
35212 .addMBB(restoreMBB)
35213 .addReg(0);
35214 } else {
35215 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
35216 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
35217 .addReg(XII->getGlobalBaseReg(MF))
35218 .addImm(0)
35219 .addReg(0)
35220 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
35221 .addReg(0);
35223 } else
35224 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35225 // Store IP
35226 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
35227 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35228 if (i == X86::AddrDisp)
35229 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
35230 else
35231 MIB.add(MI.getOperand(MemOpndSlot + i));
35233 if (!UseImmLabel)
35234 MIB.addReg(LabelReg);
35235 else
35236 MIB.addMBB(restoreMBB);
35237 MIB.setMemRefs(MMOs);
35239 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35240 emitSetJmpShadowStackFix(MI, thisMBB);
35243 // Setup
35244 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
35245 .addMBB(restoreMBB);
35247 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35248 MIB.addRegMask(RegInfo->getNoPreservedMask());
35249 thisMBB->addSuccessor(mainMBB);
35250 thisMBB->addSuccessor(restoreMBB);
35252 // mainMBB:
35253 // EAX = 0
35254 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
35255 mainMBB->addSuccessor(sinkMBB);
35257 // sinkMBB:
35258 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35259 .addReg(mainDstReg)
35260 .addMBB(mainMBB)
35261 .addReg(restoreDstReg)
35262 .addMBB(restoreMBB);
35264 // restoreMBB:
35265 if (RegInfo->hasBasePointer(*MF)) {
35266 const bool Uses64BitFramePtr =
35267 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35268 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
35269 X86FI->setRestoreBasePointer(MF);
35270 Register FramePtr = RegInfo->getFrameRegister(*MF);
35271 Register BasePtr = RegInfo->getBaseRegister();
35272 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
35273 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
35274 FramePtr, true, X86FI->getRestoreBasePointerOffset())
35275 .setMIFlag(MachineInstr::FrameSetup);
35277 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
35278 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35279 restoreMBB->addSuccessor(sinkMBB);
35281 MI.eraseFromParent();
35282 return sinkMBB;
35285 /// Fix the shadow stack using the previously saved SSP pointer.
35286 /// \sa emitSetJmpShadowStackFix
35287 /// \param [in] MI The temporary Machine Instruction for the builtin.
35288 /// \param [in] MBB The Machine Basic Block that will be modified.
35289 /// \return The sink MBB that will perform the future indirect branch.
35290 MachineBasicBlock *
35291 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
35292 MachineBasicBlock *MBB) const {
35293 const MIMetadata MIMD(MI);
35294 MachineFunction *MF = MBB->getParent();
35295 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35296 MachineRegisterInfo &MRI = MF->getRegInfo();
35298 // Memory Reference
35299 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35300 MI.memoperands_end());
35302 MVT PVT = getPointerTy(MF->getDataLayout());
35303 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35305 // checkSspMBB:
35306 // xor vreg1, vreg1
35307 // rdssp vreg1
35308 // test vreg1, vreg1
35309 // je sinkMBB # Jump if Shadow Stack is not supported
35310 // fallMBB:
35311 // mov buf+24/12(%rip), vreg2
35312 // sub vreg1, vreg2
35313 // jbe sinkMBB # No need to fix the Shadow Stack
35314 // fixShadowMBB:
35315 // shr 3/2, vreg2
35316 // incssp vreg2 # fix the SSP according to the lower 8 bits
35317 // shr 8, vreg2
35318 // je sinkMBB
35319 // fixShadowLoopPrepareMBB:
35320 // shl vreg2
35321 // mov 128, vreg3
35322 // fixShadowLoopMBB:
35323 // incssp vreg3
35324 // dec vreg2
35325 // jne fixShadowLoopMBB # Iterate until you finish fixing
35326 // # the Shadow Stack
35327 // sinkMBB:
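//
// For example, with PVT == i64 and a saved-minus-current delta of 0x1828
// bytes, vreg2 holds 0x1828 >> 3 == 0x305 shadow-stack entries after the
// first shift: the initial incssp advances by 0x305 & 0xff == 5 entries, the
// remaining 0x3 is shifted left into a loop count of 6, and each of the 6
// loop iterations advances by another 128 entries (6 * 128 + 5 == 0x305).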
35329 MachineFunction::iterator I = ++MBB->getIterator();
35330 const BasicBlock *BB = MBB->getBasicBlock();
35332 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
35333 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35334 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
35335 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
35336 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
35337 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35338 MF->insert(I, checkSspMBB);
35339 MF->insert(I, fallMBB);
35340 MF->insert(I, fixShadowMBB);
35341 MF->insert(I, fixShadowLoopPrepareMBB);
35342 MF->insert(I, fixShadowLoopMBB);
35343 MF->insert(I, sinkMBB);
35345 // Transfer the remainder of BB and its successor edges to sinkMBB.
35346 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
35347 MBB->end());
35348 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35350 MBB->addSuccessor(checkSspMBB);
35352 // Initialize a register with zero.
35353 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
35354 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
35356 if (PVT == MVT::i64) {
35357 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
35358 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
35359 .addImm(0)
35360 .addReg(ZReg)
35361 .addImm(X86::sub_32bit);
35362 ZReg = TmpZReg;
35365 // Read the current SSP Register value to the zeroed register.
35366 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35367 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35368 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35370 // Check whether the result of the SSP register is zero and jump directly
35371 // to the sink.
35372 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
35373 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
35374 .addReg(SSPCopyReg)
35375 .addReg(SSPCopyReg);
35376 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
35377 .addMBB(sinkMBB)
35378 .addImm(X86::COND_E);
35379 checkSspMBB->addSuccessor(sinkMBB);
35380 checkSspMBB->addSuccessor(fallMBB);
35382 // Reload the previously saved SSP register value.
35383 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
35384 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35385 const int64_t SPPOffset = 3 * PVT.getStoreSize();
35386 MachineInstrBuilder MIB =
35387 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
35388 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35389 const MachineOperand &MO = MI.getOperand(i);
35390 if (i == X86::AddrDisp)
35391 MIB.addDisp(MO, SPPOffset);
35392 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35393 // preserve kill flags.
35394 MIB.addReg(MO.getReg());
35395 else
35396 MIB.add(MO);
35398 MIB.setMemRefs(MMOs);
35400 // Subtract the current SSP from the previous SSP.
35401 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
35402 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
35403 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
35404 .addReg(PrevSSPReg)
35405 .addReg(SSPCopyReg);
35407 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
35408 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
35409 .addMBB(sinkMBB)
35410 .addImm(X86::COND_BE);
35411 fallMBB->addSuccessor(sinkMBB);
35412 fallMBB->addSuccessor(fixShadowMBB);
35414 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
35415 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
35416 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
35417 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
35418 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
35419 .addReg(SspSubReg)
35420 .addImm(Offset);
35422 // Advance the SSP by the lower 8 bits of the delta (incssp only uses the low 8 bits of its operand).
35423 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
35424 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
35426 // Reset the lower 8 bits.
35427 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
35428 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
35429 .addReg(SspFirstShrReg)
35430 .addImm(8);
35432 // Jump if the result of the shift is zero.
35433 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
35434 .addMBB(sinkMBB)
35435 .addImm(X86::COND_E);
35436 fixShadowMBB->addSuccessor(sinkMBB);
35437 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
35439 // Do a single shift left.
35440 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
35441 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
35442 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
35443 .addReg(SspSecondShrReg)
35444 .addImm(1);
35446 // Save the value 128 to a register (will be used next with incssp).
35447 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
35448 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
35449 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
35450 .addImm(128);
35451 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
35453 // Since incssp only looks at the lower 8 bits, we might need to do several
35454 // iterations of incssp until we finish fixing the shadow stack.
35455 Register DecReg = MRI.createVirtualRegister(PtrRC);
35456 Register CounterReg = MRI.createVirtualRegister(PtrRC);
35457 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
35458 .addReg(SspAfterShlReg)
35459 .addMBB(fixShadowLoopPrepareMBB)
35460 .addReg(DecReg)
35461 .addMBB(fixShadowLoopMBB);
35463 // Every iteration we increase the SSP by 128.
35464 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
35466 // Every iteration we decrement the counter by 1.
35467 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
35468 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
35470 // Jump if the counter is not zero yet.
35471 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
35472 .addMBB(fixShadowLoopMBB)
35473 .addImm(X86::COND_NE);
35474 fixShadowLoopMBB->addSuccessor(sinkMBB);
35475 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
35477 return sinkMBB;
35480 MachineBasicBlock *
35481 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
35482 MachineBasicBlock *MBB) const {
35483 const MIMetadata MIMD(MI);
35484 MachineFunction *MF = MBB->getParent();
35485 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35486 MachineRegisterInfo &MRI = MF->getRegInfo();
35488 // Memory Reference
35489 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35490 MI.memoperands_end());
35492 MVT PVT = getPointerTy(MF->getDataLayout());
35493 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35494 "Invalid Pointer Size!");
35496 const TargetRegisterClass *RC =
35497 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
35498 Register Tmp = MRI.createVirtualRegister(RC);
35499 // Since FP is only updated here but NOT referenced, it's treated as GPR.
35500 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35501 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
35502 Register SP = RegInfo->getStackRegister();
35504 MachineInstrBuilder MIB;
35506 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35507 const int64_t SPOffset = 2 * PVT.getStoreSize();
35509 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35510 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
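// The expansion below is, roughly:
//   (optionally fix the shadow stack first)
//   FP  <- load [buf + 0]           ; reload the frame pointer
//   Tmp <- load [buf + 1*PtrSize]   ; reload the destination IP
//   SP  <- load [buf + 2*PtrSize]   ; reload the stack pointer
//   jmp *Tmp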
35512 MachineBasicBlock *thisMBB = MBB;
35514 // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
35515 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35516 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
35519 // Reload FP
35520 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
35521 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35522 const MachineOperand &MO = MI.getOperand(i);
35523 if (MO.isReg()) // Don't add the whole operand, we don't want to
35524 // preserve kill flags.
35525 MIB.addReg(MO.getReg());
35526 else
35527 MIB.add(MO);
35529 MIB.setMemRefs(MMOs);
35531 // Reload IP
35532 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
35533 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35534 const MachineOperand &MO = MI.getOperand(i);
35535 if (i == X86::AddrDisp)
35536 MIB.addDisp(MO, LabelOffset);
35537 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35538 // preserve kill flags.
35539 MIB.addReg(MO.getReg());
35540 else
35541 MIB.add(MO);
35543 MIB.setMemRefs(MMOs);
35545 // Reload SP
35546 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
35547 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35548 if (i == X86::AddrDisp)
35549 MIB.addDisp(MI.getOperand(i), SPOffset);
35550 else
35551 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
35552 // the last instruction of the expansion.
35554 MIB.setMemRefs(MMOs);
35556 // Jump
35557 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
35559 MI.eraseFromParent();
35560 return thisMBB;
35563 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
35564 MachineBasicBlock *MBB,
35565 MachineBasicBlock *DispatchBB,
35566 int FI) const {
35567 const MIMetadata MIMD(MI);
35568 MachineFunction *MF = MBB->getParent();
35569 MachineRegisterInfo *MRI = &MF->getRegInfo();
35570 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35572 MVT PVT = getPointerTy(MF->getDataLayout());
35573 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
35575 unsigned Op = 0;
35576 unsigned VR = 0;
35578 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35579 !isPositionIndependent();
35581 if (UseImmLabel) {
35582 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35583 } else {
35584 const TargetRegisterClass *TRC =
35585 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
35586 VR = MRI->createVirtualRegister(TRC);
35587 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35589 if (Subtarget.is64Bit())
35590 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
35591 .addReg(X86::RIP)
35592 .addImm(1)
35593 .addReg(0)
35594 .addMBB(DispatchBB)
35595 .addReg(0);
35596 else
35597 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
35598 .addReg(0) /* TII->getGlobalBaseReg(MF) */
35599 .addImm(1)
35600 .addReg(0)
35601 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
35602 .addReg(0);
35605 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
35606 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
35607 if (UseImmLabel)
35608 MIB.addMBB(DispatchBB);
35609 else
35610 MIB.addReg(VR);
35613 MachineBasicBlock *
35614 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
35615 MachineBasicBlock *BB) const {
35616 const MIMetadata MIMD(MI);
35617 MachineFunction *MF = BB->getParent();
35618 MachineRegisterInfo *MRI = &MF->getRegInfo();
35619 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35620 int FI = MF->getFrameInfo().getFunctionContextIndex();
35622 // Get a mapping of the call site numbers to all of the landing pads they're
35623 // associated with.
35624 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
35625 unsigned MaxCSNum = 0;
35626 for (auto &MBB : *MF) {
35627 if (!MBB.isEHPad())
35628 continue;
35630 MCSymbol *Sym = nullptr;
35631 for (const auto &MI : MBB) {
35632 if (MI.isDebugInstr())
35633 continue;
35635 assert(MI.isEHLabel() && "expected EH_LABEL");
35636 Sym = MI.getOperand(0).getMCSymbol();
35637 break;
35640 if (!MF->hasCallSiteLandingPad(Sym))
35641 continue;
35643 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
35644 CallSiteNumToLPad[CSI].push_back(&MBB);
35645 MaxCSNum = std::max(MaxCSNum, CSI);
35649 // Get an ordered list of the machine basic blocks for the jump table.
35650 std::vector<MachineBasicBlock *> LPadList;
35651 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
35652 LPadList.reserve(CallSiteNumToLPad.size());
35654 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
35655 for (auto &LP : CallSiteNumToLPad[CSI]) {
35656 LPadList.push_back(LP);
35657 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
35661 assert(!LPadList.empty() &&
35662 "No landing pad destinations for the dispatch jump table!");
35664 // Create the MBBs for the dispatch code.
35666 // Shove the dispatch's address into the return slot in the function context.
35667 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
35668 DispatchBB->setIsEHPad(true);
35670 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
35671 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
35672 DispatchBB->addSuccessor(TrapBB);
35674 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
35675 DispatchBB->addSuccessor(DispContBB);
35677 // Insert MBBs.
35678 MF->push_back(DispatchBB);
35679 MF->push_back(DispContBB);
35680 MF->push_back(TrapBB);
35682 // Insert code into the entry block that creates and registers the function
35683 // context.
35684 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
35686 // Create the jump table and associated information
35687 unsigned JTE = getJumpTableEncoding();
35688 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
35689 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
35691 const X86RegisterInfo &RI = TII->getRegisterInfo();
35692 // Add a register mask with no preserved registers. This results in all
35693 // registers being marked as clobbered.
35694 if (RI.hasBasePointer(*MF)) {
35695 const bool FPIs64Bit =
35696 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35697 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
35698 MFI->setRestoreBasePointer(MF);
35700 Register FP = RI.getFrameRegister(*MF);
35701 Register BP = RI.getBaseRegister();
35702 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
35703 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
35704 MFI->getRestoreBasePointerOffset())
35705 .addRegMask(RI.getNoPreservedMask());
35706 } else {
35707 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
35708 .addRegMask(RI.getNoPreservedMask());
35711 // IReg is used as an index in a memory operand and therefore can't be SP
35712 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
35713 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
35714 Subtarget.is64Bit() ? 8 : 4);
35715 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
35716 .addReg(IReg)
35717 .addImm(LPadList.size());
35718 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
35719 .addMBB(TrapBB)
35720 .addImm(X86::COND_AE);
35722 if (Subtarget.is64Bit()) {
35723 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
35724 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
35726 // leaq .LJTI0_0(%rip), BReg
35727 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
35728 .addReg(X86::RIP)
35729 .addImm(1)
35730 .addReg(0)
35731 .addJumpTableIndex(MJTI)
35732 .addReg(0);
35733 // movzx IReg64, IReg
35734 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
35735 .addImm(0)
35736 .addReg(IReg)
35737 .addImm(X86::sub_32bit);
35739 switch (JTE) {
35740 case MachineJumpTableInfo::EK_BlockAddress:
35741 // jmpq *(BReg,IReg64,8)
35742 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
35743 .addReg(BReg)
35744 .addImm(8)
35745 .addReg(IReg64)
35746 .addImm(0)
35747 .addReg(0);
35748 break;
35749 case MachineJumpTableInfo::EK_LabelDifference32: {
35750 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
35751 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
35752 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
35754 // movl (BReg,IReg64,4), OReg
35755 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
35756 .addReg(BReg)
35757 .addImm(4)
35758 .addReg(IReg64)
35759 .addImm(0)
35760 .addReg(0);
35761 // movsx OReg64, OReg
35762 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
35763 .addReg(OReg);
35764 // addq BReg, OReg64, TReg
35765 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
35766 .addReg(OReg64)
35767 .addReg(BReg);
35768 // jmpq *TReg
35769 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
35770 break;
35772 default:
35773 llvm_unreachable("Unexpected jump table encoding");
35775 } else {
35776 // jmpl *.LJTI0_0(,IReg,4)
35777 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
35778 .addReg(0)
35779 .addImm(4)
35780 .addReg(IReg)
35781 .addJumpTableIndex(MJTI)
35782 .addReg(0);
35785 // Add the jump table entries as successors to the MBB.
35786 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
35787 for (auto &LP : LPadList)
35788 if (SeenMBBs.insert(LP).second)
35789 DispContBB->addSuccessor(LP);
35791 // N.B. the order the invoke BBs are processed in doesn't matter here.
35792 SmallVector<MachineBasicBlock *, 64> MBBLPads;
35793 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
35794 for (MachineBasicBlock *MBB : InvokeBBs) {
35795 // Remove the landing pad successor from the invoke block and replace it
35796 // with the new dispatch block.
35797 // Keep a copy of Successors since it's modified inside the loop.
35798 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
35799 MBB->succ_rend());
35800 // FIXME: Avoid quadratic complexity.
35801 for (auto *MBBS : Successors) {
35802 if (MBBS->isEHPad()) {
35803 MBB->removeSuccessor(MBBS);
35804 MBBLPads.push_back(MBBS);
35805 }
35806 }
35808 MBB->addSuccessor(DispatchBB);
35810 // Find the invoke call and mark all of the callee-saved registers as
35811 // 'implicitly defined' so that they're spilled. This prevents code from
35812 // moving instructions to before the EH block, where they will never be
35813 // executed.
35814 for (auto &II : reverse(*MBB)) {
35815 if (!II.isCall())
35816 continue;
35818 DenseMap<unsigned, bool> DefRegs;
35819 for (auto &MOp : II.operands())
35820 if (MOp.isReg())
35821 DefRegs[MOp.getReg()] = true;
35823 MachineInstrBuilder MIB(*MF, &II);
35824 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
35825 unsigned Reg = SavedRegs[RegIdx];
35826 if (!DefRegs[Reg])
35827 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
35828 }
35830 break;
35831 }
35832 }
35834 // Mark all former landing pads as non-landing pads. The dispatch is the only
35835 // landing pad now.
35836 for (auto &LP : MBBLPads)
35837 LP->setIsEHPad(false);
35839 // The instruction is gone now.
35840 MI.eraseFromParent();
35841 return BB;
35842 }
35844 MachineBasicBlock *
35845 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
35846 MachineBasicBlock *BB) const {
35847 MachineFunction *MF = BB->getParent();
35848 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35849 const MIMetadata MIMD(MI);
35851 auto TMMImmToTMMReg = [](unsigned Imm) {
35852 assert (Imm < 8 && "Illegal tmm index");
35853 return X86::TMM0 + Imm;
35854 };
35855 switch (MI.getOpcode()) {
35856 default: llvm_unreachable("Unexpected instr type to insert");
35857 case X86::TLS_addr32:
35858 case X86::TLS_addr64:
35859 case X86::TLS_addrX32:
35860 case X86::TLS_base_addr32:
35861 case X86::TLS_base_addr64:
35862 case X86::TLS_base_addrX32:
35863 return EmitLoweredTLSAddr(MI, BB);
35864 case X86::INDIRECT_THUNK_CALL32:
35865 case X86::INDIRECT_THUNK_CALL64:
35866 case X86::INDIRECT_THUNK_TCRETURN32:
35867 case X86::INDIRECT_THUNK_TCRETURN64:
35868 return EmitLoweredIndirectThunk(MI, BB);
35869 case X86::CATCHRET:
35870 return EmitLoweredCatchRet(MI, BB);
35871 case X86::SEG_ALLOCA_32:
35872 case X86::SEG_ALLOCA_64:
35873 return EmitLoweredSegAlloca(MI, BB);
35874 case X86::PROBED_ALLOCA_32:
35875 case X86::PROBED_ALLOCA_64:
35876 return EmitLoweredProbedAlloca(MI, BB);
35877 case X86::TLSCall_32:
35878 case X86::TLSCall_64:
35879 return EmitLoweredTLSCall(MI, BB);
35880 case X86::CMOV_FR16:
35881 case X86::CMOV_FR16X:
35882 case X86::CMOV_FR32:
35883 case X86::CMOV_FR32X:
35884 case X86::CMOV_FR64:
35885 case X86::CMOV_FR64X:
35886 case X86::CMOV_GR8:
35887 case X86::CMOV_GR16:
35888 case X86::CMOV_GR32:
35889 case X86::CMOV_RFP32:
35890 case X86::CMOV_RFP64:
35891 case X86::CMOV_RFP80:
35892 case X86::CMOV_VR64:
35893 case X86::CMOV_VR128:
35894 case X86::CMOV_VR128X:
35895 case X86::CMOV_VR256:
35896 case X86::CMOV_VR256X:
35897 case X86::CMOV_VR512:
35898 case X86::CMOV_VK1:
35899 case X86::CMOV_VK2:
35900 case X86::CMOV_VK4:
35901 case X86::CMOV_VK8:
35902 case X86::CMOV_VK16:
35903 case X86::CMOV_VK32:
35904 case X86::CMOV_VK64:
35905 return EmitLoweredSelect(MI, BB);
35907 case X86::FP80_ADDr:
35908 case X86::FP80_ADDm32: {
35909 // Change the floating point control register to use double extended
35910 // precision when performing the addition.
35911 int OrigCWFrameIdx =
35912 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35913 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
35914 OrigCWFrameIdx);
35916 // Load the old value of the control word...
35917 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35918 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
35919 OrigCWFrameIdx);
35921 // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
35922 // precision.
35923 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35924 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
35925 .addReg(OldCW, RegState::Kill)
35926 .addImm(0x300);
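// Illustrative note (not from the original comment): bits 8-9 of the x87
// control word form the precision-control field (0b00 = 24-bit single,
// 0b10 = 53-bit double, 0b11 = 64-bit extended), so ORing in 0x300 selects
// 64-bit extended precision regardless of the previous setting.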
35928 // Extract to 16 bits.
35929 Register NewCW16 =
35930 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
35931 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
35932 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
35934 // Prepare memory for FLDCW.
35935 int NewCWFrameIdx =
35936 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35937 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
35938 NewCWFrameIdx)
35939 .addReg(NewCW16, RegState::Kill);
35941 // Reload the modified control word now...
35942 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
35943 NewCWFrameIdx);
35945 // Do the addition.
35946 if (MI.getOpcode() == X86::FP80_ADDr) {
35947 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
35948 .add(MI.getOperand(0))
35949 .add(MI.getOperand(1))
35950 .add(MI.getOperand(2));
35951 } else {
35952 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
35953 .add(MI.getOperand(0))
35954 .add(MI.getOperand(1))
35955 .add(MI.getOperand(2))
35956 .add(MI.getOperand(3))
35957 .add(MI.getOperand(4))
35958 .add(MI.getOperand(5))
35959 .add(MI.getOperand(6));
35960 }
35962 // Reload the original control word now.
35963 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
35964 OrigCWFrameIdx);
35966 MI.eraseFromParent(); // The pseudo instruction is gone now.
35967 return BB;
35968 }
35970 case X86::FP32_TO_INT16_IN_MEM:
35971 case X86::FP32_TO_INT32_IN_MEM:
35972 case X86::FP32_TO_INT64_IN_MEM:
35973 case X86::FP64_TO_INT16_IN_MEM:
35974 case X86::FP64_TO_INT32_IN_MEM:
35975 case X86::FP64_TO_INT64_IN_MEM:
35976 case X86::FP80_TO_INT16_IN_MEM:
35977 case X86::FP80_TO_INT32_IN_MEM:
35978 case X86::FP80_TO_INT64_IN_MEM: {
35979 // Change the floating point control register to use "round towards zero"
35980 // mode when truncating to an integer value.
35981 int OrigCWFrameIdx =
35982 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35983 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
35984 OrigCWFrameIdx);
35986 // Load the old value of the control word...
35987 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35988 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
35989 OrigCWFrameIdx);
35991 // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
35992 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
35993 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
35994 .addReg(OldCW, RegState::Kill).addImm(0xC00);
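// Illustrative note (not from the original comment): bits 10-11 of the x87
// control word form the rounding-control field (0b00 = nearest, 0b01 = down,
// 0b10 = up, 0b11 = toward zero), so ORing in 0xC00 selects truncation, which
// matches the C semantics of a float-to-integer cast.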
35996 // Extract to 16 bits.
35997 Register NewCW16 =
35998 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
35999 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36000 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
36002 // Prepare memory for FLDCW.
36003 int NewCWFrameIdx =
36004 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36005 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36006 NewCWFrameIdx)
36007 .addReg(NewCW16, RegState::Kill);
36009 // Reload the modified control word now...
36010 addFrameReference(BuildMI(*BB, MI, MIMD,
36011 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
36013 // Get the X86 opcode to use.
36014 unsigned Opc;
36015 switch (MI.getOpcode()) {
36016 default: llvm_unreachable("illegal opcode!");
36017 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
36018 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
36019 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
36020 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
36021 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
36022 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
36023 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
36024 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
36025 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
36026 }
36028 X86AddressMode AM = getAddressFromInstr(&MI, 0);
36029 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
36030 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
36032 // Reload the original control word now.
36033 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36034 OrigCWFrameIdx);
36036 MI.eraseFromParent(); // The pseudo instruction is gone now.
36037 return BB;
36038 }
36040 // xbegin
36041 case X86::XBEGIN:
36042 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
36044 case X86::VAARG_64:
36045 case X86::VAARG_X32:
36046 return EmitVAARGWithCustomInserter(MI, BB);
36048 case X86::EH_SjLj_SetJmp32:
36049 case X86::EH_SjLj_SetJmp64:
36050 return emitEHSjLjSetJmp(MI, BB);
36052 case X86::EH_SjLj_LongJmp32:
36053 case X86::EH_SjLj_LongJmp64:
36054 return emitEHSjLjLongJmp(MI, BB);
36056 case X86::Int_eh_sjlj_setup_dispatch:
36057 return EmitSjLjDispatchBlock(MI, BB);
36059 case TargetOpcode::STATEPOINT:
36060 // As an implementation detail, STATEPOINT shares the STACKMAP format at
36061 // this point in the process. We diverge later.
36062 return emitPatchPoint(MI, BB);
36064 case TargetOpcode::STACKMAP:
36065 case TargetOpcode::PATCHPOINT:
36066 return emitPatchPoint(MI, BB);
36068 case TargetOpcode::PATCHABLE_EVENT_CALL:
36069 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
36070 return BB;
36072 case X86::LCMPXCHG8B: {
36073 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36074 // In addition to the four E[ABCD] registers implied by the encoding,
36075 // CMPXCHG8B requires a memory operand. If the current architecture happens
36076 // to be i686 and the current function needs a base pointer
36077 // - which is ESI on i686 - the register allocator would not be able to
36078 // allocate registers for an address of the form X(%reg, %reg, Y):
36079 // there would never be enough unreserved registers during regalloc
36080 // (without the need for a base pointer the only option would be X(%edi, %esi, Y)).
36081 // We give the register allocator a hand by precomputing the address in
36082 // a new vreg using LEA.
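// Illustrative example (added for clarity, not part of the original comment):
// an operand of the form X(%reg1, %reg2, Y) needs two free registers besides
// E[ABCD], but with ESI reserved as the base pointer only EDI is left, so
// something like 'leal 12(%esi,%vreg,4), %tmp' followed by 'cmpxchg8b (%tmp)'
// is the only allocatable shape.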
36084 // If it is not i686 or there is no base pointer - nothing to do here.
36085 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
36086 return BB;
36088 // Even though this code does not necessarily need the base pointer to
36089 // be ESI, we check for that. The reason: if this assert fails, some
36090 // changes have happened in the compiler's base pointer handling, which
36091 // most probably have to be addressed somehow here.
36092 assert(TRI->getBaseRegister() == X86::ESI &&
36093 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
36094 "base pointer in mind");
36096 MachineRegisterInfo &MRI = MF->getRegInfo();
36097 MVT SPTy = getPointerTy(MF->getDataLayout());
36098 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
36099 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
36101 X86AddressMode AM = getAddressFromInstr(&MI, 0);
36102 // Regalloc does not need any help when the memory operand of CMPXCHG8B
36103 // does not use an index register.
36104 if (AM.IndexReg == X86::NoRegister)
36105 return BB;
36107 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
36108 // four operand definitions that are E[ABCD] registers. We skip them and
36109 // then insert the LEA.
36110 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
36111 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
36112 RMBBI->definesRegister(X86::EBX) ||
36113 RMBBI->definesRegister(X86::ECX) ||
36114 RMBBI->definesRegister(X86::EDX))) {
36115 ++RMBBI;
36116 }
36117 MachineBasicBlock::iterator MBBI(RMBBI);
36118 addFullAddress(
36119 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
36121 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
36123 return BB;
36124 }
36125 case X86::LCMPXCHG16B_NO_RBX: {
36126 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36127 Register BasePtr = TRI->getBaseRegister();
36128 if (TRI->hasBasePointer(*MF) &&
36129 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
36130 if (!BB->isLiveIn(BasePtr))
36131 BB->addLiveIn(BasePtr);
36132 // Save RBX into a virtual register.
36133 Register SaveRBX =
36134 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36135 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36136 .addReg(X86::RBX);
36137 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36138 MachineInstrBuilder MIB =
36139 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
36140 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36141 MIB.add(MI.getOperand(Idx));
36142 MIB.add(MI.getOperand(X86::AddrNumOperands));
36143 MIB.addReg(SaveRBX);
36144 } else {
36145 // Simple case, just copy the virtual register to RBX.
36146 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
36147 .add(MI.getOperand(X86::AddrNumOperands));
36148 MachineInstrBuilder MIB =
36149 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
36150 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36151 MIB.add(MI.getOperand(Idx));
36152 }
36153 MI.eraseFromParent();
36154 return BB;
36155 }
36156 case X86::MWAITX: {
36157 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36158 Register BasePtr = TRI->getBaseRegister();
36159 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
36160 // If there is no need to save the base pointer, we generate MWAITXrrr;
36161 // otherwise we generate the MWAITX_SAVE_RBX pseudo.
36162 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
36163 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36164 .addReg(MI.getOperand(0).getReg());
36165 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36166 .addReg(MI.getOperand(1).getReg());
36167 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
36168 .addReg(MI.getOperand(2).getReg());
36169 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
36170 MI.eraseFromParent();
36171 } else {
36172 if (!BB->isLiveIn(BasePtr)) {
36173 BB->addLiveIn(BasePtr);
36174 }
36175 // Parameters can be copied into ECX and EAX but not EBX yet.
36176 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36177 .addReg(MI.getOperand(0).getReg());
36178 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36179 .addReg(MI.getOperand(1).getReg());
36180 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
36181 // Save RBX into a virtual register.
36182 Register SaveRBX =
36183 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36184 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36185 .addReg(X86::RBX);
36186 // Generate mwaitx pseudo.
36187 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36188 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
36189 .addDef(Dst) // Destination tied in with SaveRBX.
36190 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
36191 .addUse(SaveRBX); // Save of base pointer.
36192 MI.eraseFromParent();
36193 }
36194 return BB;
36195 }
36196 case TargetOpcode::PREALLOCATED_SETUP: {
36197 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
36198 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36199 MFI->setHasPreallocatedCall(true);
36200 int64_t PreallocatedId = MI.getOperand(0).getImm();
36201 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
36202 assert(StackAdjustment != 0 && "0 stack adjustment");
36203 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
36204 << StackAdjustment << "\n");
36205 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
36206 .addReg(X86::ESP)
36207 .addImm(StackAdjustment);
36208 MI.eraseFromParent();
36209 return BB;
36210 }
36211 case TargetOpcode::PREALLOCATED_ARG: {
36212 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
36213 int64_t PreallocatedId = MI.getOperand(1).getImm();
36214 int64_t ArgIdx = MI.getOperand(2).getImm();
36215 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36216 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
36217 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
36218 << ", arg offset " << ArgOffset << "\n");
36219 // stack pointer + offset
36220 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
36221 MI.getOperand(0).getReg()),
36222 X86::ESP, false, ArgOffset);
36223 MI.eraseFromParent();
36224 return BB;
36225 }
36226 case X86::PTDPBSSD:
36227 case X86::PTDPBSUD:
36228 case X86::PTDPBUSD:
36229 case X86::PTDPBUUD:
36230 case X86::PTDPBF16PS:
36231 case X86::PTDPFP16PS: {
36232 unsigned Opc;
36233 switch (MI.getOpcode()) {
36234 default: llvm_unreachable("illegal opcode!");
36235 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
36236 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
36237 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
36238 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
36239 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
36240 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
36241 }
36243 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36244 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36245 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36246 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36247 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36249 MI.eraseFromParent(); // The pseudo is gone now.
36250 return BB;
36251 }
36252 case X86::PTILEZERO: {
36253 unsigned Imm = MI.getOperand(0).getImm();
36254 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
36255 MI.eraseFromParent(); // The pseudo is gone now.
36256 return BB;
36257 }
36258 case X86::PTILELOADD:
36259 case X86::PTILELOADDT1:
36260 case X86::PTILESTORED: {
36261 unsigned Opc;
36262 switch (MI.getOpcode()) {
36263 default: llvm_unreachable("illegal opcode!");
36264 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
36265 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
36266 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
36267 }
36269 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36270 unsigned CurOp = 0;
36271 if (Opc != X86::TILESTORED)
36272 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36273 RegState::Define);
36275 MIB.add(MI.getOperand(CurOp++)); // base
36276 MIB.add(MI.getOperand(CurOp++)); // scale
36277 MIB.add(MI.getOperand(CurOp++)); // index -- stride
36278 MIB.add(MI.getOperand(CurOp++)); // displacement
36279 MIB.add(MI.getOperand(CurOp++)); // segment
36281 if (Opc == X86::TILESTORED)
36282 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36283 RegState::Undef);
36285 MI.eraseFromParent(); // The pseudo is gone now.
36286 return BB;
36287 }
36288 case X86::PTCMMIMFP16PS:
36289 case X86::PTCMMRLFP16PS: {
36290 const MIMetadata MIMD(MI);
36291 unsigned Opc;
36292 switch (MI.getOpcode()) {
36293 default: llvm_unreachable("Unexpected instruction!");
36294 case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
36295 case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
36296 }
36297 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36298 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36299 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36300 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36301 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36302 MI.eraseFromParent(); // The pseudo is gone now.
36303 return BB;
36304 }
36305 }
36306 }
36308 //===----------------------------------------------------------------------===//
36309 // X86 Optimization Hooks
36310 //===----------------------------------------------------------------------===//
36312 bool
36313 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
36314 const APInt &DemandedBits,
36315 const APInt &DemandedElts,
36316 TargetLoweringOpt &TLO) const {
36317 EVT VT = Op.getValueType();
36318 unsigned Opcode = Op.getOpcode();
36319 unsigned EltSize = VT.getScalarSizeInBits();
36321 if (VT.isVector()) {
36322 // If the constant's active bits are all sign bits, then we should sign
36323 // extend it across the entire constant to allow it to act as a boolean
36324 // constant vector.
36325 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
36326 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
36327 return false;
36328 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
36329 if (!DemandedElts[i] || V.getOperand(i).isUndef())
36330 continue;
36331 const APInt &Val = V.getConstantOperandAPInt(i);
36332 if (Val.getBitWidth() > Val.getNumSignBits() &&
36333 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
36334 return true;
36335 }
36336 return false;
36337 };
36338 // For vectors - if we have a constant, then try to sign extend.
36339 // TODO: Handle AND cases.
36340 unsigned ActiveBits = DemandedBits.getActiveBits();
36341 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
36342 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
36343 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
36344 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
36345 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
36346 VT.getVectorNumElements());
36347 SDValue NewC =
36348 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
36349 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
36350 SDValue NewOp =
36351 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
36352 return TLO.CombineTo(Op, NewOp);
36353 }
36354 return false;
36355 }
36357 // Only optimize Ands to prevent shrinking a constant that could be
36358 // matched by movzx.
36359 if (Opcode != ISD::AND)
36360 return false;
36362 // Make sure the RHS really is a constant.
36363 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
36364 if (!C)
36365 return false;
36367 const APInt &Mask = C->getAPIntValue();
36369 // Clear all non-demanded bits initially.
36370 APInt ShrunkMask = Mask & DemandedBits;
36372 // Find the width of the shrunk mask.
36373 unsigned Width = ShrunkMask.getActiveBits();
36375 // If the mask is all 0s there's nothing to do here.
36376 if (Width == 0)
36377 return false;
36379 // Find the next power of 2 width, rounding up to a byte.
36380 Width = llvm::bit_ceil(std::max(Width, 8U));
36381 // Truncate the width to size to handle illegal types.
36382 Width = std::min(Width, EltSize);
36384 // Calculate a possible zero extend mask for this constant.
36385 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
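// Worked example (illustrative, not from the source): for 'and X, 0x1FF'
// where only the low 8 bits are demanded, ShrunkMask is 0xFF, Width rounds up
// to 8, and ZeroExtendMask becomes 0xFF, which the backend can match with a
// MOVZX.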
36387 // If we aren't changing the mask, just return true to keep it and prevent
36388 // the caller from optimizing.
36389 if (ZeroExtendMask == Mask)
36390 return true;
36392 // Make sure the new mask can be represented by a combination of mask bits
36393 // and non-demanded bits.
36394 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
36395 return false;
36397 // Replace the constant with the zero extend mask.
36398 SDLoc DL(Op);
36399 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
36400 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
36401 return TLO.CombineTo(Op, NewOp);
36402 }
36404 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
36405 KnownBits &Known,
36406 const APInt &DemandedElts,
36407 const SelectionDAG &DAG,
36408 unsigned Depth) const {
36409 unsigned BitWidth = Known.getBitWidth();
36410 unsigned NumElts = DemandedElts.getBitWidth();
36411 unsigned Opc = Op.getOpcode();
36412 EVT VT = Op.getValueType();
36413 assert((Opc >= ISD::BUILTIN_OP_END ||
36414 Opc == ISD::INTRINSIC_WO_CHAIN ||
36415 Opc == ISD::INTRINSIC_W_CHAIN ||
36416 Opc == ISD::INTRINSIC_VOID) &&
36417 "Should use MaskedValueIsZero if you don't know whether Op"
36418 " is a target node!");
36420 Known.resetAll();
36421 switch (Opc) {
36422 default: break;
36423 case X86ISD::MUL_IMM: {
36424 KnownBits Known2;
36425 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36426 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36427 Known = KnownBits::mul(Known, Known2);
36428 break;
36430 case X86ISD::SETCC:
36431 Known.Zero.setBitsFrom(1);
36432 break;
36433 case X86ISD::MOVMSK: {
36434 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
36435 Known.Zero.setBitsFrom(NumLoBits);
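// e.g. a v4f32 MOVMSK writes a 4-bit mask into the low bits of the scalar
// result, so bits 4 and above are known to be zero.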
36436 break;
36438 case X86ISD::PEXTRB:
36439 case X86ISD::PEXTRW: {
36440 SDValue Src = Op.getOperand(0);
36441 EVT SrcVT = Src.getValueType();
36442 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
36443 Op.getConstantOperandVal(1));
36444 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
36445 Known = Known.anyextOrTrunc(BitWidth);
36446 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
36447 break;
36449 case X86ISD::VSRAI:
36450 case X86ISD::VSHLI:
36451 case X86ISD::VSRLI: {
36452 unsigned ShAmt = Op.getConstantOperandVal(1);
36453 if (ShAmt >= VT.getScalarSizeInBits()) {
36454 // Out of range logical bit shifts are guaranteed to be zero.
36455 // Out of range arithmetic bit shifts splat the sign bit.
36456 if (Opc != X86ISD::VSRAI) {
36457 Known.setAllZero();
36458 break;
36461 ShAmt = VT.getScalarSizeInBits() - 1;
36464 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36465 if (Opc == X86ISD::VSHLI) {
36466 Known.Zero <<= ShAmt;
36467 Known.One <<= ShAmt;
36468 // Low bits are known zero.
36469 Known.Zero.setLowBits(ShAmt);
36470 } else if (Opc == X86ISD::VSRLI) {
36471 Known.Zero.lshrInPlace(ShAmt);
36472 Known.One.lshrInPlace(ShAmt);
36473 // High bits are known zero.
36474 Known.Zero.setHighBits(ShAmt);
36475 } else {
36476 Known.Zero.ashrInPlace(ShAmt);
36477 Known.One.ashrInPlace(ShAmt);
36479 break;
36481 case X86ISD::PACKUS: {
36482 // PACKUS is just a truncation if the upper half is zero.
36483 APInt DemandedLHS, DemandedRHS;
36484 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
36486 Known.One = APInt::getAllOnes(BitWidth * 2);
36487 Known.Zero = APInt::getAllOnes(BitWidth * 2);
36489 KnownBits Known2;
36490 if (!!DemandedLHS) {
36491 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
36492 Known = Known.intersectWith(Known2);
36494 if (!!DemandedRHS) {
36495 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
36496 Known = Known.intersectWith(Known2);
36499 if (Known.countMinLeadingZeros() < BitWidth)
36500 Known.resetAll();
36501 Known = Known.trunc(BitWidth);
36502 break;
36504 case X86ISD::VBROADCAST: {
36505 SDValue Src = Op.getOperand(0);
36506 if (!Src.getSimpleValueType().isVector()) {
36507 Known = DAG.computeKnownBits(Src, Depth + 1);
36508 return;
36510 break;
36512 case X86ISD::AND: {
36513 if (Op.getResNo() == 0) {
36514 KnownBits Known2;
36515 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36516 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36517 Known &= Known2;
36519 break;
36521 case X86ISD::ANDNP: {
36522 KnownBits Known2;
36523 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36524 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36526 // ANDNP = (~X & Y);
36527 Known.One &= Known2.Zero;
36528 Known.Zero |= Known2.One;
36529 break;
36531 case X86ISD::FOR: {
36532 KnownBits Known2;
36533 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36534 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36536 Known |= Known2;
36537 break;
36539 case X86ISD::PSADBW: {
36540 assert(VT.getScalarType() == MVT::i64 &&
36541 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
36542 "Unexpected PSADBW types");
36544 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
36545 Known.Zero.setBitsFrom(16);
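// (Each i64 lane sums eight absolute byte differences, so the maximum value
// is 8 * 255 = 2040, which comfortably fits in the low 16 bits.)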
36546 break;
36548 case X86ISD::PCMPGT:
36549 case X86ISD::PCMPEQ: {
36550 KnownBits KnownLhs =
36551 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36552 KnownBits KnownRhs =
36553 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36554 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
36555 ? KnownBits::eq(KnownLhs, KnownRhs)
36556 : KnownBits::sgt(KnownLhs, KnownRhs);
36557 if (Res) {
36558 if (*Res)
36559 Known.setAllOnes();
36560 else
36561 Known.setAllZero();
36563 break;
36565 case X86ISD::PMULUDQ: {
36566 KnownBits Known2;
36567 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36568 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36570 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
36571 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
36572 Known = KnownBits::mul(Known, Known2);
36573 break;
36575 case X86ISD::CMOV: {
36576 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
36577 // If we don't know any bits, early out.
36578 if (Known.isUnknown())
36579 break;
36580 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
36582 // Only known if known in both the LHS and RHS.
36583 Known = Known.intersectWith(Known2);
36584 break;
36586 case X86ISD::BEXTR:
36587 case X86ISD::BEXTRI: {
36588 SDValue Op0 = Op.getOperand(0);
36589 SDValue Op1 = Op.getOperand(1);
36591 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
36592 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
36593 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
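// (BEXTR's control operand packs the start bit in bits 7:0 and the length in
// bits 15:8; e.g. a control value of 0x0410 extracts 4 bits starting at bit
// 16.)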
36595 // If the length is 0, the result is 0.
36596 if (Length == 0) {
36597 Known.setAllZero();
36598 break;
36601 if ((Shift + Length) <= BitWidth) {
36602 Known = DAG.computeKnownBits(Op0, Depth + 1);
36603 Known = Known.extractBits(Length, Shift);
36604 Known = Known.zextOrTrunc(BitWidth);
36607 break;
36609 case X86ISD::PDEP: {
36610 KnownBits Known2;
36611 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36612 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36613 // Zeros are retained from the mask operand, but ones are not.
36614 Known.One.clearAllBits();
36615 // The result will have at least as many trailing zeros as the non-mask
36616 // operand since bits can only map to the same or higher bit position.
36617 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
36618 break;
36620 case X86ISD::PEXT: {
36621 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36622 // The result has as many leading zeros as the number of zeroes in the mask.
36623 unsigned Count = Known.Zero.popcount();
36624 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
36625 Known.One.clearAllBits();
36626 break;
36628 case X86ISD::VTRUNC:
36629 case X86ISD::VTRUNCS:
36630 case X86ISD::VTRUNCUS:
36631 case X86ISD::CVTSI2P:
36632 case X86ISD::CVTUI2P:
36633 case X86ISD::CVTP2SI:
36634 case X86ISD::CVTP2UI:
36635 case X86ISD::MCVTP2SI:
36636 case X86ISD::MCVTP2UI:
36637 case X86ISD::CVTTP2SI:
36638 case X86ISD::CVTTP2UI:
36639 case X86ISD::MCVTTP2SI:
36640 case X86ISD::MCVTTP2UI:
36641 case X86ISD::MCVTSI2P:
36642 case X86ISD::MCVTUI2P:
36643 case X86ISD::VFPROUND:
36644 case X86ISD::VMFPROUND:
36645 case X86ISD::CVTPS2PH:
36646 case X86ISD::MCVTPS2PH: {
36647 // Truncations/Conversions - upper elements are known zero.
36648 EVT SrcVT = Op.getOperand(0).getValueType();
36649 if (SrcVT.isVector()) {
36650 unsigned NumSrcElts = SrcVT.getVectorNumElements();
36651 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
36652 Known.setAllZero();
36654 break;
36656 case X86ISD::STRICT_CVTTP2SI:
36657 case X86ISD::STRICT_CVTTP2UI:
36658 case X86ISD::STRICT_CVTSI2P:
36659 case X86ISD::STRICT_CVTUI2P:
36660 case X86ISD::STRICT_VFPROUND:
36661 case X86ISD::STRICT_CVTPS2PH: {
36662 // Strict Conversions - upper elements are known zero.
36663 EVT SrcVT = Op.getOperand(1).getValueType();
36664 if (SrcVT.isVector()) {
36665 unsigned NumSrcElts = SrcVT.getVectorNumElements();
36666 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
36667 Known.setAllZero();
36669 break;
36671 case X86ISD::MOVQ2DQ: {
36672 // Move from MMX to XMM. Upper half of XMM should be 0.
36673 if (DemandedElts.countr_zero() >= (NumElts / 2))
36674 Known.setAllZero();
36675 break;
36677 case X86ISD::VBROADCAST_LOAD: {
36678 APInt UndefElts;
36679 SmallVector<APInt, 16> EltBits;
36680 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
36681 /*AllowWholeUndefs*/ false,
36682 /*AllowPartialUndefs*/ false)) {
36683 Known.Zero.setAllBits();
36684 Known.One.setAllBits();
36685 for (unsigned I = 0; I != NumElts; ++I) {
36686 if (!DemandedElts[I])
36687 continue;
36688 if (UndefElts[I]) {
36689 Known.resetAll();
36690 break;
36692 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
36693 Known = Known.intersectWith(Known2);
36695 return;
36697 break;
36698 }
36699 }
36701 // Handle target shuffles.
36702 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
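// Illustrative example: for a PSHUFD with mask {2, 2, 3, 3} and all result
// elements demanded, only source elements 2 and 3 are demanded, and the known
// bits of the result are the intersection of the known bits of those two
// elements.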
36703 if (isTargetShuffle(Opc)) {
36704 SmallVector<int, 64> Mask;
36705 SmallVector<SDValue, 2> Ops;
36706 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
36707 unsigned NumOps = Ops.size();
36708 unsigned NumElts = VT.getVectorNumElements();
36709 if (Mask.size() == NumElts) {
36710 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
36711 Known.Zero.setAllBits(); Known.One.setAllBits();
36712 for (unsigned i = 0; i != NumElts; ++i) {
36713 if (!DemandedElts[i])
36714 continue;
36715 int M = Mask[i];
36716 if (M == SM_SentinelUndef) {
36717 // For UNDEF elements, we don't know anything about the common state
36718 // of the shuffle result.
36719 Known.resetAll();
36720 break;
36722 if (M == SM_SentinelZero) {
36723 Known.One.clearAllBits();
36724 continue;
36726 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
36727 "Shuffle index out of range");
36729 unsigned OpIdx = (unsigned)M / NumElts;
36730 unsigned EltIdx = (unsigned)M % NumElts;
36731 if (Ops[OpIdx].getValueType() != VT) {
36732 // TODO - handle target shuffle ops with different value types.
36733 Known.resetAll();
36734 break;
36736 DemandedOps[OpIdx].setBit(EltIdx);
36737 }
36738 // Known bits are the values that are shared by every demanded element.
36739 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
36740 if (!DemandedOps[i])
36741 continue;
36742 KnownBits Known2 =
36743 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
36744 Known = Known.intersectWith(Known2);
36745 }
36746 }
36747 }
36748 }
36749 }
36751 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
36752 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
36753 unsigned Depth) const {
36754 EVT VT = Op.getValueType();
36755 unsigned VTBits = VT.getScalarSizeInBits();
36756 unsigned Opcode = Op.getOpcode();
36757 switch (Opcode) {
36758 case X86ISD::SETCC_CARRY:
36759 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
36760 return VTBits;
36762 case X86ISD::VTRUNC: {
36763 SDValue Src = Op.getOperand(0);
36764 MVT SrcVT = Src.getSimpleValueType();
36765 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
36766 assert(VTBits < NumSrcBits && "Illegal truncation input type");
36767 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
36768 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
36769 if (Tmp > (NumSrcBits - VTBits))
36770 return Tmp - (NumSrcBits - VTBits);
36771 return 1;
36774 case X86ISD::PACKSS: {
36775 // PACKSS is just a truncation if the sign bits extend to the packed size.
36776 APInt DemandedLHS, DemandedRHS;
36777 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
36778 DemandedRHS);
36780 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
36781 // patterns often used to compact vXi64 allsignbit patterns.
36782 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
36783 SDValue BC = peekThroughBitcasts(V);
36784 if (BC.getOpcode() == X86ISD::PACKSS &&
36785 BC.getScalarValueSizeInBits() == 16 &&
36786 V.getScalarValueSizeInBits() == 32) {
36787 SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
36788 SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
36789 if (BC0.getScalarValueSizeInBits() == 64 &&
36790 BC1.getScalarValueSizeInBits() == 64 &&
36791 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
36792 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
36793 return 32;
36795 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
36798 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
36799 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
36800 if (!!DemandedLHS)
36801 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
36802 if (!!DemandedRHS)
36803 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
36804 unsigned Tmp = std::min(Tmp0, Tmp1);
36805 if (Tmp > (SrcBits - VTBits))
36806 return Tmp - (SrcBits - VTBits);
36807 return 1;
36810 case X86ISD::VBROADCAST: {
36811 SDValue Src = Op.getOperand(0);
36812 if (!Src.getSimpleValueType().isVector())
36813 return DAG.ComputeNumSignBits(Src, Depth + 1);
36814 break;
36817 case X86ISD::VSHLI: {
36818 SDValue Src = Op.getOperand(0);
36819 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
36820 if (ShiftVal.uge(VTBits))
36821 return VTBits; // Shifted all bits out --> zero.
36822 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
36823 if (ShiftVal.uge(Tmp))
36824 return 1; // Shifted all sign bits out --> unknown.
36825 return Tmp - ShiftVal.getZExtValue();
36828 case X86ISD::VSRAI: {
36829 SDValue Src = Op.getOperand(0);
36830 APInt ShiftVal = Op.getConstantOperandAPInt(1);
36831 if (ShiftVal.uge(VTBits - 1))
36832 return VTBits; // Sign splat.
36833 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
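// An arithmetic right shift by ShiftVal adds ShiftVal copies of the sign bit;
// e.g. (illustrative) an i32 element with 3 known sign bits shifted right by
// 5 has min(3 + 5, 32) = 8 known sign bits.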
36834 ShiftVal += Tmp;
36835 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
36838 case X86ISD::FSETCC:
36839 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
36840 if (VT == MVT::f32 || VT == MVT::f64 ||
36841 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
36842 return VTBits;
36843 break;
36845 case X86ISD::PCMPGT:
36846 case X86ISD::PCMPEQ:
36847 case X86ISD::CMPP:
36848 case X86ISD::VPCOM:
36849 case X86ISD::VPCOMU:
36850 // Vector compares return zero/all-bits result values.
36851 return VTBits;
36853 case X86ISD::ANDNP: {
36854 unsigned Tmp0 =
36855 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
36856 if (Tmp0 == 1) return 1; // Early out.
36857 unsigned Tmp1 =
36858 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
36859 return std::min(Tmp0, Tmp1);
36862 case X86ISD::CMOV: {
36863 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
36864 if (Tmp0 == 1) return 1; // Early out.
36865 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
36866 return std::min(Tmp0, Tmp1);
36867 }
36868 }
36870 // Handle target shuffles.
36871 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
36872 if (isTargetShuffle(Opcode)) {
36873 SmallVector<int, 64> Mask;
36874 SmallVector<SDValue, 2> Ops;
36875 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
36876 unsigned NumOps = Ops.size();
36877 unsigned NumElts = VT.getVectorNumElements();
36878 if (Mask.size() == NumElts) {
36879 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
36880 for (unsigned i = 0; i != NumElts; ++i) {
36881 if (!DemandedElts[i])
36882 continue;
36883 int M = Mask[i];
36884 if (M == SM_SentinelUndef) {
36885 // For UNDEF elements, we don't know anything about the common state
36886 // of the shuffle result.
36887 return 1;
36888 } else if (M == SM_SentinelZero) {
36889 // Zero = all sign bits.
36890 continue;
36892 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
36893 "Shuffle index out of range");
36895 unsigned OpIdx = (unsigned)M / NumElts;
36896 unsigned EltIdx = (unsigned)M % NumElts;
36897 if (Ops[OpIdx].getValueType() != VT) {
36898 // TODO - handle target shuffle ops with different value types.
36899 return 1;
36901 DemandedOps[OpIdx].setBit(EltIdx);
36903 unsigned Tmp0 = VTBits;
36904 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
36905 if (!DemandedOps[i])
36906 continue;
36907 unsigned Tmp1 =
36908 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
36909 Tmp0 = std::min(Tmp0, Tmp1);
36911 return Tmp0;
36912 }
36913 }
36914 }
36916 // Fallback case.
36917 return 1;
36918 }
36920 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
36921 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
36922 return N->getOperand(0);
36923 return N;
36924 }
36926 // Helper to look for a normal load that can be narrowed into a vzload with the
36927 // specified VT and memory VT. Returns SDValue() on failure.
36928 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
36929 SelectionDAG &DAG) {
36930 // Can't if the load is volatile or atomic.
36931 if (!LN->isSimple())
36932 return SDValue();
36934 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
36935 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
36936 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
36937 LN->getPointerInfo(), LN->getOriginalAlign(),
36938 LN->getMemOperand()->getFlags());
36939 }
36941 // Attempt to match a combined shuffle mask against supported unary shuffle
36942 // instructions.
36943 // TODO: Investigate sharing more of this with shuffle lowering.
36944 static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
36945 bool AllowFloatDomain, bool AllowIntDomain,
36946 SDValue V1, const SelectionDAG &DAG,
36947 const X86Subtarget &Subtarget, unsigned &Shuffle,
36948 MVT &SrcVT, MVT &DstVT) {
36949 unsigned NumMaskElts = Mask.size();
36950 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
36952 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
36953 if (Mask[0] == 0 &&
36954 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
36955 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
36956 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36957 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
36958 Shuffle = X86ISD::VZEXT_MOVL;
36959 if (MaskEltSize == 16)
36960 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
36961 else
36962 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
36963 return true;
36967 // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
36968 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
36969 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
36970 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
36971 unsigned MaxScale = 64 / MaskEltSize;
36972 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
36973 DAG.ComputeNumSignBits(V1) == MaskEltSize;
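// Illustrative example: a v8i16 mask {0, Z, 1, Z, 2, Z, 3, Z} (Z = zero) with
// Scale == 2 places source element i in lane i * 2 and zeroes the odd lanes,
// so it matches a ZERO_EXTEND_VECTOR_INREG of the low four i16 elements to
// v4i32.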
36974 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
36975 bool MatchAny = true;
36976 bool MatchZero = true;
36977 bool MatchSign = UseSign;
36978 unsigned NumDstElts = NumMaskElts / Scale;
36979 for (unsigned i = 0;
36980 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
36981 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
36982 MatchAny = MatchSign = MatchZero = false;
36983 break;
36985 unsigned Pos = (i * Scale) + 1;
36986 unsigned Len = Scale - 1;
36987 MatchAny &= isUndefInRange(Mask, Pos, Len);
36988 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
36989 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
36991 if (MatchAny || MatchSign || MatchZero) {
36992 assert((MatchSign || MatchZero) &&
36993 "Failed to match sext/zext but matched aext?");
36994 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
36995 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
36996 : MVT::getIntegerVT(MaskEltSize);
36997 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
36999 Shuffle = unsigned(
37000 MatchAny ? ISD::ANY_EXTEND
37001 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
37002 if (SrcVT.getVectorNumElements() != NumDstElts)
37003 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
37005 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
37006 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
37007 return true;
37012 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit elements (MOVSS).
37013 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
37014 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
37015 isUndefOrEqual(Mask[0], 0) &&
37016 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
37017 Shuffle = X86ISD::VZEXT_MOVL;
37018 if (MaskEltSize == 16)
37019 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
37020 else
37021 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
37022 return true;
37025 // Check if we have SSE3, which will let us use MOVDDUP etc. These
37026 // instructions are no slower than UNPCKLPD but have the option to
37027 // fold the input operand, even from an unaligned memory load.
37028 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
37029 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
37030 Shuffle = X86ISD::MOVDDUP;
37031 SrcVT = DstVT = MVT::v2f64;
37032 return true;
37034 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37035 Shuffle = X86ISD::MOVSLDUP;
37036 SrcVT = DstVT = MVT::v4f32;
37037 return true;
37039 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
37040 Shuffle = X86ISD::MOVSHDUP;
37041 SrcVT = DstVT = MVT::v4f32;
37042 return true;
37046 if (MaskVT.is256BitVector() && AllowFloatDomain) {
37047 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
37048 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37049 Shuffle = X86ISD::MOVDDUP;
37050 SrcVT = DstVT = MVT::v4f64;
37051 return true;
37053 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37054 V1)) {
37055 Shuffle = X86ISD::MOVSLDUP;
37056 SrcVT = DstVT = MVT::v8f32;
37057 return true;
37059 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
37060 V1)) {
37061 Shuffle = X86ISD::MOVSHDUP;
37062 SrcVT = DstVT = MVT::v8f32;
37063 return true;
37067 if (MaskVT.is512BitVector() && AllowFloatDomain) {
37068 assert(Subtarget.hasAVX512() &&
37069 "AVX512 required for 512-bit vector shuffles");
37070 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37071 V1)) {
37072 Shuffle = X86ISD::MOVDDUP;
37073 SrcVT = DstVT = MVT::v8f64;
37074 return true;
37076 if (isTargetShuffleEquivalent(
37077 MaskVT, Mask,
37078 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
37079 Shuffle = X86ISD::MOVSLDUP;
37080 SrcVT = DstVT = MVT::v16f32;
37081 return true;
37083 if (isTargetShuffleEquivalent(
37084 MaskVT, Mask,
37085 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
37086 Shuffle = X86ISD::MOVSHDUP;
37087 SrcVT = DstVT = MVT::v16f32;
37088 return true;
37092 return false;
37095 // Attempt to match a combined shuffle mask against supported unary immediate
37096 // permute instructions.
37097 // TODO: Investigate sharing more of this with shuffle lowering.
37098 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
37099 const APInt &Zeroable,
37100 bool AllowFloatDomain, bool AllowIntDomain,
37101 const SelectionDAG &DAG,
37102 const X86Subtarget &Subtarget,
37103 unsigned &Shuffle, MVT &ShuffleVT,
37104 unsigned &PermuteImm) {
37105 unsigned NumMaskElts = Mask.size();
37106 unsigned InputSizeInBits = MaskVT.getSizeInBits();
37107 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
37108 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
37109 bool ContainsZeros = isAnyZero(Mask);
37111 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
37112 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
37113 // Check for lane crossing permutes.
37114 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
37115 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
37116 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
37117 Shuffle = X86ISD::VPERMI;
37118 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
37119 PermuteImm = getV4X86ShuffleImm(Mask);
37120 return true;
37122 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
37123 SmallVector<int, 4> RepeatedMask;
37124 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
37125 Shuffle = X86ISD::VPERMI;
37126 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
37127 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
37128 return true;
37131 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
37132 // VPERMILPD can permute with a non-repeating shuffle.
37133 Shuffle = X86ISD::VPERMILPI;
37134 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
37135 PermuteImm = 0;
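// Illustrative example: a v4f64 mask {1, 0, 3, 2} swaps the elements within
// each 128-bit lane, giving PermuteImm = 0b0101.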
37136 for (int i = 0, e = Mask.size(); i != e; ++i) {
37137 int M = Mask[i];
37138 if (M == SM_SentinelUndef)
37139 continue;
37140 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
37141 PermuteImm |= (M & 1) << i;
37143 return true;
37147 // We are checking for a shuffle match or a shift match. Loop twice so we
37148 // can order which we try to match first, depending on target preference.
37149 for (unsigned Order = 0; Order < 2; ++Order) {
37150 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
37151 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
37152 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
37153 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
37154 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
37155 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
37156 SmallVector<int, 4> RepeatedMask;
37157 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
37158 // Narrow the repeated mask to create 32-bit element permutes.
37159 SmallVector<int, 4> WordMask = RepeatedMask;
37160 if (MaskScalarSizeInBits == 64)
37161 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
37163 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
37164 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
37165 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
37166 PermuteImm = getV4X86ShuffleImm(WordMask);
37167 return true;
37171 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
37172 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
37173 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37174 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37175 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
37176 SmallVector<int, 4> RepeatedMask;
37177 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
37178 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
37179 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
37181 // PSHUFLW: permute lower 4 elements only.
37182 if (isUndefOrInRange(LoMask, 0, 4) &&
37183 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
37184 Shuffle = X86ISD::PSHUFLW;
37185 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
37186 PermuteImm = getV4X86ShuffleImm(LoMask);
37187 return true;
37190 // PSHUFHW: permute upper 4 elements only.
37191 if (isUndefOrInRange(HiMask, 4, 8) &&
37192 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
37193 // Offset the HiMask so that we can create the shuffle immediate.
37194 int OffsetHiMask[4];
37195 for (int i = 0; i != 4; ++i)
37196 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
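// e.g. HiMask {5, 4, 7, 6} becomes OffsetHiMask {1, 0, 3, 2}, which fits the
// 2-bits-per-lane shuffle immediate.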
37198 Shuffle = X86ISD::PSHUFHW;
37199 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
37200 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
37201 return true;
37205 } else {
37206 // Attempt to match against bit rotates.
37207 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
37208 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
37209 Subtarget.hasAVX512())) {
37210 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
37211 Subtarget, Mask);
37212 if (0 < RotateAmt) {
37213 Shuffle = X86ISD::VROTLI;
37214 PermuteImm = (unsigned)RotateAmt;
37215 return true;
37219 // Attempt to match against byte/bit shifts.
37220 if (AllowIntDomain &&
37221 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37222 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37223 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37224 int ShiftAmt =
37225 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
37226 Zeroable, Subtarget);
37227 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
37228 32 <= ShuffleVT.getScalarSizeInBits())) {
37229 // Byte shifts can be slower so only match them on second attempt.
37230 if (Order == 0 &&
37231 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
37232 continue;
37234 PermuteImm = (unsigned)ShiftAmt;
37235 return true;
37241 return false;
37244 // Attempt to match a combined unary shuffle mask against supported binary
37245 // shuffle instructions.
37246 // TODO: Investigate sharing more of this with shuffle lowering.
37247 static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
37248 bool AllowFloatDomain, bool AllowIntDomain,
37249 SDValue &V1, SDValue &V2, const SDLoc &DL,
37250 SelectionDAG &DAG, const X86Subtarget &Subtarget,
37251 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
37252 bool IsUnary) {
37253 unsigned NumMaskElts = Mask.size();
37254 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37255 unsigned SizeInBits = MaskVT.getSizeInBits();
37257 if (MaskVT.is128BitVector()) {
37258 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
37259 AllowFloatDomain) {
37260 V2 = V1;
37261 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
37262 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
37263 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37264 return true;
37266 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
37267 AllowFloatDomain) {
37268 V2 = V1;
37269 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
37270 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37271 return true;
37273 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
37274 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
37275 std::swap(V1, V2);
37276 Shuffle = X86ISD::MOVSD;
37277 SrcVT = DstVT = MVT::v2f64;
37278 return true;
37280 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
37281 (AllowFloatDomain || !Subtarget.hasSSE41())) {
37282 Shuffle = X86ISD::MOVSS;
37283 SrcVT = DstVT = MVT::v4f32;
37284 return true;
37286 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
37287 DAG) &&
37288 Subtarget.hasFP16()) {
37289 Shuffle = X86ISD::MOVSH;
37290 SrcVT = DstVT = MVT::v8f16;
37291 return true;
37295 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
37296 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
37297 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
37298 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
37299 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
37300 Subtarget)) {
37301 DstVT = MaskVT;
37302 return true;
37305 // TODO: Can we handle this inside matchShuffleWithPACK?
37306 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
37307 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
37308 V1.getScalarValueSizeInBits() == 64 &&
37309 V2.getScalarValueSizeInBits() == 64) {
37310 // Use (SSE41) PACKUSDW if the leading zero bits go to the lowest 16 bits.
37311 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
37312 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
37313 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
37314 SrcVT = MVT::v4i32;
37315 DstVT = MVT::v8i16;
37316 Shuffle = X86ISD::PACKUS;
37317 return true;
37319 // Use PACKUSWB if the leading zero bits go to the lowest 8 bits.
37320 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
37321 SrcVT = MVT::v8i16;
37322 DstVT = MVT::v16i8;
37323 Shuffle = X86ISD::PACKUS;
37324 return true;
37326 // Use PACKSSDW if the sign bits extend to the lowest 16 bits.
37327 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
37328 SrcVT = MVT::v4i32;
37329 DstVT = MVT::v8i16;
37330 Shuffle = X86ISD::PACKSS;
37331 return true;
37335 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
37336 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
37337 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37338 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
37339 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37340 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
37341 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
37342 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
37343 Subtarget)) {
37344 SrcVT = DstVT = MaskVT;
37345 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
37346 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
37347 return true;
37351 // Attempt to match against an OR if we're performing a blend shuffle and the
37352 // non-blended source element is zero in each case.
37353 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
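// e.g. for Mask = {0, 5, 2, 7} element i comes from V1 for even i and from V2
// for odd i; if the elements not selected from each source are known to be
// zero, the whole blend is equivalent to (V1 | V2).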
37354 if (SizeInBits == V1.getValueSizeInBits() &&
37355 SizeInBits == V2.getValueSizeInBits() &&
37356 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
37357 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
37358 bool IsBlend = true;
37359 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
37360 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
37361 unsigned Scale1 = NumV1Elts / NumMaskElts;
37362 unsigned Scale2 = NumV2Elts / NumMaskElts;
37363 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
37364 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
37365 for (unsigned i = 0; i != NumMaskElts; ++i) {
37366 int M = Mask[i];
37367 if (M == SM_SentinelUndef)
37368 continue;
37369 if (M == SM_SentinelZero) {
37370 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37371 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37372 continue;
37374 if (M == (int)i) {
37375 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37376 continue;
37378 if (M == (int)(i + NumMaskElts)) {
37379 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37380 continue;
37382 IsBlend = false;
37383 break;
37385 if (IsBlend) {
37386 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
37387 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
37388 Shuffle = ISD::OR;
37389 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37390 return true;
37392 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
37393 // FIXME: handle mismatched sizes?
37394 // TODO: investigate if `ISD::OR` handling in
37395 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
37396 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
37397 unsigned NumElts = V.getValueType().getVectorNumElements();
37398 KnownBits Known(NumElts);
37399 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
37400 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
37401 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
37402 if (PeepholeKnown.isZero())
37403 Known.Zero.setBit(EltIdx);
37404 if (PeepholeKnown.isAllOnes())
37405 Known.One.setBit(EltIdx);
37407 return Known;
37410 KnownBits V1Known = computeKnownBitsElementWise(V1);
37411 KnownBits V2Known = computeKnownBitsElementWise(V2);
37413 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
37414 int M = Mask[i];
37415 if (M == SM_SentinelUndef)
37416 continue;
37417 if (M == SM_SentinelZero) {
37418 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
37419 continue;
37421 if (M == (int)i) {
37422 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
37423 continue;
37425 if (M == (int)(i + NumMaskElts)) {
37426 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
37427 continue;
37429 llvm_unreachable("will not get here.");
37431 if (IsBlend) {
37432 Shuffle = ISD::OR;
37433 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37434 return true;
37440 return false;
37443 static bool matchBinaryPermuteShuffle(
37444 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
37445 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
37446 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
37447 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
37448 unsigned NumMaskElts = Mask.size();
37449 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37451 // Attempt to match against VALIGND/VALIGNQ rotate.
37452 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
37453 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
37454 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
37455 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37456 if (!isAnyZero(Mask)) {
37457 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
37458 if (0 < Rotation) {
37459 Shuffle = X86ISD::VALIGN;
37460 if (EltSizeInBits == 64)
37461 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
37462 else
37463 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
37464 PermuteImm = Rotation;
37465 return true;
37470 // Attempt to match against PALIGNR byte rotate.
37471 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
37472 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37473 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
37474 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
37475 if (0 < ByteRotation) {
37476 Shuffle = X86ISD::PALIGNR;
37477 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
37478 PermuteImm = ByteRotation;
37479 return true;
37483 // Attempt to combine to X86ISD::BLENDI.
37484 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
37485 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
37486 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
37487 uint64_t BlendMask = 0;
37488 bool ForceV1Zero = false, ForceV2Zero = false;
37489 SmallVector<int, 8> TargetMask(Mask);
37490 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
37491 ForceV2Zero, BlendMask)) {
37492 if (MaskVT == MVT::v16i16) {
37493 // We can only use v16i16 PBLENDW if the lanes are repeated.
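// The 8-bit PBLENDW immediate encodes one 8-element pattern that is applied
// to both 128-bit lanes, e.g. taking the odd elements from V2 gives
// RepeatedMask = {0, 9, 2, 11, 4, 13, 6, 15} and PermuteImm = 0xAA.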
37494 SmallVector<int, 8> RepeatedMask;
37495 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
37496 RepeatedMask)) {
37497 assert(RepeatedMask.size() == 8 &&
37498 "Repeated mask size doesn't match!");
37499 PermuteImm = 0;
37500 for (int i = 0; i < 8; ++i)
37501 if (RepeatedMask[i] >= 8)
37502 PermuteImm |= 1 << i;
37503 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37504 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37505 Shuffle = X86ISD::BLENDI;
37506 ShuffleVT = MaskVT;
37507 return true;
37509 } else {
37510 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37511 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37512 PermuteImm = (unsigned)BlendMask;
37513 Shuffle = X86ISD::BLENDI;
37514 ShuffleVT = MaskVT;
37515 return true;
37520 // Attempt to combine to INSERTPS, but only if it has elements that need to
37521 // be set to zero.
37522 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
37523 MaskVT.is128BitVector() && isAnyZero(Mask) &&
37524 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
37525 Shuffle = X86ISD::INSERTPS;
37526 ShuffleVT = MVT::v4f32;
37527 return true;
37530 // Attempt to combine to SHUFPD.
37531 if (AllowFloatDomain && EltSizeInBits == 64 &&
37532 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37533 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
37534 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37535 bool ForceV1Zero = false, ForceV2Zero = false;
37536 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
37537 PermuteImm, Mask, Zeroable)) {
37538 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
37539 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
37540 Shuffle = X86ISD::SHUFP;
37541 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
37542 return true;
37546 // Attempt to combine to SHUFPS.
37547 if (AllowFloatDomain && EltSizeInBits == 32 &&
37548 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
37549 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
37550 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37551 SmallVector<int, 4> RepeatedMask;
37552 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
37553 // Match each half of the repeated mask to determine whether it's just
37554 // referencing one of the vectors, is zeroable, or is entirely undef.
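// e.g. a repeated mask of {1, 3, 5, 7} takes elements 1,3 from V1 for the low
// half and elements 1,3 from V2 for the high half, giving SHUFPS with
// immediate 0xDD.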
37555 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
37556 int M0 = RepeatedMask[Offset];
37557 int M1 = RepeatedMask[Offset + 1];
37559 if (isUndefInRange(RepeatedMask, Offset, 2)) {
37560 return DAG.getUNDEF(MaskVT);
37561 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
37562 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
37563 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
37564 return getZeroVector(MaskVT, Subtarget, DAG, DL);
37565 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
37566 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
37567 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
37568 return V1;
37569 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
37570 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
37571 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
37572 return V2;
37575 return SDValue();
37578 int ShufMask[4] = {-1, -1, -1, -1};
37579 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
37580 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
37582 if (Lo && Hi) {
37583 V1 = Lo;
37584 V2 = Hi;
37585 Shuffle = X86ISD::SHUFP;
37586 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
37587 PermuteImm = getV4X86ShuffleImm(ShufMask);
37588 return true;
37593 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
37594 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
37595 MaskVT.is128BitVector() &&
37596 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
37597 Shuffle = X86ISD::INSERTPS;
37598 ShuffleVT = MVT::v4f32;
37599 return true;
37602 return false;
37605 static SDValue combineX86ShuffleChainWithExtract(
37606 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
37607 bool HasVariableMask, bool AllowVariableCrossLaneMask,
37608 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
37609 const X86Subtarget &Subtarget);
37611 /// Combine an arbitrary chain of shuffles into a single instruction if
37612 /// possible.
37614 /// This is the leaf of the recursive combine below. When we have found some
37615 /// chain of single-use x86 shuffle instructions and accumulated the combined
37616 /// shuffle mask represented by them, this will try to pattern match that mask
37617 /// into either a single instruction if there is a special purpose instruction
37618 /// for this operation, or into a PSHUFB instruction, which is a fully general
37619 /// instruction but should only be used to replace chains over a certain depth.
37620 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
37621 ArrayRef<int> BaseMask, int Depth,
37622 bool HasVariableMask,
37623 bool AllowVariableCrossLaneMask,
37624 bool AllowVariablePerLaneMask,
37625 SelectionDAG &DAG,
37626 const X86Subtarget &Subtarget) {
37627 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
37628 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
37629 "Unexpected number of shuffle inputs!");
37631 SDLoc DL(Root);
37632 MVT RootVT = Root.getSimpleValueType();
37633 unsigned RootSizeInBits = RootVT.getSizeInBits();
37634 unsigned NumRootElts = RootVT.getVectorNumElements();
37636 // Canonicalize shuffle input op to the requested type.
37637 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
37638 if (VT.getSizeInBits() > Op.getValueSizeInBits())
37639 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
37640 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
37641 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
37642 return DAG.getBitcast(VT, Op);
37645 // Find the inputs that enter the chain. Note that multiple uses are OK
37646 // here; we're not going to remove the operands we find.
37647 bool UnaryShuffle = (Inputs.size() == 1);
37648 SDValue V1 = peekThroughBitcasts(Inputs[0]);
37649 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
37650 : peekThroughBitcasts(Inputs[1]));
37652 MVT VT1 = V1.getSimpleValueType();
37653 MVT VT2 = V2.getSimpleValueType();
37654 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
37655 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
37657 SDValue Res;
37659 unsigned NumBaseMaskElts = BaseMask.size();
37660 if (NumBaseMaskElts == 1) {
37661 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
37662 return CanonicalizeShuffleInput(RootVT, V1);
37665 bool OptForSize = DAG.shouldOptForSize();
37666 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
37667 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
37668 (RootVT.isFloatingPoint() && Depth >= 1) ||
37669 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
37671 // Don't combine if we are an AVX512/EVEX target and the mask element size
37672 // is different from the root element size - this would prevent writemasks
37673 // from being reused.
37674 bool IsMaskedShuffle = false;
37675 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
37676 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
37677 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
37678 IsMaskedShuffle = true;
37682 // If we are shuffling a splat (and not introducing zeros) then we can just
37683 // use it directly. This works for smaller elements as well, since they already
37684 // repeat across each mask element.
37685 if (UnaryShuffle && !isAnyZero(BaseMask) &&
37686 V1.getValueSizeInBits() >= RootSizeInBits &&
37687 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
37688 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
37689 return CanonicalizeShuffleInput(RootVT, V1);
37692 SmallVector<int, 64> Mask(BaseMask);
37694 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
37695 // etc. can be simplified.
37696 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
37697 SmallVector<int> ScaledMask, IdentityMask;
37698 unsigned NumElts = VT1.getVectorNumElements();
37699 if (Mask.size() <= NumElts &&
37700 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
37701 for (unsigned i = 0; i != NumElts; ++i)
37702 IdentityMask.push_back(i);
37703 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
37704 V2))
37705 return CanonicalizeShuffleInput(RootVT, V1);
37709 // Handle 128/256-bit lane shuffles of 512-bit vectors.
37710 if (RootVT.is512BitVector() &&
37711 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
37712 // If the upper subvectors are zeroable, then an extract+insert is cheaper
37713 // than using X86ISD::SHUF128. The insertion is free, even if it has
37714 // to zero the upper subvectors.
37715 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
37716 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37717 return SDValue(); // Nothing to do!
37718 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
37719 "Unexpected lane shuffle");
37720 Res = CanonicalizeShuffleInput(RootVT, V1);
37721 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
37722 bool UseZero = isAnyZero(Mask);
37723 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
37724 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
37727 // Narrow shuffle mask to v4x128.
37728 SmallVector<int, 4> ScaledMask;
37729 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
37730 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
37732 // Try to lower to vshuf64x2/vshuf32x4.
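// SHUF128 selects two 128-bit lanes from each source, so lanes 0-1 of the
// result must come from a single operand and lanes 2-3 from a single operand,
// e.g. a lane mask of {0, 1, 4, 5} becomes SHUF128(V1, V2) with PermMask
// {0, 1, 0, 1}.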
37733 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
37734 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
37735 SelectionDAG &DAG) {
37736 int PermMask[4] = {-1, -1, -1, -1};
37737 // Ensure elements came from the same Op.
37738 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
37739 for (int i = 0; i < 4; ++i) {
37740 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
37741 if (ScaledMask[i] < 0)
37742 continue;
37744 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
37745 unsigned OpIndex = i / 2;
37746 if (Ops[OpIndex].isUndef())
37747 Ops[OpIndex] = Op;
37748 else if (Ops[OpIndex] != Op)
37749 return SDValue();
37751 PermMask[i] = ScaledMask[i] % 4;
37754 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
37755 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
37756 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
37757 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
37760 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
37761 // doesn't work because our mask is for 128 bits and we don't have an MVT
37762 // to match that.
37763 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
37764 isUndefOrInRange(ScaledMask[1], 0, 2) &&
37765 isUndefOrInRange(ScaledMask[2], 2, 4) &&
37766 isUndefOrInRange(ScaledMask[3], 2, 4) &&
37767 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
37768 ScaledMask[0] == (ScaledMask[2] % 2)) &&
37769 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
37770 ScaledMask[1] == (ScaledMask[3] % 2));
37772 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
37773 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
37774 return SDValue(); // Nothing to do!
37775 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
37776 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
37777 return DAG.getBitcast(RootVT, V);
37781 // Handle 128-bit lane shuffles of 256-bit vectors.
37782 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
37783 // If the upper half is zeroable, then an extract+insert is cheaper
37784 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
37785 // zero the upper half.
37786 if (isUndefOrZero(Mask[1])) {
37787 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37788 return SDValue(); // Nothing to do!
37789 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
37790 Res = CanonicalizeShuffleInput(RootVT, V1);
37791 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
37792 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
37793 256);
37796 // If we're inserting the low subvector, an insert-subvector 'concat'
37797 // pattern is quicker than VPERM2X128.
37798 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
37799 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
37800 !Subtarget.hasAVX2()) {
37801 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
37802 return SDValue(); // Nothing to do!
37803 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
37804 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
37805 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
37806 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
37809 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
37810 return SDValue(); // Nothing to do!
37812 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
37813 // we need to use the zeroing feature.
37814 // Prefer blends for sequential shuffles unless we are optimizing for size.
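// In the VPERM2X128 immediate each nibble selects a 128-bit source lane and
// bit 3 of the nibble zeroes that half of the result, e.g. Mask = {1,
// SM_SentinelZero} encodes as 0x81 (low half = lane 1, high half zeroed).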
37815 if (UnaryShuffle &&
37816 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
37817 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
37818 unsigned PermMask = 0;
37819 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
37820 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
37821 return DAG.getNode(
37822 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
37823 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
37826 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
37827 return SDValue(); // Nothing to do!
37829 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
37830 if (!UnaryShuffle && !IsMaskedShuffle) {
37831 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
37832 "Unexpected shuffle sentinel value");
37833 // Prefer blends to X86ISD::VPERM2X128.
37834 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
37835 unsigned PermMask = 0;
37836 PermMask |= ((Mask[0] & 3) << 0);
37837 PermMask |= ((Mask[1] & 3) << 4);
37838 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
37839 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
37840 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
37841 CanonicalizeShuffleInput(RootVT, LHS),
37842 CanonicalizeShuffleInput(RootVT, RHS),
37843 DAG.getTargetConstant(PermMask, DL, MVT::i8));
37848 // For masks that have been widened to 128-bit elements or more,
37849 // narrow back down to 64-bit elements.
37850 if (BaseMaskEltSizeInBits > 64) {
37851 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
37852 int MaskScale = BaseMaskEltSizeInBits / 64;
37853 SmallVector<int, 64> ScaledMask;
37854 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
37855 Mask = std::move(ScaledMask);
37858 // For masked shuffles, we're trying to match the root width for better
37859 // writemask folding, so attempt to scale the mask.
37860 // TODO - variable shuffles might need this to be widened again.
37861 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
37862 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
37863 int MaskScale = NumRootElts / Mask.size();
37864 SmallVector<int, 64> ScaledMask;
37865 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
37866 Mask = std::move(ScaledMask);
37869 unsigned NumMaskElts = Mask.size();
37870 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
37872 // Determine the effective mask value type.
37873 FloatDomain &= (32 <= MaskEltSizeInBits);
37874 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
37875 : MVT::getIntegerVT(MaskEltSizeInBits);
37876 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
37878 // Only allow legal mask types.
37879 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
37880 return SDValue();
37882 // Attempt to match the mask against known shuffle patterns.
37883 MVT ShuffleSrcVT, ShuffleVT;
37884 unsigned Shuffle, PermuteImm;
37886 // Which shuffle domains are permitted?
37887 // Permit domain crossing at higher combine depths.
37888 // TODO: Should we indicate which domain is preferred if both are allowed?
37889 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
37890 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
37891 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
37893 // Determine zeroable mask elements.
37894 APInt KnownUndef, KnownZero;
37895 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
37896 APInt Zeroable = KnownUndef | KnownZero;
37898 if (UnaryShuffle) {
37899 // Attempt to match against broadcast-from-vector.
37900 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
37901 if ((Subtarget.hasAVX2() ||
37902 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
37903 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
37904 if (isUndefOrEqual(Mask, 0)) {
37905 if (V1.getValueType() == MaskVT &&
37906 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37907 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
37908 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
37909 return SDValue(); // Nothing to do!
37910 Res = V1.getOperand(0);
37911 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
37912 return DAG.getBitcast(RootVT, Res);
37914 if (Subtarget.hasAVX2()) {
37915 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
37916 return SDValue(); // Nothing to do!
37917 Res = CanonicalizeShuffleInput(MaskVT, V1);
37918 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
37919 return DAG.getBitcast(RootVT, Res);
37924 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
37925 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
37926 (!IsMaskedShuffle ||
37927 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37928 if (Depth == 0 && Root.getOpcode() == Shuffle)
37929 return SDValue(); // Nothing to do!
37930 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
37931 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
37932 return DAG.getBitcast(RootVT, Res);
37935 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
37936 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
37937 PermuteImm) &&
37938 (!IsMaskedShuffle ||
37939 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37940 if (Depth == 0 && Root.getOpcode() == Shuffle)
37941 return SDValue(); // Nothing to do!
37942 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
37943 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
37944 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37945 return DAG.getBitcast(RootVT, Res);
37949 // Attempt to combine to INSERTPS, but only if the inserted element has come
37950 // from a scalar.
37951 // TODO: Handle other insertions here as well?
37952 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
37953 Subtarget.hasSSE41() &&
37954 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
37955 if (MaskEltSizeInBits == 32) {
37956 SDValue SrcV1 = V1, SrcV2 = V2;
37957 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
37958 DAG) &&
37959 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
37960 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
37961 return SDValue(); // Nothing to do!
37962 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
37963 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
37964 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
37965 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37966 return DAG.getBitcast(RootVT, Res);
37969 if (MaskEltSizeInBits == 64 &&
37970 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
37971 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37972 V2.getScalarValueSizeInBits() <= 32) {
37973 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
37974 return SDValue(); // Nothing to do!
37975 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
37976 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
37977 CanonicalizeShuffleInput(MVT::v4f32, V1),
37978 CanonicalizeShuffleInput(MVT::v4f32, V2),
37979 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
37980 return DAG.getBitcast(RootVT, Res);
37984 SDValue NewV1 = V1; // Save operands in case early exit happens.
37985 SDValue NewV2 = V2;
37986 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
37987 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
37988 ShuffleVT, UnaryShuffle) &&
37989 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
37990 if (Depth == 0 && Root.getOpcode() == Shuffle)
37991 return SDValue(); // Nothing to do!
37992 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
37993 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
37994 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
37995 return DAG.getBitcast(RootVT, Res);
37998 NewV1 = V1; // Save operands in case early exit happens.
37999 NewV2 = V2;
38000 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
38001 AllowIntDomain, NewV1, NewV2, DL, DAG,
38002 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
38003 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38004 if (Depth == 0 && Root.getOpcode() == Shuffle)
38005 return SDValue(); // Nothing to do!
38006 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
38007 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
38008 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
38009 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38010 return DAG.getBitcast(RootVT, Res);
38013 // Typically from here on, we need an integer version of MaskVT.
38014 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
38015 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
38017 // Annoyingly, SSE4A instructions don't map into the above match helpers.
38018 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
38019 uint64_t BitLen, BitIdx;
38020 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
38021 Zeroable)) {
38022 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
38023 return SDValue(); // Nothing to do!
38024 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38025 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
38026 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38027 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38028 return DAG.getBitcast(RootVT, Res);
38031 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
38032 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
38033 return SDValue(); // Nothing to do!
38034 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38035 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
38036 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
38037 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38038 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38039 return DAG.getBitcast(RootVT, Res);
38043 // Match shuffle against TRUNCATE patterns.
38044 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
38045 // Match against a VTRUNC instruction, accounting for src/dst sizes.
38046 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
38047 Subtarget)) {
38048 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
38049 ShuffleSrcVT.getVectorNumElements();
38050 unsigned Opc =
38051 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
38052 if (Depth == 0 && Root.getOpcode() == Opc)
38053 return SDValue(); // Nothing to do!
38054 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38055 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
38056 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
38057 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
38058 return DAG.getBitcast(RootVT, Res);
38061 // Do we need a more general binary truncation pattern?
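// A mask of {0, 2, 4, 6, ...} across both sources is just a truncation of
// concat(V1, V2), so it can be formed as CONCAT_VECTORS + TRUNCATE below.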
38062 if (RootSizeInBits < 512 &&
38063 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
38064 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
38065 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
38066 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
38067 // Bail if this was already a truncation or PACK node.
38068 // We sometimes fail to match PACK if we demand known undef elements.
38069 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
38070 Root.getOpcode() == X86ISD::PACKSS ||
38071 Root.getOpcode() == X86ISD::PACKUS))
38072 return SDValue(); // Nothing to do!
38073 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38074 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
38075 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38076 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
38077 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38078 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
38079 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
38080 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
38081 return DAG.getBitcast(RootVT, Res);
38085 // Don't try to re-form single instruction chains under any circumstances now
38086 // that we've done encoding canonicalization for them.
38087 if (Depth < 1)
38088 return SDValue();
38090 // Depth threshold above which we can efficiently use variable mask shuffles.
38091 int VariableCrossLaneShuffleDepth =
38092 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
38093 int VariablePerLaneShuffleDepth =
38094 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
38095 AllowVariableCrossLaneMask &=
38096 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
38097 AllowVariablePerLaneMask &=
38098 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
38099 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
38100 // higher depth before combining them.
38101 bool AllowBWIVPERMV3 =
38102 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
38104 bool MaskContainsZeros = isAnyZero(Mask);
38106 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
38107 // If we have a single input lane-crossing shuffle then lower to VPERMV.
38108 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
38109 if (Subtarget.hasAVX2() &&
38110 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
38111 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
38112 Res = CanonicalizeShuffleInput(MaskVT, V1);
38113 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
38114 return DAG.getBitcast(RootVT, Res);
38116 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
38117 if ((Subtarget.hasAVX512() &&
38118 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38119 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
38120 (Subtarget.hasBWI() &&
38121 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38122 (Subtarget.hasVBMI() &&
38123 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
38124 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38125 V2 = DAG.getUNDEF(MaskVT);
38126 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38127 return DAG.getBitcast(RootVT, Res);
38131 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
38132 // vector as the second source (non-VLX will pad to 512-bit shuffles).
38133 if (UnaryShuffle && AllowVariableCrossLaneMask &&
38134 ((Subtarget.hasAVX512() &&
38135 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38136 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
38137 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
38138 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
38139 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38140 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38141 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38142 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
38143 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
38144 for (unsigned i = 0; i != NumMaskElts; ++i)
38145 if (Mask[i] == SM_SentinelZero)
38146 Mask[i] = NumMaskElts + i;
38147 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38148 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
38149 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38150 return DAG.getBitcast(RootVT, Res);
38153 // If that failed and either input is extracted then try to combine as a
38154 // shuffle with the larger type.
38155 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
38156 Inputs, Root, BaseMask, Depth, HasVariableMask,
38157 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
38158 Subtarget))
38159 return WideShuffle;
38161 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
38162 // (non-VLX will pad to 512-bit shuffles).
38163 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
38164 ((Subtarget.hasAVX512() &&
38165 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38166 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
38167 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
38168 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
38169 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38170 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38171 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38172 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
38173 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38174 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38175 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38176 return DAG.getBitcast(RootVT, Res);
38178 return SDValue();
38181 // See if we can combine a single input shuffle with zeros to a bit-mask,
38182 // which is much simpler than any shuffle.
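// e.g. Mask = {0, SM_SentinelZero, 2, SM_SentinelZero} keeps elements 0 and 2
// in place and clears the rest, which is just an AND with {-1, 0, -1, 0}.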
38183 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
38184 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
38185 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
38186 APInt Zero = APInt::getZero(MaskEltSizeInBits);
38187 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
38188 APInt UndefElts(NumMaskElts, 0);
38189 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
38190 for (unsigned i = 0; i != NumMaskElts; ++i) {
38191 int M = Mask[i];
38192 if (M == SM_SentinelUndef) {
38193 UndefElts.setBit(i);
38194 continue;
38196 if (M == SM_SentinelZero)
38197 continue;
38198 EltBits[i] = AllOnes;
38200 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
38201 Res = CanonicalizeShuffleInput(MaskVT, V1);
38202 unsigned AndOpcode =
38203 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
38204 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
38205 return DAG.getBitcast(RootVT, Res);
38208 // If we have a single input shuffle with different shuffle patterns in the
38209 // 128-bit lanes, lower to VPERMILPS with a variable mask.
38210 // TODO: Combine other mask types at higher depths.
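// e.g. a v8f32 mask of {1, 0, 3, 2, 4, 5, 6, 7} uses a different pattern in
// each 128-bit lane, which the immediate form of VPERMILPS can't express but
// the variable-mask form can (each index is used modulo 4 within its lane).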
38211 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
38212 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
38213 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
38214 SmallVector<SDValue, 16> VPermIdx;
38215 for (int M : Mask) {
38216 SDValue Idx =
38217 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
38218 VPermIdx.push_back(Idx);
38220 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
38221 Res = CanonicalizeShuffleInput(MaskVT, V1);
38222 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
38223 return DAG.getBitcast(RootVT, Res);
38226 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
38227 // to VPERMIL2PD/VPERMIL2PS.
38228 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
38229 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
38230 MaskVT == MVT::v8f32)) {
38231 // VPERMIL2 Operation.
38232 // Bits[3] - Match Bit.
38233 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
38234 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
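// A zeroable element is encoded by pushing selector value 8 (match bit set)
// and setting the M2Z immediate to 2; for 64-bit elements the index is
// doubled so that it lands in bits[2:1].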
38235 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
38236 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
38237 SmallVector<int, 8> VPerm2Idx;
38238 unsigned M2ZImm = 0;
38239 for (int M : Mask) {
38240 if (M == SM_SentinelUndef) {
38241 VPerm2Idx.push_back(-1);
38242 continue;
38244 if (M == SM_SentinelZero) {
38245 M2ZImm = 2;
38246 VPerm2Idx.push_back(8);
38247 continue;
38249 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
38250 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
38251 VPerm2Idx.push_back(Index);
38253 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38254 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38255 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
38256 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
38257 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
38258 return DAG.getBitcast(RootVT, Res);
38261 // If we have 3 or more shuffle instructions or a chain involving a variable
38262 // mask, we can replace them with a single PSHUFB instruction profitably.
38263 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
38264 // instructions, but in practice PSHUFB tends to be *very* fast, so we're
38265 // more aggressive.
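// In the PSHUFB mask a byte with bit 7 set (0x80) zeroes that destination
// byte, so SM_SentinelZero maps to 0x80 and undef elements stay undef.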
38266 if (UnaryShuffle && AllowVariablePerLaneMask &&
38267 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
38268 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
38269 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
38270 SmallVector<SDValue, 16> PSHUFBMask;
38271 int NumBytes = RootVT.getSizeInBits() / 8;
38272 int Ratio = NumBytes / NumMaskElts;
38273 for (int i = 0; i < NumBytes; ++i) {
38274 int M = Mask[i / Ratio];
38275 if (M == SM_SentinelUndef) {
38276 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
38277 continue;
38279 if (M == SM_SentinelZero) {
38280 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38281 continue;
38283 M = Ratio * M + i % Ratio;
38284 assert((M / 16) == (i / 16) && "Lane crossing detected");
38285 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38287 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
38288 Res = CanonicalizeShuffleInput(ByteVT, V1);
38289 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
38290 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
38291 return DAG.getBitcast(RootVT, Res);
38294 // With XOP, if we have a 128-bit binary input shuffle we can always combine
38295 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
38296 // slower than PSHUFB on targets that support both.
38297 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
38298 Subtarget.hasXOP()) {
38299 // VPPERM Mask Operation
38300 // Bits[4:0] - Byte Index (0 - 31)
38301 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
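// e.g. a selector byte of 0x12 copies byte 18 (byte 2 of the second source),
// while 0x80 (operation 4) zeroes the destination byte.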
38302 SmallVector<SDValue, 16> VPPERMMask;
38303 int NumBytes = 16;
38304 int Ratio = NumBytes / NumMaskElts;
38305 for (int i = 0; i < NumBytes; ++i) {
38306 int M = Mask[i / Ratio];
38307 if (M == SM_SentinelUndef) {
38308 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
38309 continue;
38311 if (M == SM_SentinelZero) {
38312 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38313 continue;
38315 M = Ratio * M + i % Ratio;
38316 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38318 MVT ByteVT = MVT::v16i8;
38319 V1 = CanonicalizeShuffleInput(ByteVT, V1);
38320 V2 = CanonicalizeShuffleInput(ByteVT, V2);
38321 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
38322 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
38323 return DAG.getBitcast(RootVT, Res);
38326 // If that failed and either input is extracted then try to combine as a
38327 // shuffle with the larger type.
38328 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
38329 Inputs, Root, BaseMask, Depth, HasVariableMask,
38330 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
38331 return WideShuffle;
38333 // If we have a dual input shuffle then lower to VPERMV3,
38334 // (non-VLX will pad to 512-bit shuffles)
38335 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
38336 ((Subtarget.hasAVX512() &&
38337 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
38338 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
38339 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
38340 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
38341 MaskVT == MVT::v16i32)) ||
38342 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38343 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
38344 MaskVT == MVT::v32i16)) ||
38345 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38346 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
38347 MaskVT == MVT::v64i8)))) {
38348 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38349 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38350 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38351 return DAG.getBitcast(RootVT, Res);
38354 // Failed to find any combines.
38355 return SDValue();
38358 // Combine an arbitrary chain of shuffles + extract_subvectors into a single
38359 // instruction if possible.
38361 // Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
38362 // type size to attempt to combine:
38363 // shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
38364 // -->
38365 // extract_subvector(shuffle(x,y,m2),0)
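// e.g. a v4i32 shuffle of two v4i32 subvectors extracted from v8i32 sources
// becomes a v8i32 shuffle of the original sources (mask widened and padded
// with undefs), followed by an extract of the low 128 bits.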
38366 static SDValue combineX86ShuffleChainWithExtract(
38367 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38368 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38369 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38370 const X86Subtarget &Subtarget) {
38371 unsigned NumMaskElts = BaseMask.size();
38372 unsigned NumInputs = Inputs.size();
38373 if (NumInputs == 0)
38374 return SDValue();
38376 EVT RootVT = Root.getValueType();
38377 unsigned RootSizeInBits = RootVT.getSizeInBits();
38378 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
38379 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
38381 // Peek through extract_subvector to find widest legal vector.
38382 // TODO: Handle ISD::TRUNCATE
38383 unsigned WideSizeInBits = RootSizeInBits;
38384 for (unsigned I = 0; I != NumInputs; ++I) {
38385 SDValue Input = peekThroughBitcasts(Inputs[I]);
38386 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
38387 Input = peekThroughBitcasts(Input.getOperand(0));
38388 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
38389 WideSizeInBits < Input.getValueSizeInBits())
38390 WideSizeInBits = Input.getValueSizeInBits();
38393 // Bail if we fail to find a source larger than the existing root.
38394 unsigned Scale = WideSizeInBits / RootSizeInBits;
38395 if (WideSizeInBits <= RootSizeInBits ||
38396 (WideSizeInBits % RootSizeInBits) != 0)
38397 return SDValue();
38399 // Create new mask for larger type.
38400 SmallVector<int, 64> WideMask(BaseMask);
38401 for (int &M : WideMask) {
38402 if (M < 0)
38403 continue;
38404 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
38406 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
38408 // Attempt to peek through inputs and adjust mask when we extract from an
38409 // upper subvector.
38410 int AdjustedMasks = 0;
38411 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
38412 for (unsigned I = 0; I != NumInputs; ++I) {
38413 SDValue &Input = WideInputs[I];
38414 Input = peekThroughBitcasts(Input);
38415 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
38416 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
38417 uint64_t Idx = Input.getConstantOperandVal(1);
38418 if (Idx != 0) {
38419 ++AdjustedMasks;
38420 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
38421 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
38423 int lo = I * WideMask.size();
38424 int hi = (I + 1) * WideMask.size();
38425 for (int &M : WideMask)
38426 if (lo <= M && M < hi)
38427 M += Idx;
38429 Input = peekThroughBitcasts(Input.getOperand(0));
38433 // Remove unused/repeated shuffle source ops.
38434 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
38435 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
38437 // Bail if we're always extracting from the lowest subvectors
38438 // (combineX86ShuffleChain should match this for the current width), or if the
38439 // shuffle still references too many inputs.
38440 if (AdjustedMasks == 0 || WideInputs.size() > 2)
38441 return SDValue();
38443 // Minor canonicalization of the accumulated shuffle mask to make it easier
38444 // to match below. All this does is detect masks with sequential pairs of
38445 // elements, and shrink them to the half-width mask. It does this in a loop
38446 // so it will reduce the size of the mask to the minimal width mask which
38447 // performs an equivalent shuffle.
38448 while (WideMask.size() > 1) {
38449 SmallVector<int, 64> WidenedMask;
38450 if (!canWidenShuffleElements(WideMask, WidenedMask))
38451 break;
38452 WideMask = std::move(WidenedMask);
38455 // Canonicalization of binary shuffle masks to improve pattern matching by
38456 // commuting the inputs.
38457 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
38458 ShuffleVectorSDNode::commuteMask(WideMask);
38459 std::swap(WideInputs[0], WideInputs[1]);
38462 // Increase depth for every upper subvector we've peeked through.
38463 Depth += AdjustedMasks;
38465 // Attempt to combine wider chain.
38466 // TODO: Can we use a better Root?
38467 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
38468 WideInputs.back().getValueSizeInBits()
38469 ? WideInputs.front()
38470 : WideInputs.back();
38471 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
38472 "WideRootSize mismatch");
38474 if (SDValue WideShuffle =
38475 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
38476 HasVariableMask, AllowVariableCrossLaneMask,
38477 AllowVariablePerLaneMask, DAG, Subtarget)) {
38478 WideShuffle =
38479 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
38480 return DAG.getBitcast(RootVT, WideShuffle);
38483 return SDValue();
38486 // Canonicalize the combined shuffle mask chain with horizontal ops.
38487 // NOTE: This may update the Ops and Mask.
38488 static SDValue canonicalizeShuffleMaskWithHorizOp(
38489 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
38490 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
38491 const X86Subtarget &Subtarget) {
38492 if (Mask.empty() || Ops.empty())
38493 return SDValue();
38495 SmallVector<SDValue> BC;
38496 for (SDValue Op : Ops)
38497 BC.push_back(peekThroughBitcasts(Op));
38499 // All ops must be the same horizop + type.
38500 SDValue BC0 = BC[0];
38501 EVT VT0 = BC0.getValueType();
38502 unsigned Opcode0 = BC0.getOpcode();
38503 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
38504 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
38506 return SDValue();
38508 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
38509 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
38510 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
38511 if (!isHoriz && !isPack)
38512 return SDValue();
38514 // Do all ops have a single use?
38515 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
38516 return Op.hasOneUse() &&
38517 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
38520 int NumElts = VT0.getVectorNumElements();
38521 int NumLanes = VT0.getSizeInBits() / 128;
38522 int NumEltsPerLane = NumElts / NumLanes;
38523 int NumHalfEltsPerLane = NumEltsPerLane / 2;
38524 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
38525 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
38527 if (NumEltsPerLane >= 4 &&
38528 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
38529 SmallVector<int> LaneMask, ScaledMask;
38530 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
38531 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
38532 // See if we can remove the shuffle by reordering the HOP chain so that
38533 // the HOP args are pre-shuffled.
38534 // TODO: Generalize to any sized/depth chain.
38535 // TODO: Add support for PACKSS/PACKUS.
38536 if (isHoriz) {
38537 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
38538 auto GetHOpSrc = [&](int M) {
38539 if (M == SM_SentinelUndef)
38540 return DAG.getUNDEF(VT0);
38541 if (M == SM_SentinelZero)
38542 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
38543 SDValue Src0 = BC[M / 4];
38544 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
38545 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
38546 return Src1.getOperand(M % 2);
38547 return SDValue();
38549 SDValue M0 = GetHOpSrc(ScaledMask[0]);
38550 SDValue M1 = GetHOpSrc(ScaledMask[1]);
38551 SDValue M2 = GetHOpSrc(ScaledMask[2]);
38552 SDValue M3 = GetHOpSrc(ScaledMask[3]);
38553 if (M0 && M1 && M2 && M3) {
38554 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
38555 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
38556 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
38559 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
38560 if (Ops.size() >= 2) {
38561 SDValue LHS, RHS;
38562 auto GetHOpSrc = [&](int M, int &OutM) {
38563 // TODO: Support SM_SentinelZero
38564 if (M < 0)
38565 return M == SM_SentinelUndef;
38566 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
38567 if (!LHS || LHS == Src) {
38568 LHS = Src;
38569 OutM = (M % 2);
38570 return true;
38572 if (!RHS || RHS == Src) {
38573 RHS = Src;
38574 OutM = (M % 2) + 2;
38575 return true;
38577 return false;
38579 int PostMask[4] = {-1, -1, -1, -1};
38580 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
38581 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
38582 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
38583 GetHOpSrc(ScaledMask[3], PostMask[3])) {
38584 LHS = DAG.getBitcast(SrcVT, LHS);
38585 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
38586 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
38587 // Use SHUFPS for the permute so this will work on SSE2 targets,
38588 // shuffle combining and domain handling will simplify this later on.
38589 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
38590 Res = DAG.getBitcast(ShuffleVT, Res);
38591 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
38592 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
38598 if (2 < Ops.size())
38599 return SDValue();
38601 SDValue BC1 = BC[BC.size() - 1];
38602 if (Mask.size() == VT0.getVectorNumElements()) {
38603 // Canonicalize binary shuffles of horizontal ops that use the
38604 // same sources to a unary shuffle.
38605 // TODO: Try to perform this fold even if the shuffle remains.
38606 if (Ops.size() == 2) {
38607 auto ContainsOps = [](SDValue HOp, SDValue Op) {
38608 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
38610 // Commute if all BC0's ops are contained in BC1.
38611 if (ContainsOps(BC1, BC0.getOperand(0)) &&
38612 ContainsOps(BC1, BC0.getOperand(1))) {
38613 ShuffleVectorSDNode::commuteMask(Mask);
38614 std::swap(Ops[0], Ops[1]);
38615 std::swap(BC0, BC1);
38618 // If BC1 can be represented by BC0, then convert to unary shuffle.
38619 if (ContainsOps(BC0, BC1.getOperand(0)) &&
38620 ContainsOps(BC0, BC1.getOperand(1))) {
38621 for (int &M : Mask) {
38622 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
38623 continue;
38624 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
38625 M -= NumElts + (SubLane * NumHalfEltsPerLane);
38626 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
38627 M += NumHalfEltsPerLane;
38632 // Canonicalize unary horizontal ops to only refer to lower halves.
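// e.g. for HADD(X, X) the lower and upper halves of each 128-bit lane hold
// the same results, so references to the upper half can be remapped to the
// lower half.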
38633 for (int i = 0; i != NumElts; ++i) {
38634 int &M = Mask[i];
38635 if (isUndefOrZero(M))
38636 continue;
38637 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
38638 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
38639 M -= NumHalfEltsPerLane;
38640 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
38641 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
38642 M -= NumHalfEltsPerLane;
38646 // Combine a binary shuffle of 2 similar 'Horizontal' instructions into a
38647 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
38648 // represents the LHS/RHS inputs for the lower/upper halves.
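// e.g. shuffling HADD(A, B) and HADD(C, D) so that each lane keeps the first
// op's LHS results and the second op's RHS results can be combined into
// HADD(A, D).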
38649 SmallVector<int, 16> TargetMask128, WideMask128;
38650 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
38651 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
38652 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
38653 bool SingleOp = (Ops.size() == 1);
38654 if (isPack || OneUseOps ||
38655 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
38656 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
38657 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
38658 Lo = Lo.getOperand(WideMask128[0] & 1);
38659 Hi = Hi.getOperand(WideMask128[1] & 1);
38660 if (SingleOp) {
38661 SDValue Undef = DAG.getUNDEF(SrcVT);
38662 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
38663 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
38664 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
38665 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
38666 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
38668 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
38672 // If we are post-shuffling a 256-bit hop and not requiring the upper
38673 // elements, then try to narrow to a 128-bit hop directly.
38674 SmallVector<int, 16> WideMask64;
38675 if (Ops.size() == 1 && NumLanes == 2 &&
38676 scaleShuffleElements(Mask, 4, WideMask64) &&
38677 isUndefInRange(WideMask64, 2, 2)) {
38678 int M0 = WideMask64[0];
38679 int M1 = WideMask64[1];
38680 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
38681 MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
38682 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
38683 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
38684 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
38685 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
38686 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
38687 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
38691 return SDValue();
38694 // Attempt to constant fold all of the constant source ops.
38695 // Returns the folded constant if the entire shuffle folds to a constant.
38696 // TODO: Extend this to merge multiple constant Ops and update the mask.
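// e.g. a shuffle whose sources are all constant build vectors can be replaced
// outright by a single reshuffled constant vector, with zeroable and undef
// mask elements becoming zero/undef lanes.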
38697 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
38698 ArrayRef<int> Mask, SDValue Root,
38699 bool HasVariableMask,
38700 SelectionDAG &DAG,
38701 const X86Subtarget &Subtarget) {
38702 MVT VT = Root.getSimpleValueType();
38704 unsigned SizeInBits = VT.getSizeInBits();
38705 unsigned NumMaskElts = Mask.size();
38706 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
38707 unsigned NumOps = Ops.size();
38709 // Extract constant bits from each source op.
38710 SmallVector<APInt, 16> UndefEltsOps(NumOps);
38711 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
38712 for (unsigned I = 0; I != NumOps; ++I)
38713 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
38714 RawBitsOps[I]))
38715 return SDValue();
38717 // If we're optimizing for size, only fold if at least one of the constants is
38718 // only used once or the combined shuffle has included a variable mask
38719 // shuffle; this avoids constant pool bloat.
38720 bool IsOptimizingSize = DAG.shouldOptForSize();
38721 if (IsOptimizingSize && !HasVariableMask &&
38722 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
38723 return SDValue();
38725 // Shuffle the constant bits according to the mask.
38726 SDLoc DL(Root);
38727 APInt UndefElts(NumMaskElts, 0);
38728 APInt ZeroElts(NumMaskElts, 0);
38729 APInt ConstantElts(NumMaskElts, 0);
38730 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
38731 APInt::getZero(MaskSizeInBits));
38732 for (unsigned i = 0; i != NumMaskElts; ++i) {
38733 int M = Mask[i];
38734 if (M == SM_SentinelUndef) {
38735 UndefElts.setBit(i);
38736 continue;
38737 } else if (M == SM_SentinelZero) {
38738 ZeroElts.setBit(i);
38739 continue;
38741 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
38743 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
38744 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
38746 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
38747 if (SrcUndefElts[SrcMaskIdx]) {
38748 UndefElts.setBit(i);
38749 continue;
38752 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
38753 APInt &Bits = SrcEltBits[SrcMaskIdx];
38754 if (!Bits) {
38755 ZeroElts.setBit(i);
38756 continue;
38759 ConstantElts.setBit(i);
38760 ConstantBitData[i] = Bits;
38762 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
38764 // Attempt to create a zero vector.
38765 if ((UndefElts | ZeroElts).isAllOnes())
38766 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
38768 // Create the constant data.
38769 MVT MaskSVT;
38770 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
38771 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
38772 else
38773 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
38775 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
38776 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
38777 return SDValue();
38779 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
38780 return DAG.getBitcast(VT, CstOp);
38783 namespace llvm {
38784 namespace X86 {
38785 enum {
38786 MaxShuffleCombineDepth = 8
38788 } // namespace X86
38789 } // namespace llvm
38791 /// Fully generic combining of x86 shuffle instructions.
38793 /// This should be the last combine run over the x86 shuffle instructions. Once
38794 /// they have been fully optimized, this will recursively consider all chains
38795 /// of single-use shuffle instructions, build a generic model of the cumulative
38796 /// shuffle operation, and check for simpler instructions which implement this
38797 /// operation. We use this primarily for two purposes:
38799 /// 1) Collapse generic shuffles to specialized single instructions when
38800 /// equivalent. In most cases, this is just an encoding size win, but
38801 /// sometimes we will collapse multiple generic shuffles into a single
38802 /// special-purpose shuffle.
38803 /// 2) Look for sequences of shuffle instructions with 3 or more total
38804 /// instructions, and replace them with the slightly more expensive SSSE3
38805 /// PSHUFB instruction if available. We do this as the last combining step
38806 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
38807 /// a suitable short sequence of other instructions. The PSHUFB will either
38808 /// use a register or have to read from memory and so is slightly (but only
38809 /// slightly) more expensive than the other shuffle instructions.
38811 /// Because this is inherently a quadratic operation (for each shuffle in
38812 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
38813 /// This should never be an issue in practice as the shuffle lowering doesn't
38814 /// produce sequences of more than 8 instructions.
38816 /// FIXME: We will currently miss some cases where the redundant shuffling
38817 /// would simplify under the threshold for PSHUFB formation because of
38818 /// combine-ordering. To fix this, we should do the redundant instruction
38819 /// combining in this recursive walk.
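///
/// As a small illustration, PSHUFD(PSHUFD(x, 0xB1), 0xB1) composes the mask
/// <1,0,3,2> with itself into the identity mask - exactly the kind of
/// redundancy this recursive walk is meant to expose and remove.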
38820 static SDValue combineX86ShufflesRecursively(
38821 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
38822 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
38823 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
38824 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38825 const X86Subtarget &Subtarget) {
38826 assert(!RootMask.empty() &&
38827 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
38828 "Illegal shuffle root mask");
38829 MVT RootVT = Root.getSimpleValueType();
38830 assert(RootVT.isVector() && "Shuffles operate on vector types!");
38831 unsigned RootSizeInBits = RootVT.getSizeInBits();
38833 // Bound the depth of our recursive combine because this is ultimately
38834 // quadratic in nature.
38835 if (Depth >= MaxDepth)
38836 return SDValue();
38838 // Directly rip through bitcasts to find the underlying operand.
38839 SDValue Op = SrcOps[SrcOpIndex];
38840 Op = peekThroughOneUseBitcasts(Op);
38842 EVT VT = Op.getValueType();
38843 if (!VT.isVector() || !VT.isSimple())
38844 return SDValue(); // Bail if we hit a non-simple non-vector.
38846 // FIXME: Just bail on f16 for now.
38847 if (VT.getVectorElementType() == MVT::f16)
38848 return SDValue();
38850 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
38851 "Can only combine shuffles upto size of the root op.");
38853 // Create a demanded elts mask from the referenced elements of Op.
38854 APInt OpDemandedElts = APInt::getZero(RootMask.size());
38855 for (int M : RootMask) {
38856 int BaseIdx = RootMask.size() * SrcOpIndex;
38857 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
38858 OpDemandedElts.setBit(M - BaseIdx);
38860 if (RootSizeInBits != VT.getSizeInBits()) {
38861 // Op is smaller than Root - extract the demanded elts for the subvector.
38862 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
38863 unsigned NumOpMaskElts = RootMask.size() / Scale;
38864 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
38865 assert(OpDemandedElts
38866 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
38867 .isZero() &&
38868 "Out of range elements referenced in root mask");
38869 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
38871 OpDemandedElts =
38872 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
38874 // Extract target shuffle mask and resolve sentinels and inputs.
38875 SmallVector<int, 64> OpMask;
38876 SmallVector<SDValue, 2> OpInputs;
38877 APInt OpUndef, OpZero;
38878 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
38879 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
38880 OpZero, DAG, Depth, false)) {
38881 // Shuffle inputs must not be larger than the shuffle result.
38882 // TODO: Relax this for single input faux shuffles (e.g. trunc).
38883 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
38884 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
38886 return SDValue();
38887 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
38888 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
38889 !isNullConstant(Op.getOperand(1))) {
38890 SDValue SrcVec = Op.getOperand(0);
38891 int ExtractIdx = Op.getConstantOperandVal(1);
38892 unsigned NumElts = VT.getVectorNumElements();
38893 OpInputs.assign({SrcVec});
38894 OpMask.assign(NumElts, SM_SentinelUndef);
38895 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
38896 OpZero = OpUndef = APInt::getZero(NumElts);
38897 } else {
38898 return SDValue();
38901 // If the shuffle result was smaller than the root, we need to adjust the
38902 // mask indices and pad the mask with undefs.
38903 if (RootSizeInBits > VT.getSizeInBits()) {
38904 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
38905 unsigned OpMaskSize = OpMask.size();
38906 if (OpInputs.size() > 1) {
38907 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
38908 for (int &M : OpMask) {
38909 if (M < 0)
38910 continue;
38911 int EltIdx = M % OpMaskSize;
38912 int OpIdx = M / OpMaskSize;
38913 M = (PaddedMaskSize * OpIdx) + EltIdx;
38916 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
38917 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
38918 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
38921 SmallVector<int, 64> Mask;
38922 SmallVector<SDValue, 16> Ops;
38924 // We don't need to merge masks if the root is empty.
38925 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
38926 if (EmptyRoot) {
38927 // Only resolve zeros if it will remove an input; otherwise we might end
38928 // up in an infinite loop.
38929 bool ResolveKnownZeros = true;
38930 if (!OpZero.isZero()) {
38931 APInt UsedInputs = APInt::getZero(OpInputs.size());
38932 for (int i = 0, e = OpMask.size(); i != e; ++i) {
38933 int M = OpMask[i];
38934 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
38935 continue;
38936 UsedInputs.setBit(M / OpMask.size());
38937 if (UsedInputs.isAllOnes()) {
38938 ResolveKnownZeros = false;
38939 break;
38943 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
38944 ResolveKnownZeros);
38946 Mask = OpMask;
38947 Ops.append(OpInputs.begin(), OpInputs.end());
38948 } else {
38949 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
38951 // Add the inputs to the Ops list, avoiding duplicates.
38952 Ops.append(SrcOps.begin(), SrcOps.end());
38954 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
38955 // Attempt to find an existing match.
38956 SDValue InputBC = peekThroughBitcasts(Input);
38957 for (int i = 0, e = Ops.size(); i < e; ++i)
38958 if (InputBC == peekThroughBitcasts(Ops[i]))
38959 return i;
38960 // Match failed - should we replace an existing Op?
38961 if (InsertionPoint >= 0) {
38962 Ops[InsertionPoint] = Input;
38963 return InsertionPoint;
38965 // Add to the end of the Ops list.
38966 Ops.push_back(Input);
38967 return Ops.size() - 1;
38970 SmallVector<int, 2> OpInputIdx;
38971 for (SDValue OpInput : OpInputs)
38972 OpInputIdx.push_back(
38973 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
38975 assert(((RootMask.size() > OpMask.size() &&
38976 RootMask.size() % OpMask.size() == 0) ||
38977 (OpMask.size() > RootMask.size() &&
38978 OpMask.size() % RootMask.size() == 0) ||
38979 OpMask.size() == RootMask.size()) &&
38980 "The smaller number of elements must divide the larger.");
38982 // This function can be performance-critical, so we rely on the power-of-2
38983 // knowledge that we have about the mask sizes to replace div/rem ops with
38984 // bit-masks and shifts.
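// (E.g. i / RootRatio becomes i >> RootRatioLog2 below, and a remainder such
// as RootMaskedIdx % MaskWidth becomes RootMaskedIdx & (MaskWidth - 1).)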
38985 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
38986 "Non-power-of-2 shuffle mask sizes");
38987 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
38988 "Non-power-of-2 shuffle mask sizes");
38989 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
38990 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
38992 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
38993 unsigned RootRatio =
38994 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
38995 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
38996 assert((RootRatio == 1 || OpRatio == 1) &&
38997 "Must not have a ratio for both incoming and op masks!");
38999 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
39000 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
39001 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
39002 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
39003 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
39005 Mask.resize(MaskWidth, SM_SentinelUndef);
39007 // Merge this shuffle operation's mask into our accumulated mask. Note that
39008 // this shuffle's mask will be the first applied to the input, followed by
39009 // the root mask to get us all the way to the root value arrangement. The
39010 // reason for this order is that we are recursing up the operation chain.
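// Worked example: with a RootMask of size 4 and an OpMask of size 8,
// MaskWidth is 8, RootRatio is 2 and OpRatio is 1, so each root mask element
// expands to two adjacent op-granularity elements before being mapped
// through OpMask.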
39011 for (unsigned i = 0; i < MaskWidth; ++i) {
39012 unsigned RootIdx = i >> RootRatioLog2;
39013 if (RootMask[RootIdx] < 0) {
39014 // This is a zero or undef lane, we're done.
39015 Mask[i] = RootMask[RootIdx];
39016 continue;
39019 unsigned RootMaskedIdx =
39020 RootRatio == 1
39021 ? RootMask[RootIdx]
39022 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
39024 // Just insert the scaled root mask value if it references an input other
39025 // than the SrcOp we're currently inserting.
39026 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
39027 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
39028 Mask[i] = RootMaskedIdx;
39029 continue;
39032 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
39033 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
39034 if (OpMask[OpIdx] < 0) {
39035 // The incoming lanes are zero or undef; it doesn't matter which ones we
39036 // are using.
39037 Mask[i] = OpMask[OpIdx];
39038 continue;
39041 // OK, we have non-zero lanes; map them through to one of the Op's inputs.
39042 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
39043 : (OpMask[OpIdx] << OpRatioLog2) +
39044 (RootMaskedIdx & (OpRatio - 1));
39046 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
39047 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
39048 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
39049 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
39051 Mask[i] = OpMaskedIdx;
39055 // Peek through vector widenings and set out of bounds mask indices to undef.
39056 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
39057 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
39058 SDValue &Op = Ops[I];
39059 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
39060 isNullConstant(Op.getOperand(2))) {
39061 Op = Op.getOperand(1);
39062 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
39063 int Lo = I * Mask.size();
39064 int Hi = (I + 1) * Mask.size();
39065 int NewHi = Lo + (Mask.size() / Scale);
39066 for (int &M : Mask) {
39067 if (Lo <= M && NewHi <= M && M < Hi)
39068 M = SM_SentinelUndef;
39073 // Peek through any free extract_subvector nodes back to root size.
39074 for (SDValue &Op : Ops)
39075 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39076 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
39077 isNullConstant(Op.getOperand(1)))
39078 Op = Op.getOperand(0);
39080 // Remove unused/repeated shuffle source ops.
39081 resolveTargetShuffleInputsAndMask(Ops, Mask);
39083 // Handle the all undef/zero/ones cases early.
39084 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
39085 return DAG.getUNDEF(RootVT);
39086 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
39087 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
39088 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
39089 !llvm::is_contained(Mask, SM_SentinelZero))
39090 return getOnesVector(RootVT, DAG, SDLoc(Root));
39092 assert(!Ops.empty() && "Shuffle with no inputs detected");
39093 HasVariableMask |= IsOpVariableMask;
39095 // Update the list of shuffle nodes that have been combined so far.
39096 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
39097 SrcNodes.end());
39098 CombinedNodes.push_back(Op.getNode());
39100 // See if we can recurse into each shuffle source op (if it's a target
39101 // shuffle). The source op should only be generally combined if it either has
39102 // a single use (i.e. the current Op) or all its users have already been
39103 // combined; if not, we can still combine but should prevent generation of
39104 // variable shuffles to avoid constant pool bloat.
39105 // Don't recurse if we already have more source ops than we can combine in
39106 // the remaining recursion depth.
39107 if (Ops.size() < (MaxDepth - Depth)) {
39108 for (int i = 0, e = Ops.size(); i < e; ++i) {
39109 // For empty roots, we need to resolve zeroable elements before combining
39110 // them with other shuffles.
39111 SmallVector<int, 64> ResolvedMask = Mask;
39112 if (EmptyRoot)
39113 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
39114 bool AllowCrossLaneVar = false;
39115 bool AllowPerLaneVar = false;
39116 if (Ops[i].getNode()->hasOneUse() ||
39117 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
39118 AllowCrossLaneVar = AllowVariableCrossLaneMask;
39119 AllowPerLaneVar = AllowVariablePerLaneMask;
39121 if (SDValue Res = combineX86ShufflesRecursively(
39122 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
39123 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
39124 Subtarget))
39125 return Res;
39129 // Attempt to constant fold all of the constant source ops.
39130 if (SDValue Cst = combineX86ShufflesConstants(
39131 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
39132 return Cst;
39134 // If constant folding failed and we only have constants, then we have
39135 // multiple uses by a single non-variable shuffle - just bail.
39136 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
39137 APInt UndefElts;
39138 SmallVector<APInt> RawBits;
39139 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
39140 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
39141 RawBits);
39142 })) {
39143 return SDValue();
39146 // Canonicalize the combined shuffle mask chain with horizontal ops.
39147 // NOTE: This will update the Ops and Mask.
39148 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
39149 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
39150 return DAG.getBitcast(RootVT, HOp);
39152 // Try to refine our inputs given our knowledge of target shuffle mask.
39153 for (auto I : enumerate(Ops)) {
39154 int OpIdx = I.index();
39155 SDValue &Op = I.value();
39157 // What range of shuffle mask element values results in picking from Op?
39158 int Lo = OpIdx * Mask.size();
39159 int Hi = Lo + Mask.size();
39161 // Which elements of Op do we demand, given the mask's granularity?
39162 APInt OpDemandedElts(Mask.size(), 0);
39163 for (int MaskElt : Mask) {
39164 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
39165 int OpEltIdx = MaskElt - Lo;
39166 OpDemandedElts.setBit(OpEltIdx);
39170 // Is the shuffle result smaller than the root?
39171 if (Op.getValueSizeInBits() < RootSizeInBits) {
39172 // We padded the mask with undefs. But we now need to undo that.
39173 unsigned NumExpectedVectorElts = Mask.size();
39174 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
39175 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
39176 assert(!OpDemandedElts.extractBits(
39177 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
39178 "Demanding the virtual undef widening padding?");
39179 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
39182 // The Op itself may be of different VT, so we need to scale the mask.
39183 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
39184 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
39186 // Can this operand be simplified any further, given its demanded elements?
39187 if (SDValue NewOp =
39188 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
39189 Op, OpScaledDemandedElts, DAG))
39190 Op = NewOp;
39192 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
39194 // Widen any subvector shuffle inputs we've collected.
39195 // TODO: Remove this to avoid generating temporary nodes; we should only
39196 // widen once combineX86ShuffleChain has found a match.
39197 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
39198 return Op.getValueSizeInBits() < RootSizeInBits;
39199 })) {
39200 for (SDValue &Op : Ops)
39201 if (Op.getValueSizeInBits() < RootSizeInBits)
39202 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
39203 RootSizeInBits);
39204 // Reresolve - we might have repeated subvector sources.
39205 resolveTargetShuffleInputsAndMask(Ops, Mask);
39208 // We can only combine unary and binary shuffle mask cases.
39209 if (Ops.size() <= 2) {
39210 // Minor canonicalization of the accumulated shuffle mask to make it easier
39211 // to match below. All this does is detect masks with sequential pairs of
39212 // elements, and shrink them to the half-width mask. It does this in a loop
39213 // so it will reduce the size of the mask to the minimal width mask which
39214 // performs an equivalent shuffle.
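// For example, the v8i16 mask <0,1,4,5,2,3,6,7> widens to the v4i32 mask
// <0,2,1,3> and then stops, since those elements no longer form sequential
// pairs.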
39215 while (Mask.size() > 1) {
39216 SmallVector<int, 64> WidenedMask;
39217 if (!canWidenShuffleElements(Mask, WidenedMask))
39218 break;
39219 Mask = std::move(WidenedMask);
39222 // Canonicalization of binary shuffle masks to improve pattern matching by
39223 // commuting the inputs.
39224 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
39225 ShuffleVectorSDNode::commuteMask(Mask);
39226 std::swap(Ops[0], Ops[1]);
39229 // Try to combine into a single shuffle instruction.
39230 if (SDValue Shuffle = combineX86ShuffleChain(
39231 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
39232 AllowVariablePerLaneMask, DAG, Subtarget))
39233 return Shuffle;
39235 // If all the operands come from the same larger vector, fall through and try
39236 // to use combineX86ShuffleChainWithExtract.
39237 SDValue LHS = peekThroughBitcasts(Ops.front());
39238 SDValue RHS = peekThroughBitcasts(Ops.back());
39239 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
39240 (RootSizeInBits / Mask.size()) != 64 ||
39241 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
39242 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
39243 LHS.getOperand(0) != RHS.getOperand(0))
39244 return SDValue();
39247 // If that failed and any input is extracted, then try to combine as a
39248 // shuffle with the larger type.
39249 return combineX86ShuffleChainWithExtract(
39250 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
39251 AllowVariablePerLaneMask, DAG, Subtarget);
39254 /// Helper entry wrapper to combineX86ShufflesRecursively.
39255 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
39256 const X86Subtarget &Subtarget) {
39257 return combineX86ShufflesRecursively(
39258 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
39259 /*HasVarMask*/ false,
39260 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
39261 Subtarget);
39264 /// Get the PSHUF-style mask from a PSHUF node.
39266 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
39267 /// PSHUF-style masks that can be reused with such instructions.
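///
/// For example, a PSHUFLW node with immediate 0x1B yields the low-word mask
/// <3,2,1,0> here; the untouched high words are dropped from the result.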
39268 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
39269 MVT VT = N.getSimpleValueType();
39270 SmallVector<int, 4> Mask;
39271 SmallVector<SDValue, 2> Ops;
39272 bool HaveMask =
39273 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
39274 (void)HaveMask;
39275 assert(HaveMask);
39277 // If we have more than 128 bits, only the low 128 bits of the shuffle mask
39278 // matter. Check that the upper masks are repeats and remove them.
39279 if (VT.getSizeInBits() > 128) {
39280 int LaneElts = 128 / VT.getScalarSizeInBits();
39281 #ifndef NDEBUG
39282 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
39283 for (int j = 0; j < LaneElts; ++j)
39284 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
39285 "Mask doesn't repeat in high 128-bit lanes!");
39286 #endif
39287 Mask.resize(LaneElts);
39290 switch (N.getOpcode()) {
39291 case X86ISD::PSHUFD:
39292 return Mask;
39293 case X86ISD::PSHUFLW:
39294 Mask.resize(4);
39295 return Mask;
39296 case X86ISD::PSHUFHW:
39297 Mask.erase(Mask.begin(), Mask.begin() + 4);
39298 for (int &M : Mask)
39299 M -= 4;
39300 return Mask;
39301 default:
39302 llvm_unreachable("No valid shuffle instruction found!");
39306 /// Search for a combinable shuffle across a chain ending in pshufd.
39308 /// We walk up the chain and look for a combinable shuffle, skipping over
39309 /// shuffles that we could hoist this shuffle's transformation past without
39310 /// altering anything.
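///
/// For instance, a PSHUFD that keeps the low dwords in place and only permutes
/// the high dwords can be hoisted past a PSHUFLW, giving it a chance to merge
/// with an earlier dword shuffle.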
39311 static SDValue
39312 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
39313 SelectionDAG &DAG) {
39314 assert(N.getOpcode() == X86ISD::PSHUFD &&
39315 "Called with something other than an x86 128-bit half shuffle!");
39316 SDLoc DL(N);
39318 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
39319 // of the shuffles in the chain so that we can form a fresh chain to replace
39320 // this one.
39321 SmallVector<SDValue, 8> Chain;
39322 SDValue V = N.getOperand(0);
39323 for (; V.hasOneUse(); V = V.getOperand(0)) {
39324 switch (V.getOpcode()) {
39325 default:
39326 return SDValue(); // Nothing combined!
39328 case ISD::BITCAST:
39329 // Skip bitcasts as we always know the type for the target specific
39330 // instructions.
39331 continue;
39333 case X86ISD::PSHUFD:
39334 // Found another dword shuffle.
39335 break;
39337 case X86ISD::PSHUFLW:
39338 // Check that the low words (being shuffled) are the identity in the
39339 // dword shuffle, and the high words are self-contained.
39340 if (Mask[0] != 0 || Mask[1] != 1 ||
39341 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
39342 return SDValue();
39344 Chain.push_back(V);
39345 continue;
39347 case X86ISD::PSHUFHW:
39348 // Check that the high words (being shuffled) are the identity in the
39349 // dword shuffle, and the low words are self-contained.
39350 if (Mask[2] != 2 || Mask[3] != 3 ||
39351 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
39352 return SDValue();
39354 Chain.push_back(V);
39355 continue;
39357 case X86ISD::UNPCKL:
39358 case X86ISD::UNPCKH:
39359 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
39360 // shuffle into a preceding word shuffle.
39361 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
39362 V.getSimpleValueType().getVectorElementType() != MVT::i16)
39363 return SDValue();
39365 // Search for a half-shuffle which we can combine with.
39366 unsigned CombineOp =
39367 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
39368 if (V.getOperand(0) != V.getOperand(1) ||
39369 !V->isOnlyUserOf(V.getOperand(0).getNode()))
39370 return SDValue();
39371 Chain.push_back(V);
39372 V = V.getOperand(0);
39373 do {
39374 switch (V.getOpcode()) {
39375 default:
39376 return SDValue(); // Nothing to combine.
39378 case X86ISD::PSHUFLW:
39379 case X86ISD::PSHUFHW:
39380 if (V.getOpcode() == CombineOp)
39381 break;
39383 Chain.push_back(V);
39385 [[fallthrough]];
39386 case ISD::BITCAST:
39387 V = V.getOperand(0);
39388 continue;
39390 break;
39391 } while (V.hasOneUse());
39392 break;
39394 // Break out of the loop if we break out of the switch.
39395 break;
39398 if (!V.hasOneUse())
39399 // We fell out of the loop without finding a viable combining instruction.
39400 return SDValue();
39402 // Merge this node's mask and our incoming mask.
39403 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
39404 for (int &M : Mask)
39405 M = VMask[M];
39406 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
39407 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
39409 // Rebuild the chain around this new shuffle.
39410 while (!Chain.empty()) {
39411 SDValue W = Chain.pop_back_val();
39413 if (V.getValueType() != W.getOperand(0).getValueType())
39414 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
39416 switch (W.getOpcode()) {
39417 default:
39418 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
39420 case X86ISD::UNPCKL:
39421 case X86ISD::UNPCKH:
39422 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
39423 break;
39425 case X86ISD::PSHUFD:
39426 case X86ISD::PSHUFLW:
39427 case X86ISD::PSHUFHW:
39428 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
39429 break;
39432 if (V.getValueType() != N.getValueType())
39433 V = DAG.getBitcast(N.getValueType(), V);
39435 // Return the new chain to replace N.
39436 return V;
39439 // Attempt to commute shufps LHS loads:
39440 // permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
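// Note that commuting the SHUFP swaps the two nibbles of its immediate, and
// the surrounding VPERMILPI/SHUFP immediates are adjusted below so the same
// elements are still selected.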
39441 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
39442 SelectionDAG &DAG) {
39443 // TODO: Add vXf64 support.
39444 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
39445 return SDValue();
39447 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
39448 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
39449 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
39450 return SDValue();
39451 SDValue N0 = V.getOperand(0);
39452 SDValue N1 = V.getOperand(1);
39453 unsigned Imm = V.getConstantOperandVal(2);
39454 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
39455 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
39456 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
39457 return SDValue();
39458 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
39459 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
39460 DAG.getTargetConstant(Imm, DL, MVT::i8));
39463 switch (N.getOpcode()) {
39464 case X86ISD::VPERMILPI:
39465 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
39466 unsigned Imm = N.getConstantOperandVal(1);
39467 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
39468 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
39470 break;
39471 case X86ISD::SHUFP: {
39472 SDValue N0 = N.getOperand(0);
39473 SDValue N1 = N.getOperand(1);
39474 unsigned Imm = N.getConstantOperandVal(2);
39475 if (N0 == N1) {
39476 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
39477 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
39478 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
39479 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
39480 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
39481 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
39482 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
39483 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
39484 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
39486 break;
39490 return SDValue();
39493 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
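// For example, PSHUFD(ADD(X, SplatC)) can become
// ADD(PSHUFD(X), PSHUFD(SplatC)); the shuffle of the splat constant is
// expected to fold away, so the total number of shuffles does not grow.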
39494 static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
39495 const SDLoc &DL) {
39496 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39497 EVT ShuffleVT = N.getValueType();
39499 auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
39500 // AllZeros/AllOnes constants are freely shuffled and will peek through
39501 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
39502 // merge with target shuffles if it has one use so shuffle combining is
39503 // likely to kick in. Shuffles of splats are expected to be removed.
39504 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
39505 ISD::isBuildVectorAllZeros(Op.getNode()) ||
39506 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
39507 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
39508 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
39509 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
39510 (FoldLoad && isShuffleFoldableLoad(Op)) ||
39511 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
39513 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
39514 // Ensure we only shuffle whole vector src elements, unless it's a logical
39515 // binop where we can more aggressively move shuffles from dst to src.
39516 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
39517 BinOp == X86ISD::ANDNP ||
39518 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
39521 unsigned Opc = N.getOpcode();
39522 switch (Opc) {
39523 // Unary and Unary+Permute Shuffles.
39524 case X86ISD::PSHUFB: {
39525 // Don't merge PSHUFB if it contains zeroed elements.
39526 SmallVector<int> Mask;
39527 SmallVector<SDValue> Ops;
39528 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
39529 Mask))
39530 break;
39531 [[fallthrough]];
39533 case X86ISD::VBROADCAST:
39534 case X86ISD::MOVDDUP:
39535 case X86ISD::PSHUFD:
39536 case X86ISD::PSHUFHW:
39537 case X86ISD::PSHUFLW:
39538 case X86ISD::VPERMI:
39539 case X86ISD::VPERMILPI: {
39540 if (N.getOperand(0).getValueType() == ShuffleVT &&
39541 N->isOnlyUserOf(N.getOperand(0).getNode())) {
39542 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
39543 unsigned SrcOpcode = N0.getOpcode();
39544 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
39545 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
39546 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
39547 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
39548 IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
39549 SDValue LHS, RHS;
39550 Op00 = DAG.getBitcast(ShuffleVT, Op00);
39551 Op01 = DAG.getBitcast(ShuffleVT, Op01);
39552 if (N.getNumOperands() == 2) {
39553 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
39554 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
39555 } else {
39556 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
39557 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
39559 EVT OpVT = N0.getValueType();
39560 return DAG.getBitcast(ShuffleVT,
39561 DAG.getNode(SrcOpcode, DL, OpVT,
39562 DAG.getBitcast(OpVT, LHS),
39563 DAG.getBitcast(OpVT, RHS)));
39567 break;
39569 // Binary and Binary+Permute Shuffles.
39570 case X86ISD::INSERTPS: {
39571 // Don't merge INSERTPS if it contains zeroed elements.
39572 unsigned InsertPSMask = N.getConstantOperandVal(2);
39573 unsigned ZeroMask = InsertPSMask & 0xF;
39574 if (ZeroMask != 0)
39575 break;
39576 [[fallthrough]];
39578 case X86ISD::MOVSD:
39579 case X86ISD::MOVSS:
39580 case X86ISD::BLENDI:
39581 case X86ISD::SHUFP:
39582 case X86ISD::UNPCKH:
39583 case X86ISD::UNPCKL: {
39584 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
39585 N->isOnlyUserOf(N.getOperand(1).getNode())) {
39586 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
39587 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
39588 unsigned SrcOpcode = N0.getOpcode();
39589 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
39590 N0.getValueType() == N1.getValueType() &&
39591 IsSafeToMoveShuffle(N0, SrcOpcode) &&
39592 IsSafeToMoveShuffle(N1, SrcOpcode)) {
39593 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
39594 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
39595 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
39596 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
39597 // Ensure the total number of shuffles doesn't increase by folding this
39598 // shuffle through to the source ops.
39599 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
39600 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
39601 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
39602 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
39603 SDValue LHS, RHS;
39604 Op00 = DAG.getBitcast(ShuffleVT, Op00);
39605 Op10 = DAG.getBitcast(ShuffleVT, Op10);
39606 Op01 = DAG.getBitcast(ShuffleVT, Op01);
39607 Op11 = DAG.getBitcast(ShuffleVT, Op11);
39608 if (N.getNumOperands() == 3) {
39609 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
39610 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
39611 } else {
39612 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
39613 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
39615 EVT OpVT = N0.getValueType();
39616 return DAG.getBitcast(ShuffleVT,
39617 DAG.getNode(SrcOpcode, DL, OpVT,
39618 DAG.getBitcast(OpVT, LHS),
39619 DAG.getBitcast(OpVT, RHS)));
39623 break;
39626 return SDValue();
39629 /// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
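/// For example, vperm2f128(movddup(x), movddup(y), imm) can be rewritten as
/// movddup(vperm2f128(x, y, imm)); the same idea applies to per-lane shifts
/// and PSHUFD when both sides use the same immediate.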
39630 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
39631 SelectionDAG &DAG,
39632 const SDLoc &DL) {
39633 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
39635 MVT VT = V.getSimpleValueType();
39636 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
39637 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
39638 unsigned SrcOpc0 = Src0.getOpcode();
39639 unsigned SrcOpc1 = Src1.getOpcode();
39640 EVT SrcVT0 = Src0.getValueType();
39641 EVT SrcVT1 = Src1.getValueType();
39643 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
39644 return SDValue();
39646 switch (SrcOpc0) {
39647 case X86ISD::MOVDDUP: {
39648 SDValue LHS = Src0.getOperand(0);
39649 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
39650 SDValue Res =
39651 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
39652 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
39653 return DAG.getBitcast(VT, Res);
39655 case X86ISD::VPERMILPI:
39656 // TODO: Handle v4f64 permutes with different low/high lane masks.
39657 if (SrcVT0 == MVT::v4f64) {
39658 uint64_t Mask = Src0.getConstantOperandVal(1);
39659 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
39660 break;
39662 [[fallthrough]];
39663 case X86ISD::VSHLI:
39664 case X86ISD::VSRLI:
39665 case X86ISD::VSRAI:
39666 case X86ISD::PSHUFD:
39667 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
39668 SDValue LHS = Src0.getOperand(0);
39669 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
39670 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
39671 V.getOperand(2));
39672 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
39673 return DAG.getBitcast(VT, Res);
39675 break;
39678 return SDValue();
39681 /// Try to combine x86 target specific shuffles.
39682 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
39683 TargetLowering::DAGCombinerInfo &DCI,
39684 const X86Subtarget &Subtarget) {
39685 SDLoc DL(N);
39686 MVT VT = N.getSimpleValueType();
39687 SmallVector<int, 4> Mask;
39688 unsigned Opcode = N.getOpcode();
39690 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
39691 return R;
39693 // Handle specific target shuffles.
39694 switch (Opcode) {
39695 case X86ISD::MOVDDUP: {
39696 SDValue Src = N.getOperand(0);
39697 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
39698 if (VT == MVT::v2f64 && Src.hasOneUse() &&
39699 ISD::isNormalLoad(Src.getNode())) {
39700 LoadSDNode *LN = cast<LoadSDNode>(Src);
39701 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
39702 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
39703 DCI.CombineTo(N.getNode(), Movddup);
39704 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39705 DCI.recursivelyDeleteUnusedNodes(LN);
39706 return N; // Return N so it doesn't get rechecked!
39710 return SDValue();
39712 case X86ISD::VBROADCAST: {
39713 SDValue Src = N.getOperand(0);
39714 SDValue BC = peekThroughBitcasts(Src);
39715 EVT SrcVT = Src.getValueType();
39716 EVT BCVT = BC.getValueType();
39718 // If broadcasting from another shuffle, attempt to simplify it.
39719 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
39720 if (isTargetShuffle(BC.getOpcode()) &&
39721 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
39722 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
39723 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
39724 SM_SentinelUndef);
39725 for (unsigned i = 0; i != Scale; ++i)
39726 DemandedMask[i] = i;
39727 if (SDValue Res = combineX86ShufflesRecursively(
39728 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
39729 X86::MaxShuffleCombineDepth,
39730 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
39731 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
39732 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
39733 DAG.getBitcast(SrcVT, Res));
39736 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
39737 // 32-bit targets have to bitcast i64 to f64, so it's better to bitcast upward.
39738 if (Src.getOpcode() == ISD::BITCAST &&
39739 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
39740 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
39741 FixedVectorType::isValidElementType(
39742 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
39743 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
39744 VT.getVectorNumElements());
39745 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
39748 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
39749 // If we're re-broadcasting a smaller type, then broadcast with that type and
39750 // bitcast.
39751 // TODO: Do this for any splat?
39752 if (Src.getOpcode() == ISD::BITCAST &&
39753 (BC.getOpcode() == X86ISD::VBROADCAST ||
39754 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
39755 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
39756 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
39757 MVT NewVT =
39758 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
39759 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
39760 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
39763 // Reduce the broadcast source vector to its lowest 128 bits.
39764 if (SrcVT.getSizeInBits() > 128)
39765 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
39766 extract128BitVector(Src, 0, DAG, DL));
39768 // broadcast(scalar_to_vector(x)) -> broadcast(x).
39769 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39770 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
39771 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
39773 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
39774 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
39775 isNullConstant(Src.getOperand(1)) &&
39776 Src.getValueType() ==
39777 Src.getOperand(0).getValueType().getScalarType() &&
39778 DAG.getTargetLoweringInfo().isTypeLegal(
39779 Src.getOperand(0).getValueType()))
39780 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
39782 // Share broadcast with the longest vector and extract low subvector (free).
39783 // Ensure the same SDValue from the SDNode use is being used.
39784 for (SDNode *User : Src->uses())
39785 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
39786 Src == User->getOperand(0) &&
39787 User->getValueSizeInBits(0).getFixedValue() >
39788 VT.getFixedSizeInBits()) {
39789 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
39790 VT.getSizeInBits());
39793 // vbroadcast(scalarload X) -> vbroadcast_load X
39794 // For float loads, extract other uses of the scalar from the broadcast.
39795 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
39796 ISD::isNormalLoad(Src.getNode())) {
39797 LoadSDNode *LN = cast<LoadSDNode>(Src);
39798 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39799 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39800 SDValue BcastLd =
39801 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39802 LN->getMemoryVT(), LN->getMemOperand());
39803 // If the load value is used only by N, replace it via CombineTo N.
39804 bool NoReplaceExtract = Src.hasOneUse();
39805 DCI.CombineTo(N.getNode(), BcastLd);
39806 if (NoReplaceExtract) {
39807 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39808 DCI.recursivelyDeleteUnusedNodes(LN);
39809 } else {
39810 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
39811 DAG.getIntPtrConstant(0, DL));
39812 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
39814 return N; // Return N so it doesn't get rechecked!
39817 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
39818 // i16. So shrink it ourselves if we can make a broadcast_load.
39819 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
39820 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
39821 assert(Subtarget.hasAVX2() && "Expected AVX2");
39822 SDValue TruncIn = Src.getOperand(0);
39824 // If this is a truncate of a non-extending load, we can just narrow it to
39825 // use a broadcast_load.
39826 if (ISD::isNormalLoad(TruncIn.getNode())) {
39827 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
39828 // Unless it's volatile or atomic.
39829 if (LN->isSimple()) {
39830 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39831 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39832 SDValue BcastLd = DAG.getMemIntrinsicNode(
39833 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
39834 LN->getPointerInfo(), LN->getOriginalAlign(),
39835 LN->getMemOperand()->getFlags());
39836 DCI.CombineTo(N.getNode(), BcastLd);
39837 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39838 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39839 return N; // Return N so it doesn't get rechecked!
39843 // If this is a truncate of an i16 extload, we can directly replace it.
39844 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
39845 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
39846 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
39847 if (LN->getMemoryVT().getSizeInBits() == 16) {
39848 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39849 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39850 SDValue BcastLd =
39851 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39852 LN->getMemoryVT(), LN->getMemOperand());
39853 DCI.CombineTo(N.getNode(), BcastLd);
39854 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39855 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39856 return N; // Return N so it doesn't get rechecked!
39860 // If this is a truncate of a load that has been shifted right, we can
39861 // offset the pointer and use a narrower load.
39862 if (TruncIn.getOpcode() == ISD::SRL &&
39863 TruncIn.getOperand(0).hasOneUse() &&
39864 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
39865 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
39866 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
39867 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
39868 // Make sure the shift amount and the load size are divisible by 16.
39869 // Don't do this if the load is volatile or atomic.
39870 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
39871 LN->isSimple()) {
39872 unsigned Offset = ShiftAmt / 8;
39873 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39874 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
39875 TypeSize::Fixed(Offset), DL);
39876 SDValue Ops[] = { LN->getChain(), Ptr };
39877 SDValue BcastLd = DAG.getMemIntrinsicNode(
39878 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
39879 LN->getPointerInfo().getWithOffset(Offset),
39880 LN->getOriginalAlign(),
39881 LN->getMemOperand()->getFlags());
39882 DCI.CombineTo(N.getNode(), BcastLd);
39883 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39884 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
39885 return N; // Return N so it doesn't get rechecked!
39890 // vbroadcast(vzload X) -> vbroadcast_load X
39891 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
39892 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
39893 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
39894 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39895 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
39896 SDValue BcastLd =
39897 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
39898 LN->getMemoryVT(), LN->getMemOperand());
39899 DCI.CombineTo(N.getNode(), BcastLd);
39900 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39901 DCI.recursivelyDeleteUnusedNodes(LN);
39902 return N; // Return N so it doesn't get rechecked!
39906 // vbroadcast(vector load X) -> vbroadcast_load
39907 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
39908 SrcVT == MVT::v4i32) &&
39909 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
39910 LoadSDNode *LN = cast<LoadSDNode>(Src);
39911 // Unless the load is volatile or atomic.
39912 if (LN->isSimple()) {
39913 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39914 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39915 SDValue BcastLd = DAG.getMemIntrinsicNode(
39916 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
39917 LN->getPointerInfo(), LN->getOriginalAlign(),
39918 LN->getMemOperand()->getFlags());
39919 DCI.CombineTo(N.getNode(), BcastLd);
39920 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
39921 DCI.recursivelyDeleteUnusedNodes(LN);
39922 return N; // Return N so it doesn't get rechecked!
39926 return SDValue();
39928 case X86ISD::VZEXT_MOVL: {
39929 SDValue N0 = N.getOperand(0);
39931 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
39932 // the load is volatile.
39933 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
39934 auto *LN = cast<LoadSDNode>(N0);
39935 if (SDValue VZLoad =
39936 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
39937 DCI.CombineTo(N.getNode(), VZLoad);
39938 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39939 DCI.recursivelyDeleteUnusedNodes(LN);
39940 return N;
39944 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
39945 // and can just use a VZEXT_LOAD.
39946 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
39947 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
39948 auto *LN = cast<MemSDNode>(N0);
39949 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
39950 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39951 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39952 SDValue VZLoad =
39953 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
39954 LN->getMemoryVT(), LN->getMemOperand());
39955 DCI.CombineTo(N.getNode(), VZLoad);
39956 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
39957 DCI.recursivelyDeleteUnusedNodes(LN);
39958 return N;
39962 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
39963 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
39964 // if the upper bits of the i64 are zero.
39965 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39966 N0.getOperand(0).hasOneUse() &&
39967 N0.getOperand(0).getValueType() == MVT::i64) {
39968 SDValue In = N0.getOperand(0);
39969 APInt Mask = APInt::getHighBitsSet(64, 32);
39970 if (DAG.MaskedValueIsZero(In, Mask)) {
39971 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
39972 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
39973 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
39974 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
39975 return DAG.getBitcast(VT, Movl);
39979 // Load a scalar integer constant directly to XMM instead of transferring an
39980 // immediate value from a GPR.
39981 // vzext_movl (scalar_to_vector C) --> load [C,0...]
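// For example, (v4i32 (vzext_movl (scalar_to_vector (i32 42)))) becomes a
// constant-pool load of <42, 0, 0, 0>.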
39982 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39983 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
39984 // Create a vector constant - scalar constant followed by zeros.
39985 EVT ScalarVT = N0.getOperand(0).getValueType();
39986 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
39987 unsigned NumElts = VT.getVectorNumElements();
39988 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
39989 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
39990 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
39992 // Load the vector constant from constant pool.
39993 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
39994 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
39995 MachinePointerInfo MPI =
39996 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
39997 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
39998 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
39999 MachineMemOperand::MOLoad);
40003 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
40004 // insert into a zero vector. This helps get VZEXT_MOVL closer to
40005 // scalar_to_vectors, where the 256/512-bit forms are canonicalized to an
40006 // insert and a 128-bit scalar_to_vector. This reduces the number of isel patterns.
40007 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
40008 SDValue V = peekThroughOneUseBitcasts(N0);
40010 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
40011 isNullConstant(V.getOperand(2))) {
40012 SDValue In = V.getOperand(1);
40013 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
40014 In.getValueSizeInBits() /
40015 VT.getScalarSizeInBits());
40016 In = DAG.getBitcast(SubVT, In);
40017 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
40018 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
40019 getZeroVector(VT, Subtarget, DAG, DL), Movl,
40020 V.getOperand(2));
40024 return SDValue();
40026 case X86ISD::BLENDI: {
40027 SDValue N0 = N.getOperand(0);
40028 SDValue N1 = N.getOperand(1);
40030 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
40031 // TODO: Handle MVT::v16i16 repeated blend mask.
40032 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
40033 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
40034 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
40035 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
40036 SrcVT.getScalarSizeInBits() >= 32) {
40037 unsigned BlendMask = N.getConstantOperandVal(2);
40038 unsigned Size = VT.getVectorNumElements();
40039 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
40040 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
40041 return DAG.getBitcast(
40042 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
40043 N1.getOperand(0),
40044 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
40047 return SDValue();
40049 case X86ISD::SHUFP: {
40050 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
40051 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
40052 // TODO: Support types other than v4f32.
40053 if (VT == MVT::v4f32) {
40054 bool Updated = false;
40055 SmallVector<int> Mask;
40056 SmallVector<SDValue> Ops;
40057 if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
40058 Ops.size() == 2) {
40059 for (int i = 0; i != 2; ++i) {
40060 SmallVector<SDValue> SubOps;
40061 SmallVector<int> SubMask, SubScaledMask;
40062 SDValue Sub = peekThroughBitcasts(Ops[i]);
40063 // TODO: Scaling might be easier if we specify the demanded elts.
40064 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
40065 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
40066 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
40067 int Ofs = i * 2;
40068 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
40069 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
40070 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
40071 Updated = true;
40075 if (Updated) {
40076 for (int &M : Mask)
40077 M %= 4;
40078 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
40079 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
40082 return SDValue();
40084 case X86ISD::VPERMI: {
40085 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
40086 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
40087 SDValue N0 = N.getOperand(0);
40088 SDValue N1 = N.getOperand(1);
40089 unsigned EltSizeInBits = VT.getScalarSizeInBits();
40090 if (N0.getOpcode() == ISD::BITCAST &&
40091 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
40092 SDValue Src = N0.getOperand(0);
40093 EVT SrcVT = Src.getValueType();
40094 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
40095 return DAG.getBitcast(VT, Res);
40097 return SDValue();
40099 case X86ISD::VPERM2X128: {
40100 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
40101 SDValue LHS = N->getOperand(0);
40102 SDValue RHS = N->getOperand(1);
40103 if (LHS.getOpcode() == ISD::BITCAST &&
40104 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
40105 EVT SrcVT = LHS.getOperand(0).getValueType();
40106 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
40107 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
40108 DAG.getBitcast(SrcVT, LHS),
40109 DAG.getBitcast(SrcVT, RHS),
40110 N->getOperand(2)));
40114 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
40115 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
40116 return Res;
40118 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
40119 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
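// For example, with the immediate decoded as below (the low nibble picks the
// low result half, the high nibble the high half; values 0-1 address the two
// halves of the first source and 2-3 those of the second):
// vperm2x128(concat(A,B),concat(C,D),0x31) picks half 1 (B) and half 3 (D),
// so it folds to concat(B,D).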
40120 auto FindSubVector128 = [&](unsigned Idx) {
40121 if (Idx > 3)
40122 return SDValue();
40123 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
40124 SmallVector<SDValue> SubOps;
40125 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
40126 return SubOps[Idx & 1];
40127 unsigned NumElts = Src.getValueType().getVectorNumElements();
40128 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
40129 Src.getOperand(1).getValueSizeInBits() == 128 &&
40130 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
40131 return Src.getOperand(1);
40133 return SDValue();
40135 unsigned Imm = N.getConstantOperandVal(2);
40136 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
40137 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
40138 MVT SubVT = VT.getHalfNumVectorElementsVT();
40139 SubLo = DAG.getBitcast(SubVT, SubLo);
40140 SubHi = DAG.getBitcast(SubVT, SubHi);
40141 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
40144 return SDValue();
40146 case X86ISD::PSHUFD:
40147 case X86ISD::PSHUFLW:
40148 case X86ISD::PSHUFHW: {
40149 SDValue N0 = N.getOperand(0);
40150 SDValue N1 = N.getOperand(1);
40151 if (N0->hasOneUse()) {
40152 SDValue V = peekThroughOneUseBitcasts(N0);
40153 switch (V.getOpcode()) {
40154 case X86ISD::VSHL:
40155 case X86ISD::VSRL:
40156 case X86ISD::VSRA:
40157 case X86ISD::VSHLI:
40158 case X86ISD::VSRLI:
40159 case X86ISD::VSRAI:
40160 case X86ISD::VROTLI:
40161 case X86ISD::VROTRI: {
40162 MVT InnerVT = V.getSimpleValueType();
40163 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
40164 SDValue Res = DAG.getNode(Opcode, DL, VT,
40165 DAG.getBitcast(VT, V.getOperand(0)), N1);
40166 Res = DAG.getBitcast(InnerVT, Res);
40167 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
40168 return DAG.getBitcast(VT, Res);
40170 break;
40175 Mask = getPSHUFShuffleMask(N);
40176 assert(Mask.size() == 4);
40177 break;
40179 case X86ISD::MOVSD:
40180 case X86ISD::MOVSH:
40181 case X86ISD::MOVSS: {
40182 SDValue N0 = N.getOperand(0);
40183 SDValue N1 = N.getOperand(1);
40185 // Canonicalize scalar FPOps:
40186 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
40187 // If commutable, allow OP(N1[0], N0[0]).
40188 unsigned Opcode1 = N1.getOpcode();
40189 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
40190 Opcode1 == ISD::FDIV) {
40191 SDValue N10 = N1.getOperand(0);
40192 SDValue N11 = N1.getOperand(1);
40193 if (N10 == N0 ||
40194 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
40195 if (N10 != N0)
40196 std::swap(N10, N11);
40197 MVT SVT = VT.getVectorElementType();
40198 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
40199 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
40200 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
40201 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
40202 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
40203 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
40207 return SDValue();
40209 case X86ISD::INSERTPS: {
40210 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
40211 SDValue Op0 = N.getOperand(0);
40212 SDValue Op1 = N.getOperand(1);
40213 unsigned InsertPSMask = N.getConstantOperandVal(2);
40214 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
40215 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
40216 unsigned ZeroMask = InsertPSMask & 0xF;
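// For reference, the insertps immediate packs: bits [7:6] = source lane,
// bits [5:4] = destination lane, bits [3:0] = zero mask. E.g. 0xD8
// (0b11011000) copies Op1[3] into lane 1 of the result and zeroes lane 3.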
40218 // If we zero out all elements from Op0 then we don't need to reference it.
40219 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
40220 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
40221 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40223 // If we zero out the element from Op1 then we don't need to reference it.
40224 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
40225 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
40226 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40228 // Attempt to merge insertps Op1 with an inner target shuffle node.
40229 SmallVector<int, 8> TargetMask1;
40230 SmallVector<SDValue, 2> Ops1;
40231 APInt KnownUndef1, KnownZero1;
40232 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
40233 KnownZero1)) {
40234 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
40235 // Zero/UNDEF insertion - zero out element and remove dependency.
40236 InsertPSMask |= (1u << DstIdx);
40237 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
40238 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40240 // Update insertps mask srcidx and reference the source input directly.
40241 int M = TargetMask1[SrcIdx];
40242 assert(0 <= M && M < 8 && "Shuffle index out of range");
40243 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
40244 Op1 = Ops1[M < 4 ? 0 : 1];
40245 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
40246 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40249 // Attempt to merge insertps Op0 with an inner target shuffle node.
40250 SmallVector<int, 8> TargetMask0;
40251 SmallVector<SDValue, 2> Ops0;
40252 APInt KnownUndef0, KnownZero0;
40253 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
40254 KnownZero0)) {
40255 bool Updated = false;
40256 bool UseInput00 = false;
40257 bool UseInput01 = false;
40258 for (int i = 0; i != 4; ++i) {
40259 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
40260 // No change if element is already zero or the inserted element.
40261 continue;
40264 if (KnownUndef0[i] || KnownZero0[i]) {
40265 // If the target mask is undef/zero then we must zero the element.
40266 InsertPSMask |= (1u << i);
40267 Updated = true;
40268 continue;
40271 // The input vector element must stay in place (lane i of either input).
40272 int M = TargetMask0[i];
40273 if (M != i && M != (i + 4))
40274 return SDValue();
40276 // Determine which inputs of the target shuffle we're using.
40277 UseInput00 |= (0 <= M && M < 4);
40278 UseInput01 |= (4 <= M);
40281 // If we're not using both inputs of the target shuffle then use the
40282 // referenced input directly.
40283 if (UseInput00 && !UseInput01) {
40284 Updated = true;
40285 Op0 = Ops0[0];
40286 } else if (!UseInput00 && UseInput01) {
40287 Updated = true;
40288 Op0 = Ops0[1];
40291 if (Updated)
40292 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
40293 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40296 // If we're inserting an element from a vbroadcast load, fold the
40297 // load into the X86insertps instruction. We need to convert the scalar
40298 // load to a vector and clear the source lane of the INSERTPS control.
40299 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
40300 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
40301 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
40302 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
40303 MemIntr->getBasePtr(),
40304 MemIntr->getMemOperand());
40305 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
40306 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
40307 Load),
40308 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
40309 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
40310 return Insert;
40314 return SDValue();
40316 default:
40317 return SDValue();
40320 // Nuke no-op shuffles that show up after combining.
40321 if (isNoopShuffleMask(Mask))
40322 return N.getOperand(0);
40324 // Look for simplifications involving one or two shuffle instructions.
40325 SDValue V = N.getOperand(0);
40326 switch (N.getOpcode()) {
40327 default:
40328 break;
40329 case X86ISD::PSHUFLW:
40330 case X86ISD::PSHUFHW:
40331 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
40333 // See if this reduces to a PSHUFD which is no more expensive and can
40334 // combine with more operations. Note that it has to at least flip the
40335 // dwords as otherwise it would have been removed as a no-op.
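// For example, pshuflw <2,3,0,1> swaps words 0/1 with words 2/3 as whole
// pairs, which is exactly pshufd <1,0,2,3> on the value bitcast to v4i32.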
40336 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
40337 int DMask[] = {0, 1, 2, 3};
40338 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
40339 DMask[DOffset + 0] = DOffset + 1;
40340 DMask[DOffset + 1] = DOffset + 0;
40341 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
40342 V = DAG.getBitcast(DVT, V);
40343 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
40344 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
40345 return DAG.getBitcast(VT, V);
40348 // Look for shuffle patterns which can be implemented as a single unpack.
40349 // FIXME: This doesn't handle the location of the PSHUFD generically, and
40350 // only works when we have a PSHUFD followed by two half-shuffles.
40351 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
40352 (V.getOpcode() == X86ISD::PSHUFLW ||
40353 V.getOpcode() == X86ISD::PSHUFHW) &&
40354 V.getOpcode() != N.getOpcode() &&
40355 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
40356 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
40357 if (D.getOpcode() == X86ISD::PSHUFD) {
40358 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
40359 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
40360 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
40361 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
40362 int WordMask[8];
40363 for (int i = 0; i < 4; ++i) {
40364 WordMask[i + NOffset] = Mask[i] + NOffset;
40365 WordMask[i + VOffset] = VMask[i] + VOffset;
40367 // Map the word mask through the DWord mask.
40368 int MappedMask[8];
40369 for (int i = 0; i < 8; ++i)
40370 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
40371 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
40372 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
40373 // We can replace all three shuffles with an unpack.
40374 V = DAG.getBitcast(VT, D.getOperand(0));
40375 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
40376 : X86ISD::UNPCKH,
40377 DL, VT, V, V);
40382 break;
40384 case X86ISD::PSHUFD:
40385 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
40386 return NewN;
40388 break;
40391 return SDValue();
40394 /// Checks if the shuffle mask takes subsequent elements
40395 /// alternately from two vectors.
40396 /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
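/// E.g. for <0, 5, 2, 7> the even lanes all come from operand 0 and the odd
/// lanes all come from operand 1, so \p Op0Even is set to true.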
40397 static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
40399 int ParitySrc[2] = {-1, -1};
40400 unsigned Size = Mask.size();
40401 for (unsigned i = 0; i != Size; ++i) {
40402 int M = Mask[i];
40403 if (M < 0)
40404 continue;
40406 // Make sure we are using the matching element from the input.
40407 if ((M % Size) != i)
40408 return false;
40410 // Make sure we use the same input for all elements of the same parity.
40411 int Src = M / Size;
40412 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
40413 return false;
40414 ParitySrc[i % 2] = Src;
40417 // Make sure each input is used.
40418 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
40419 return false;
40421 Op0Even = ParitySrc[0] == 0;
40422 return true;
40425 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
40426 /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
40427 /// are written to the parameters \p Opnd0 and \p Opnd1.
40429 /// We combine shuffles into ADDSUB(SUBADD) directly on the abstract vector
40430 /// shuffle nodes so that they are easier to match generically. We also insert
40431 /// dummy vector shuffle nodes for the operands which explicitly discard the
40432 /// lanes unused by this operation, so that the fact that they are unused can
40433 /// flow through the rest of the combiner.
40434 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
40435 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
40436 bool &IsSubAdd) {
40438 EVT VT = N->getValueType(0);
40439 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40440 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
40441 !VT.getSimpleVT().isFloatingPoint())
40442 return false;
40444 // We only handle target-independent shuffles.
40445 // FIXME: It would be easy and harmless to use the target shuffle mask
40446 // extraction tool to support more.
40447 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
40448 return false;
40450 SDValue V1 = N->getOperand(0);
40451 SDValue V2 = N->getOperand(1);
40453 // Make sure we have an FADD and an FSUB.
40454 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
40455 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
40456 V1.getOpcode() == V2.getOpcode())
40457 return false;
40459 // If there are other uses of these operations we can't fold them.
40460 if (!V1->hasOneUse() || !V2->hasOneUse())
40461 return false;
40463 // Ensure that both operations have the same operands. Note that we can
40464 // commute the FADD operands.
40465 SDValue LHS, RHS;
40466 if (V1.getOpcode() == ISD::FSUB) {
40467 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
40468 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
40469 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
40470 return false;
40471 } else {
40472 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
40473 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
40474 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
40475 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
40476 return false;
40479 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
40480 bool Op0Even;
40481 if (!isAddSubOrSubAddMask(Mask, Op0Even))
40482 return false;
40484 // It's a subadd if the vector in the even parity is an FADD.
40485 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
40486 : V2->getOpcode() == ISD::FADD;
40488 Opnd0 = LHS;
40489 Opnd1 = RHS;
40490 return true;
40493 /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
40494 static SDValue combineShuffleToFMAddSub(SDNode *N,
40495 const X86Subtarget &Subtarget,
40496 SelectionDAG &DAG) {
40497 // We only handle target-independent shuffles.
40498 // FIXME: It would be easy and harmless to use the target shuffle mask
40499 // extraction tool to support more.
40500 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
40501 return SDValue();
40503 MVT VT = N->getSimpleValueType(0);
40504 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40505 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
40506 return SDValue();
40508 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
40509 SDValue Op0 = N->getOperand(0);
40510 SDValue Op1 = N->getOperand(1);
40511 SDValue FMAdd = Op0, FMSub = Op1;
40512 if (FMSub.getOpcode() != X86ISD::FMSUB)
40513 std::swap(FMAdd, FMSub);
40515 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
40516 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
40517 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
40518 FMAdd.getOperand(2) != FMSub.getOperand(2))
40519 return SDValue();
40521 // Check for correct shuffle mask.
40522 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
40523 bool Op0Even;
40524 if (!isAddSubOrSubAddMask(Mask, Op0Even))
40525 return SDValue();
40527 // FMAddSub takes its zeroth operand from the FMSub node.
40528 SDLoc DL(N);
40529 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
40530 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
40531 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
40532 FMAdd.getOperand(2));
40535 /// Try to combine a shuffle into a target-specific add-sub or
40536 /// mul-add-sub node.
40537 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
40538 const X86Subtarget &Subtarget,
40539 SelectionDAG &DAG) {
40540 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
40541 return V;
40543 SDValue Opnd0, Opnd1;
40544 bool IsSubAdd;
40545 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
40546 return SDValue();
40548 MVT VT = N->getSimpleValueType(0);
40549 SDLoc DL(N);
40551 // Try to generate X86ISD::FMADDSUB node here.
40552 SDValue Opnd2;
40553 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
40554 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
40555 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
40558 if (IsSubAdd)
40559 return SDValue();
40561 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
40562 // the ADDSUB idiom has been successfully recognized. There are no known
40563 // X86 targets with 512-bit ADDSUB instructions!
40564 if (VT.is512BitVector())
40565 return SDValue();
40567 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
40568 // the ADDSUB idiom has been successfully recognized. There are no known
40569 // X86 targets with FP16 ADDSUB instructions!
40570 if (VT.getVectorElementType() == MVT::f16)
40571 return SDValue();
40573 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
40576 // We are looking for a shuffle where both sources are concatenated with undef
40577 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
40578 // if we can express this as a single-source shuffle, that's preferable.
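// For example, for v8i32 sources concat(t1, undef) and concat(t2, undef), a
// mask element referring to lane 8 (t2's lane 0) is rewritten as lane 4 of
// the combined concat(t1, t2) built below.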
40579 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
40580 const X86Subtarget &Subtarget) {
40581 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
40582 return SDValue();
40584 EVT VT = N->getValueType(0);
40586 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
40587 if (!VT.is128BitVector() && !VT.is256BitVector())
40588 return SDValue();
40590 if (VT.getVectorElementType() != MVT::i32 &&
40591 VT.getVectorElementType() != MVT::i64 &&
40592 VT.getVectorElementType() != MVT::f32 &&
40593 VT.getVectorElementType() != MVT::f64)
40594 return SDValue();
40596 SDValue N0 = N->getOperand(0);
40597 SDValue N1 = N->getOperand(1);
40599 // Check that both sources are concats with undef.
40600 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
40601 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
40602 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
40603 !N1.getOperand(1).isUndef())
40604 return SDValue();
40606 // Construct the new shuffle mask. Elements from the first source retain their
40607 // index, but elements from the second source no longer need to skip an undef.
40608 SmallVector<int, 8> Mask;
40609 int NumElts = VT.getVectorNumElements();
40611 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
40612 for (int Elt : SVOp->getMask())
40613 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
40615 SDLoc DL(N);
40616 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
40617 N1.getOperand(0));
40618 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
40621 /// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
40622 /// low half of each source vector and does not set any high half elements in
40623 /// the destination vector, narrow the shuffle to half its original size.
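/// For example, a v8f64 shuffle with mask <0, 9, 1, 8, u, u, u, u> reads only
/// the low v4f64 half of each input and leaves the upper half of the result
/// undef, so it can (roughly) be rewritten as a v4f64 shuffle <0, 5, 1, 4>
/// whose result is widened for free with a subregister insert.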
40624 static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
40625 EVT VT = Shuf->getValueType(0);
40626 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
40627 return SDValue();
40628 if (!VT.is256BitVector() && !VT.is512BitVector())
40629 return SDValue();
40631 // See if we can ignore all of the high elements of the shuffle.
40632 ArrayRef<int> Mask = Shuf->getMask();
40633 if (!isUndefUpperHalf(Mask))
40634 return SDValue();
40636 // Check if the shuffle mask accesses only the low half of each input vector
40637 // (half-index output is 0 or 2).
40638 int HalfIdx1, HalfIdx2;
40639 SmallVector<int, 8> HalfMask(Mask.size() / 2);
40640 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
40641 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
40642 return SDValue();
40644 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
40645 // The trick is knowing that all of the insert/extract are actually free
40646 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
40647 // of narrow inputs into a narrow output, and that is always cheaper than
40648 // the wide shuffle that we started with.
40649 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
40650 Shuf->getOperand(1), HalfMask, HalfIdx1,
40651 HalfIdx2, false, DAG, /*UseConcat*/ true);
40654 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
40655 TargetLowering::DAGCombinerInfo &DCI,
40656 const X86Subtarget &Subtarget) {
40657 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
40658 if (SDValue V = narrowShuffle(Shuf, DAG))
40659 return V;
40661 // If we have legalized the vector types, look for blends of FADD and FSUB
40662 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
40663 SDLoc dl(N);
40664 EVT VT = N->getValueType(0);
40665 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40666 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
40667 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
40668 return AddSub;
40670 // Attempt to combine into a vector load/broadcast.
40671 if (SDValue LD = combineToConsecutiveLoads(
40672 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
40673 return LD;
40675 // For AVX2, we sometimes want to combine
40676 // (vector_shuffle <mask> (concat_vectors t1, undef)
40677 // (concat_vectors t2, undef))
40678 // Into:
40679 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
40680 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
40681 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
40682 return ShufConcat;
40684 if (isTargetShuffle(N->getOpcode())) {
40685 SDValue Op(N, 0);
40686 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
40687 return Shuffle;
40689 // Try recursively combining arbitrary sequences of x86 shuffle
40690 // instructions into higher-order shuffles. We do this after combining
40691 // specific PSHUF instruction sequences into their minimal form so that we
40692 // can evaluate how many specialized shuffle instructions are involved in
40693 // a particular chain.
40694 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
40695 return Res;
40697 // Simplify source operands based on shuffle mask.
40698 // TODO - merge this into combineX86ShufflesRecursively.
40699 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
40700 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
40701 return SDValue(N, 0);
40703 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
40704 // Perform this after other shuffle combines to allow inner shuffles to be
40705 // combined away first.
40706 if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl))
40707 return BinOp;
40710 return SDValue();
40713 // Simplify variable target shuffle masks based on the demanded elements.
40714 // TODO: Handle DemandedBits in mask indices as well?
40715 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
40716 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
40717 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
40718 // If we're demanding all elements, don't bother trying to simplify the mask.
40719 unsigned NumElts = DemandedElts.getBitWidth();
40720 if (DemandedElts.isAllOnes())
40721 return false;
40723 SDValue Mask = Op.getOperand(MaskIndex);
40724 if (!Mask.hasOneUse())
40725 return false;
40727 // Attempt to generically simplify the variable shuffle mask.
40728 APInt MaskUndef, MaskZero;
40729 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
40730 Depth + 1))
40731 return true;
40733 // Attempt to extract+simplify a (constant pool load) shuffle mask.
40734 // TODO: Support other types from getTargetShuffleMaskIndices?
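// For example, if only the low half of a PSHUFB result is demanded and its
// control vector is a constant-pool load, the undemanded upper half of that
// constant can be rewritten as undef and a load of the new constant used
// instead, as done below.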
40735 SDValue BC = peekThroughOneUseBitcasts(Mask);
40736 EVT BCVT = BC.getValueType();
40737 auto *Load = dyn_cast<LoadSDNode>(BC);
40738 if (!Load)
40739 return false;
40741 const Constant *C = getTargetConstantFromNode(Load);
40742 if (!C)
40743 return false;
40745 Type *CTy = C->getType();
40746 if (!CTy->isVectorTy() ||
40747 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
40748 return false;
40750 // Handle scaling for i64 elements on 32-bit targets.
40751 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
40752 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
40753 return false;
40754 unsigned Scale = NumCstElts / NumElts;
40756 // Simplify mask if we have an undemanded element that is not undef.
40757 bool Simplified = false;
40758 SmallVector<Constant *, 32> ConstVecOps;
40759 for (unsigned i = 0; i != NumCstElts; ++i) {
40760 Constant *Elt = C->getAggregateElement(i);
40761 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
40762 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
40763 Simplified = true;
40764 continue;
40766 ConstVecOps.push_back(Elt);
40768 if (!Simplified)
40769 return false;
40771 // Generate new constant pool entry + legalize immediately for the load.
40772 SDLoc DL(Op);
40773 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
40774 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
40775 SDValue NewMask = TLO.DAG.getLoad(
40776 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
40777 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
40778 Load->getAlign());
40779 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
40782 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
40783 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
40784 TargetLoweringOpt &TLO, unsigned Depth) const {
40785 int NumElts = DemandedElts.getBitWidth();
40786 unsigned Opc = Op.getOpcode();
40787 EVT VT = Op.getValueType();
40789 // Handle special case opcodes.
40790 switch (Opc) {
40791 case X86ISD::PMULDQ:
40792 case X86ISD::PMULUDQ: {
40793 APInt LHSUndef, LHSZero;
40794 APInt RHSUndef, RHSZero;
40795 SDValue LHS = Op.getOperand(0);
40796 SDValue RHS = Op.getOperand(1);
40797 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
40798 Depth + 1))
40799 return true;
40800 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
40801 Depth + 1))
40802 return true;
40803 // Multiply by zero.
40804 KnownZero = LHSZero | RHSZero;
40805 break;
40807 case X86ISD::VPMADDWD: {
40808 APInt LHSUndef, LHSZero;
40809 APInt RHSUndef, RHSZero;
40810 SDValue LHS = Op.getOperand(0);
40811 SDValue RHS = Op.getOperand(1);
40812 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
40814 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
40815 Depth + 1))
40816 return true;
40817 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
40818 Depth + 1))
40819 return true;
40821 // TODO: Multiply by zero.
40823 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
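// (Each i32 result lane is LHS[2i]*RHS[2i] + LHS[2i+1]*RHS[2i+1], so a
// known-zero lane on one side makes the matching lane of the other side
// irrelevant.)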
40824 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
40825 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
40826 Depth + 1))
40827 return true;
40828 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
40829 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
40830 Depth + 1))
40831 return true;
40832 break;
40834 case X86ISD::PSADBW: {
40835 SDValue LHS = Op.getOperand(0);
40836 SDValue RHS = Op.getOperand(1);
40837 assert(VT.getScalarType() == MVT::i64 &&
40838 LHS.getValueType() == RHS.getValueType() &&
40839 LHS.getValueType().getScalarType() == MVT::i8 &&
40840 "Unexpected PSADBW types");
40842 // Aggressively peek through ops to get at the demanded elts.
40843 if (!DemandedElts.isAllOnes()) {
40844 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
40845 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
40846 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
40847 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
40848 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
40849 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
40850 if (NewLHS || NewRHS) {
40851 NewLHS = NewLHS ? NewLHS : LHS;
40852 NewRHS = NewRHS ? NewRHS : RHS;
40853 return TLO.CombineTo(
40854 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
40857 break;
40859 case X86ISD::VSHL:
40860 case X86ISD::VSRL:
40861 case X86ISD::VSRA: {
40862 // We only need the bottom 64-bits of the (128-bit) shift amount.
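// (The SSE/AVX per-vector shifts read the count as a single scalar held in
// the low 64 bits of the 128-bit operand, so the upper elements never
// matter.)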
40863 SDValue Amt = Op.getOperand(1);
40864 MVT AmtVT = Amt.getSimpleValueType();
40865 assert(AmtVT.is128BitVector() && "Unexpected value type");
40867 // If the shift amount is only ever reused as an SSE shift amount then we know
40868 // that only the bottom 64-bits are ever used.
40869 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
40870 unsigned UseOpc = Use->getOpcode();
40871 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
40872 UseOpc == X86ISD::VSRA) &&
40873 Use->getOperand(0) != Amt;
40876 APInt AmtUndef, AmtZero;
40877 unsigned NumAmtElts = AmtVT.getVectorNumElements();
40878 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
40879 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
40880 Depth + 1, AssumeSingleUse))
40881 return true;
40882 [[fallthrough]];
40884 case X86ISD::VSHLI:
40885 case X86ISD::VSRLI:
40886 case X86ISD::VSRAI: {
40887 SDValue Src = Op.getOperand(0);
40888 APInt SrcUndef;
40889 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
40890 Depth + 1))
40891 return true;
40893 // Fold shift(0,x) -> 0
40894 if (DemandedElts.isSubsetOf(KnownZero))
40895 return TLO.CombineTo(
40896 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
40898 // Aggressively peek through ops to get at the demanded elts.
40899 if (!DemandedElts.isAllOnes())
40900 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
40901 Src, DemandedElts, TLO.DAG, Depth + 1))
40902 return TLO.CombineTo(
40903 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
40904 break;
40906 case X86ISD::VPSHA:
40907 case X86ISD::VPSHL:
40908 case X86ISD::VSHLV:
40909 case X86ISD::VSRLV:
40910 case X86ISD::VSRAV: {
40911 APInt LHSUndef, LHSZero;
40912 APInt RHSUndef, RHSZero;
40913 SDValue LHS = Op.getOperand(0);
40914 SDValue RHS = Op.getOperand(1);
40915 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
40916 Depth + 1))
40917 return true;
40919 // Fold shift(0,x) -> 0
40920 if (DemandedElts.isSubsetOf(LHSZero))
40921 return TLO.CombineTo(
40922 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
40924 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
40925 Depth + 1))
40926 return true;
40928 KnownZero = LHSZero;
40929 break;
40931 case X86ISD::KSHIFTL: {
40932 SDValue Src = Op.getOperand(0);
40933 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
40934 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
40935 unsigned ShiftAmt = Amt->getZExtValue();
40937 if (ShiftAmt == 0)
40938 return TLO.CombineTo(Op, Src);
40940 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
40941 // single shift. We can do this if the bottom bits (which are shifted
40942 // out) are never demanded.
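// For example, kshiftl(kshiftr(X, 4), 6) becomes kshiftl(X, 2) when none of
// the low 6 result elements are demanded (Diff = 6 - 4 = 2 below).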
40943 if (Src.getOpcode() == X86ISD::KSHIFTR) {
40944 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
40945 unsigned C1 = Src.getConstantOperandVal(1);
40946 unsigned NewOpc = X86ISD::KSHIFTL;
40947 int Diff = ShiftAmt - C1;
40948 if (Diff < 0) {
40949 Diff = -Diff;
40950 NewOpc = X86ISD::KSHIFTR;
40953 SDLoc dl(Op);
40954 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
40955 return TLO.CombineTo(
40956 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
40960 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
40961 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
40962 Depth + 1))
40963 return true;
40965 KnownUndef <<= ShiftAmt;
40966 KnownZero <<= ShiftAmt;
40967 KnownZero.setLowBits(ShiftAmt);
40968 break;
40970 case X86ISD::KSHIFTR: {
40971 SDValue Src = Op.getOperand(0);
40972 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
40973 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
40974 unsigned ShiftAmt = Amt->getZExtValue();
40976 if (ShiftAmt == 0)
40977 return TLO.CombineTo(Op, Src);
40979 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
40980 // single shift. We can do this if the top bits (which are shifted
40981 // out) are never demanded.
40982 if (Src.getOpcode() == X86ISD::KSHIFTL) {
40983 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
40984 unsigned C1 = Src.getConstantOperandVal(1);
40985 unsigned NewOpc = X86ISD::KSHIFTR;
40986 int Diff = ShiftAmt - C1;
40987 if (Diff < 0) {
40988 Diff = -Diff;
40989 NewOpc = X86ISD::KSHIFTL;
40992 SDLoc dl(Op);
40993 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
40994 return TLO.CombineTo(
40995 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
40999 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
41000 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
41001 Depth + 1))
41002 return true;
41004 KnownUndef.lshrInPlace(ShiftAmt);
41005 KnownZero.lshrInPlace(ShiftAmt);
41006 KnownZero.setHighBits(ShiftAmt);
41007 break;
41009 case X86ISD::ANDNP: {
41010 // ANDNP = (~LHS & RHS);
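// If one operand is constant we can tighten what we demand from the other:
// a known-zero RHS lane forces that result lane to zero whatever LHS holds,
// and an all-ones LHS lane does the same, so such lanes (and their bits)
// need not be demanded from the opposite operand. The helper below computes
// these per-operand demanded bits/elements.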
41011 SDValue LHS = Op.getOperand(0);
41012 SDValue RHS = Op.getOperand(1);
41014 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
41015 APInt UndefElts;
41016 SmallVector<APInt> EltBits;
41017 int NumElts = VT.getVectorNumElements();
41018 int EltSizeInBits = VT.getScalarSizeInBits();
41019 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
41020 APInt OpElts = DemandedElts;
41021 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41022 EltBits)) {
41023 OpBits.clearAllBits();
41024 OpElts.clearAllBits();
41025 for (int I = 0; I != NumElts; ++I) {
41026 if (!DemandedElts[I])
41027 continue;
41028 if (UndefElts[I]) {
41029 // We can't assume an undef src element gives an undef dst - the
41030 // other src might be zero.
41031 OpBits.setAllBits();
41032 OpElts.setBit(I);
41033 } else if ((Invert && !EltBits[I].isAllOnes()) ||
41034 (!Invert && !EltBits[I].isZero())) {
41035 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
41036 OpElts.setBit(I);
41040 return std::make_pair(OpBits, OpElts);
41042 APInt BitsLHS, EltsLHS;
41043 APInt BitsRHS, EltsRHS;
41044 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
41045 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
41047 APInt LHSUndef, LHSZero;
41048 APInt RHSUndef, RHSZero;
41049 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
41050 Depth + 1))
41051 return true;
41052 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
41053 Depth + 1))
41054 return true;
41056 if (!DemandedElts.isAllOnes()) {
41057 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
41058 TLO.DAG, Depth + 1);
41059 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
41060 TLO.DAG, Depth + 1);
41061 if (NewLHS || NewRHS) {
41062 NewLHS = NewLHS ? NewLHS : LHS;
41063 NewRHS = NewRHS ? NewRHS : RHS;
41064 return TLO.CombineTo(
41065 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
41068 break;
41070 case X86ISD::CVTSI2P:
41071 case X86ISD::CVTUI2P: {
41072 SDValue Src = Op.getOperand(0);
41073 MVT SrcVT = Src.getSimpleValueType();
41074 APInt SrcUndef, SrcZero;
41075 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
41076 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
41077 Depth + 1))
41078 return true;
41079 break;
41081 case X86ISD::PACKSS:
41082 case X86ISD::PACKUS: {
41083 SDValue N0 = Op.getOperand(0);
41084 SDValue N1 = Op.getOperand(1);
41086 APInt DemandedLHS, DemandedRHS;
41087 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
41089 APInt LHSUndef, LHSZero;
41090 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
41091 Depth + 1))
41092 return true;
41093 APInt RHSUndef, RHSZero;
41094 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
41095 Depth + 1))
41096 return true;
41098 // TODO - pass on known zero/undef.
41100 // Aggressively peek through ops to get at the demanded elts.
41101 // TODO - we should do this for all target/faux shuffles ops.
41102 if (!DemandedElts.isAllOnes()) {
41103 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
41104 TLO.DAG, Depth + 1);
41105 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
41106 TLO.DAG, Depth + 1);
41107 if (NewN0 || NewN1) {
41108 NewN0 = NewN0 ? NewN0 : N0;
41109 NewN1 = NewN1 ? NewN1 : N1;
41110 return TLO.CombineTo(Op,
41111 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
41114 break;
41116 case X86ISD::HADD:
41117 case X86ISD::HSUB:
41118 case X86ISD::FHADD:
41119 case X86ISD::FHSUB: {
41120 SDValue N0 = Op.getOperand(0);
41121 SDValue N1 = Op.getOperand(1);
41123 APInt DemandedLHS, DemandedRHS;
41124 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
41126 APInt LHSUndef, LHSZero;
41127 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
41128 Depth + 1))
41129 return true;
41130 APInt RHSUndef, RHSZero;
41131 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
41132 Depth + 1))
41133 return true;
41135 // TODO - pass on known zero/undef.
41137 // Aggressively peek through ops to get at the demanded elts.
41138 // TODO: Handle repeated operands.
41139 if (N0 != N1 && !DemandedElts.isAllOnes()) {
41140 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
41141 TLO.DAG, Depth + 1);
41142 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
41143 TLO.DAG, Depth + 1);
41144 if (NewN0 || NewN1) {
41145 NewN0 = NewN0 ? NewN0 : N0;
41146 NewN1 = NewN1 ? NewN1 : N1;
41147 return TLO.CombineTo(Op,
41148 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
41151 break;
41153 case X86ISD::VTRUNC:
41154 case X86ISD::VTRUNCS:
41155 case X86ISD::VTRUNCUS: {
41156 SDValue Src = Op.getOperand(0);
41157 MVT SrcVT = Src.getSimpleValueType();
41158 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
41159 APInt SrcUndef, SrcZero;
41160 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
41161 Depth + 1))
41162 return true;
41163 KnownZero = SrcZero.zextOrTrunc(NumElts);
41164 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
41165 break;
41167 case X86ISD::BLENDV: {
41168 APInt SelUndef, SelZero;
41169 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
41170 SelZero, TLO, Depth + 1))
41171 return true;
41173 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
41174 APInt LHSUndef, LHSZero;
41175 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
41176 LHSZero, TLO, Depth + 1))
41177 return true;
41179 APInt RHSUndef, RHSZero;
41180 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
41181 RHSZero, TLO, Depth + 1))
41182 return true;
41184 KnownZero = LHSZero & RHSZero;
41185 KnownUndef = LHSUndef & RHSUndef;
41186 break;
41188 case X86ISD::VZEXT_MOVL: {
41189 // If upper demanded elements are already zero then we have nothing to do.
41190 SDValue Src = Op.getOperand(0);
41191 APInt DemandedUpperElts = DemandedElts;
41192 DemandedUpperElts.clearLowBits(1);
41193 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
41194 return TLO.CombineTo(Op, Src);
41195 break;
41197 case X86ISD::VBROADCAST: {
41198 SDValue Src = Op.getOperand(0);
41199 MVT SrcVT = Src.getSimpleValueType();
41200 if (!SrcVT.isVector())
41201 break;
41202 // Don't bother broadcasting if we just need the 0'th element.
41203 if (DemandedElts == 1) {
41204 if (Src.getValueType() != VT)
41205 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
41206 SDLoc(Op));
41207 return TLO.CombineTo(Op, Src);
41209 APInt SrcUndef, SrcZero;
41210 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
41211 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
41212 Depth + 1))
41213 return true;
41214 // Aggressively peek through src to get at the demanded elt.
41215 // TODO - we should do this for all target/faux shuffles ops.
41216 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
41217 Src, SrcElts, TLO.DAG, Depth + 1))
41218 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41219 break;
41221 case X86ISD::VPERMV:
41222 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
41223 Depth))
41224 return true;
41225 break;
41226 case X86ISD::PSHUFB:
41227 case X86ISD::VPERMV3:
41228 case X86ISD::VPERMILPV:
41229 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
41230 Depth))
41231 return true;
41232 break;
41233 case X86ISD::VPPERM:
41234 case X86ISD::VPERMIL2:
41235 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
41236 Depth))
41237 return true;
41238 break;
41241 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
41242 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
41243 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
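// For example, if only the low two elements of a v8i64 VSHLI are demanded,
// the shift can be done as a v2i64 VSHLI on the low 128 bits and the result
// reinserted into an undef vector, as the cases below do.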
41244 if ((VT.is256BitVector() || VT.is512BitVector()) &&
41245 DemandedElts.lshr(NumElts / 2) == 0) {
41246 unsigned SizeInBits = VT.getSizeInBits();
41247 unsigned ExtSizeInBits = SizeInBits / 2;
41249 // See if 512-bit ops only use the bottom 128-bits.
41250 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
41251 ExtSizeInBits = SizeInBits / 4;
41253 switch (Opc) {
41254 // Scalar broadcast.
41255 case X86ISD::VBROADCAST: {
41256 SDLoc DL(Op);
41257 SDValue Src = Op.getOperand(0);
41258 if (Src.getValueSizeInBits() > ExtSizeInBits)
41259 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
41260 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
41261 ExtSizeInBits / VT.getScalarSizeInBits());
41262 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
41263 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
41264 TLO.DAG, DL, ExtSizeInBits));
41266 case X86ISD::VBROADCAST_LOAD: {
41267 SDLoc DL(Op);
41268 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
41269 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
41270 ExtSizeInBits / VT.getScalarSizeInBits());
41271 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
41272 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
41273 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
41274 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
41275 MemIntr->getMemOperand());
41276 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
41277 Bcst.getValue(1));
41278 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
41279 TLO.DAG, DL, ExtSizeInBits));
41281 // Subvector broadcast.
41282 case X86ISD::SUBV_BROADCAST_LOAD: {
41283 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
41284 EVT MemVT = MemIntr->getMemoryVT();
41285 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
41286 SDLoc DL(Op);
41287 SDValue Ld =
41288 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
41289 MemIntr->getBasePtr(), MemIntr->getMemOperand());
41290 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
41291 Ld.getValue(1));
41292 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
41293 TLO.DAG, DL, ExtSizeInBits));
41294 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
41295 SDLoc DL(Op);
41296 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
41297 ExtSizeInBits / VT.getScalarSizeInBits());
41298 if (SDValue BcstLd =
41299 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
41300 return TLO.CombineTo(Op,
41301 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
41302 TLO.DAG, DL, ExtSizeInBits));
41304 break;
41306 // Byte shifts by immediate.
41307 case X86ISD::VSHLDQ:
41308 case X86ISD::VSRLDQ:
41309 // Shift by uniform.
41310 case X86ISD::VSHL:
41311 case X86ISD::VSRL:
41312 case X86ISD::VSRA:
41313 // Shift by immediate.
41314 case X86ISD::VSHLI:
41315 case X86ISD::VSRLI:
41316 case X86ISD::VSRAI: {
41317 SDLoc DL(Op);
41318 SDValue Ext0 =
41319 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
41320 SDValue ExtOp =
41321 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
41322 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41323 SDValue Insert =
41324 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
41325 return TLO.CombineTo(Op, Insert);
41327 case X86ISD::VPERMI: {
41328 // Simplify PERMPD/PERMQ to extract_subvector.
41329 // TODO: This should be done in shuffle combining.
41330 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
41331 SmallVector<int, 4> Mask;
41332 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
41333 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
41334 SDLoc DL(Op);
41335 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
41336 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41337 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
41338 return TLO.CombineTo(Op, Insert);
41341 break;
41343 case X86ISD::VPERM2X128: {
41344 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
41345 SDLoc DL(Op);
41346 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
41347 if (LoMask & 0x8)
41348 return TLO.CombineTo(
41349 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
41350 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
41351 unsigned SrcIdx = (LoMask & 0x2) >> 1;
41352 SDValue ExtOp =
41353 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
41354 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41355 SDValue Insert =
41356 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
41357 return TLO.CombineTo(Op, Insert);
41359 // Zero upper elements.
41360 case X86ISD::VZEXT_MOVL:
41361 // Target unary shuffles by immediate:
41362 case X86ISD::PSHUFD:
41363 case X86ISD::PSHUFLW:
41364 case X86ISD::PSHUFHW:
41365 case X86ISD::VPERMILPI:
41366 // (Non-Lane Crossing) Target Shuffles.
41367 case X86ISD::VPERMILPV:
41368 case X86ISD::VPERMIL2:
41369 case X86ISD::PSHUFB:
41370 case X86ISD::UNPCKL:
41371 case X86ISD::UNPCKH:
41372 case X86ISD::BLENDI:
41373 // Integer ops.
41374 case X86ISD::PACKSS:
41375 case X86ISD::PACKUS:
41376 // Horizontal Ops.
41377 case X86ISD::HADD:
41378 case X86ISD::HSUB:
41379 case X86ISD::FHADD:
41380 case X86ISD::FHSUB: {
41381 SDLoc DL(Op);
41382 SmallVector<SDValue, 4> Ops;
41383 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
41384 SDValue SrcOp = Op.getOperand(i);
41385 EVT SrcVT = SrcOp.getValueType();
41386 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
41387 "Unsupported vector size");
41388 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
41389 ExtSizeInBits)
41390 : SrcOp);
41392 MVT ExtVT = VT.getSimpleVT();
41393 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
41394 ExtSizeInBits / ExtVT.getScalarSizeInBits());
41395 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
41396 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41397 SDValue Insert =
41398 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
41399 return TLO.CombineTo(Op, Insert);
41404 // For splats, unless we *only* demand the 0'th element, stop attempts at
41405 // simplification here; we aren't going to improve things, and this is
41406 // better than any potential shuffle.
41407 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
41408 return false;
41410 // Get target/faux shuffle mask.
41411 APInt OpUndef, OpZero;
41412 SmallVector<int, 64> OpMask;
41413 SmallVector<SDValue, 2> OpInputs;
41414 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
41415 OpZero, TLO.DAG, Depth, false))
41416 return false;
41418 // Shuffle inputs must be the same size as the result.
41419 if (OpMask.size() != (unsigned)NumElts ||
41420 llvm::any_of(OpInputs, [VT](SDValue V) {
41421 return VT.getSizeInBits() != V.getValueSizeInBits() ||
41422 !V.getValueType().isVector();
41424 return false;
41426 KnownZero = OpZero;
41427 KnownUndef = OpUndef;
41429 // Check if shuffle mask can be simplified to undef/zero/identity.
41430 int NumSrcs = OpInputs.size();
41431 for (int i = 0; i != NumElts; ++i)
41432 if (!DemandedElts[i])
41433 OpMask[i] = SM_SentinelUndef;
41435 if (isUndefInRange(OpMask, 0, NumElts)) {
41436 KnownUndef.setAllBits();
41437 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
41439 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
41440 KnownZero.setAllBits();
41441 return TLO.CombineTo(
41442 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41444 for (int Src = 0; Src != NumSrcs; ++Src)
41445 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
41446 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
41448 // Attempt to simplify inputs.
41449 for (int Src = 0; Src != NumSrcs; ++Src) {
41450 // TODO: Support inputs of different types.
41451 if (OpInputs[Src].getValueType() != VT)
41452 continue;
41454 int Lo = Src * NumElts;
41455 APInt SrcElts = APInt::getZero(NumElts);
41456 for (int i = 0; i != NumElts; ++i)
41457 if (DemandedElts[i]) {
41458 int M = OpMask[i] - Lo;
41459 if (0 <= M && M < NumElts)
41460 SrcElts.setBit(M);
41463 // TODO - Propagate input undef/zero elts.
41464 APInt SrcUndef, SrcZero;
41465 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
41466 TLO, Depth + 1))
41467 return true;
41470 // If we don't demand all elements, then attempt to combine to a simpler
41471 // shuffle.
41472 // We need to convert the depth to something combineX86ShufflesRecursively
41473 // can handle - so pretend its Depth == 0 again, and reduce the max depth
41474 // to match. This prevents combineX86ShuffleChain from returning a
41475 // combined shuffle that's the same as the original root, causing an
41476 // infinite loop.
41477 if (!DemandedElts.isAllOnes()) {
41478 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
41480 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
41481 for (int i = 0; i != NumElts; ++i)
41482 if (DemandedElts[i])
41483 DemandedMask[i] = i;
41485 SDValue NewShuffle = combineX86ShufflesRecursively(
41486 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
41487 /*HasVarMask*/ false,
41488 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
41489 Subtarget);
41490 if (NewShuffle)
41491 return TLO.CombineTo(Op, NewShuffle);
41494 return false;
41497 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
41498 SDValue Op, const APInt &OriginalDemandedBits,
41499 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
41500 unsigned Depth) const {
41501 EVT VT = Op.getValueType();
41502 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
41503 unsigned Opc = Op.getOpcode();
41504 switch(Opc) {
41505 case X86ISD::VTRUNC: {
41506 KnownBits KnownOp;
41507 SDValue Src = Op.getOperand(0);
41508 MVT SrcVT = Src.getSimpleValueType();
41510 // Simplify the input, using demanded bit information.
41511 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
41512 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
41513 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
41514 return true;
41515 break;
41517 case X86ISD::PMULDQ:
41518 case X86ISD::PMULUDQ: {
41519 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
41520 KnownBits KnownLHS, KnownRHS;
41521 SDValue LHS = Op.getOperand(0);
41522 SDValue RHS = Op.getOperand(1);
41524 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
41525 // FIXME: Can we bound this better?
41526 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
41527 APInt DemandedMaskLHS = APInt::getAllOnes(64);
41528 APInt DemandedMaskRHS = APInt::getAllOnes(64);
41530 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
41531 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
41532 DemandedMaskLHS = DemandedMask;
41533 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
41534 DemandedMaskRHS = DemandedMask;
41536 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
41537 KnownLHS, TLO, Depth + 1))
41538 return true;
41539 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
41540 KnownRHS, TLO, Depth + 1))
41541 return true;
41543 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
41544 KnownRHS = KnownRHS.trunc(32);
41545 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
41546 KnownRHS.getConstant().isOne()) {
41547 SDLoc DL(Op);
41548 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
41549 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
41552 // Aggressively peek through ops to get at the demanded low bits.
41553 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
41554 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
41555 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
41556 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
41557 if (DemandedLHS || DemandedRHS) {
41558 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
41559 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
41560 return TLO.CombineTo(
41561 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
41563 break;
41565 case X86ISD::ANDNP: {
41566 KnownBits Known2;
41567 SDValue Op0 = Op.getOperand(0);
41568 SDValue Op1 = Op.getOperand(1);
41570 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
41571 Known, TLO, Depth + 1))
41572 return true;
41573 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41575 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
41576 OriginalDemandedElts, Known2, TLO, Depth + 1))
41577 return true;
41578 assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
41580 // If the RHS is a constant, see if we can simplify it.
41581 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
41582 OriginalDemandedElts, TLO))
41583 return true;
41585 // ANDNP = (~Op0 & Op1);
41586 Known.One &= Known2.Zero;
41587 Known.Zero |= Known2.One;
41588 break;
41590 case X86ISD::VSHLI: {
41591 SDValue Op0 = Op.getOperand(0);
41593 unsigned ShAmt = Op.getConstantOperandVal(1);
41594 if (ShAmt >= BitWidth)
41595 break;
41597 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
41599 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
41600 // single shift. We can do this if the bottom bits (which are shifted
41601 // out) are never demanded.
41602 if (Op0.getOpcode() == X86ISD::VSRLI &&
41603 OriginalDemandedBits.countr_zero() >= ShAmt) {
41604 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
41605 if (Shift2Amt < BitWidth) {
41606 int Diff = ShAmt - Shift2Amt;
41607 if (Diff == 0)
41608 return TLO.CombineTo(Op, Op0.getOperand(0));
41610 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
41611 SDValue NewShift = TLO.DAG.getNode(
41612 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
41613 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
41614 return TLO.CombineTo(Op, NewShift);
41618 // If we are only demanding sign bits then we can use the shift source directly.
41619 unsigned NumSignBits =
41620 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
41621 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
41622 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
41623 return TLO.CombineTo(Op, Op0);
41625 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
41626 TLO, Depth + 1))
41627 return true;
41629 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41630 Known.Zero <<= ShAmt;
41631 Known.One <<= ShAmt;
41633 // Low bits known zero.
41634 Known.Zero.setLowBits(ShAmt);
41635 return false;
41637 case X86ISD::VSRLI: {
41638 unsigned ShAmt = Op.getConstantOperandVal(1);
41639 if (ShAmt >= BitWidth)
41640 break;
41642 APInt DemandedMask = OriginalDemandedBits << ShAmt;
41644 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
41645 OriginalDemandedElts, Known, TLO, Depth + 1))
41646 return true;
41648 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41649 Known.Zero.lshrInPlace(ShAmt);
41650 Known.One.lshrInPlace(ShAmt);
41652 // High bits known zero.
41653 Known.Zero.setHighBits(ShAmt);
41654 return false;
41656 case X86ISD::VSRAI: {
41657 SDValue Op0 = Op.getOperand(0);
41658 SDValue Op1 = Op.getOperand(1);
41660 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
41661 if (ShAmt >= BitWidth)
41662 break;
41664 APInt DemandedMask = OriginalDemandedBits << ShAmt;
41666 // If we just want the sign bit then we don't need to shift it.
41667 if (OriginalDemandedBits.isSignMask())
41668 return TLO.CombineTo(Op, Op0);
41670 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
41671 if (Op0.getOpcode() == X86ISD::VSHLI &&
41672 Op.getOperand(1) == Op0.getOperand(1)) {
41673 SDValue Op00 = Op0.getOperand(0);
41674 unsigned NumSignBits =
41675 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
41676 if (ShAmt < NumSignBits)
41677 return TLO.CombineTo(Op, Op00);
41680 // If any of the demanded bits are produced by the sign extension, we also
41681 // demand the input sign bit.
41682 if (OriginalDemandedBits.countl_zero() < ShAmt)
41683 DemandedMask.setSignBit();
41685 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
41686 TLO, Depth + 1))
41687 return true;
41689 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
41690 Known.Zero.lshrInPlace(ShAmt);
41691 Known.One.lshrInPlace(ShAmt);
41693 // If the input sign bit is known to be zero, or if none of the top bits
41694 // are demanded, turn this into an unsigned shift right.
41695 if (Known.Zero[BitWidth - ShAmt - 1] ||
41696 OriginalDemandedBits.countl_zero() >= ShAmt)
41697 return TLO.CombineTo(
41698 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
41700 // High bits are known one.
41701 if (Known.One[BitWidth - ShAmt - 1])
41702 Known.One.setHighBits(ShAmt);
41703 return false;
41705 case X86ISD::BLENDV: {
41706 SDValue Sel = Op.getOperand(0);
41707 SDValue LHS = Op.getOperand(1);
41708 SDValue RHS = Op.getOperand(2);
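// BLENDV only tests the sign bit of each selector element, so the sign bit
// is all we demand from Sel.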
41710 APInt SignMask = APInt::getSignMask(BitWidth);
41711 SDValue NewSel = SimplifyMultipleUseDemandedBits(
41712 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
41713 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
41714 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
41715 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
41716 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
41718 if (NewSel || NewLHS || NewRHS) {
41719 NewSel = NewSel ? NewSel : Sel;
41720 NewLHS = NewLHS ? NewLHS : LHS;
41721 NewRHS = NewRHS ? NewRHS : RHS;
41722 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
41723 NewSel, NewLHS, NewRHS));
41725 break;
41727 case X86ISD::PEXTRB:
41728 case X86ISD::PEXTRW: {
41729 SDValue Vec = Op.getOperand(0);
41730 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
41731 MVT VecVT = Vec.getSimpleValueType();
41732 unsigned NumVecElts = VecVT.getVectorNumElements();
41734 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
41735 unsigned Idx = CIdx->getZExtValue();
41736 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
41738 // If we demand no bits from the vector then we must have demanded
41739 // bits from the implicit zext - simplify to zero.
41740 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
41741 if (DemandedVecBits == 0)
41742 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41744 APInt KnownUndef, KnownZero;
41745 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
41746 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
41747 KnownZero, TLO, Depth + 1))
41748 return true;
41750 KnownBits KnownVec;
41751 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
41752 KnownVec, TLO, Depth + 1))
41753 return true;
41755 if (SDValue V = SimplifyMultipleUseDemandedBits(
41756 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
41757 return TLO.CombineTo(
41758 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
41760 Known = KnownVec.zext(BitWidth);
41761 return false;
41763 break;
41765 case X86ISD::PINSRB:
41766 case X86ISD::PINSRW: {
41767 SDValue Vec = Op.getOperand(0);
41768 SDValue Scl = Op.getOperand(1);
41769 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
41770 MVT VecVT = Vec.getSimpleValueType();
41772 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
41773 unsigned Idx = CIdx->getZExtValue();
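// If the inserted element isn't demanded, the insertion is a no-op and we
// can use the base vector directly.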
41774 if (!OriginalDemandedElts[Idx])
41775 return TLO.CombineTo(Op, Vec);
41777 KnownBits KnownVec;
41778 APInt DemandedVecElts(OriginalDemandedElts);
41779 DemandedVecElts.clearBit(Idx);
41780 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
41781 KnownVec, TLO, Depth + 1))
41782 return true;
41784 KnownBits KnownScl;
41785 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
41786 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
41787 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
41788 return true;
41790 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
41791 Known = KnownVec.intersectWith(KnownScl);
41792 return false;
41794 break;
41796 case X86ISD::PACKSS:
41797 // PACKSS saturates to MIN/MAX integer values. So if we just want the
41798 // sign bit then we can just ask for the source operand's sign bit.
41799 // TODO - add known bits handling.
41800 if (OriginalDemandedBits.isSignMask()) {
41801 APInt DemandedLHS, DemandedRHS;
41802 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
41804 KnownBits KnownLHS, KnownRHS;
41805 APInt SignMask = APInt::getSignMask(BitWidth * 2);
41806 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
41807 KnownLHS, TLO, Depth + 1))
41808 return true;
41809 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
41810 KnownRHS, TLO, Depth + 1))
41811 return true;
41813 // Attempt to avoid multi-use ops if we don't need anything from them.
41814 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
41815 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
41816 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
41817 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
41818 if (DemandedOp0 || DemandedOp1) {
41819 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
41820 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
41821 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
41824 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
41825 break;
41826 case X86ISD::VBROADCAST: {
41827 SDValue Src = Op.getOperand(0);
41828 MVT SrcVT = Src.getSimpleValueType();
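// A broadcast only reads element 0 of its source, so that is the only
// element we need to demand.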
41829 APInt DemandedElts = APInt::getOneBitSet(
41830 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
41831 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
41832 TLO, Depth + 1))
41833 return true;
41834 // If we don't need the upper bits, attempt to narrow the broadcast source.
41835 // Don't attempt this on AVX512 as it might affect broadcast folding.
41836 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
41837 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
41838 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
41839 Src->hasOneUse()) {
41840 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
41841 SDValue NewSrc =
41842 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
41843 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
41844 SDValue NewBcst =
41845 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
41846 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
41848 break;
41850 case X86ISD::PCMPGT:
41851 // icmp sgt(0, R) == ashr(R, BitWidth-1).
41852 // iff we only need the sign bit then we can use R directly.
41853 if (OriginalDemandedBits.isSignMask() &&
41854 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
41855 return TLO.CombineTo(Op, Op.getOperand(1));
41856 break;
41857 case X86ISD::MOVMSK: {
41858 SDValue Src = Op.getOperand(0);
41859 MVT SrcVT = Src.getSimpleValueType();
41860 unsigned SrcBits = SrcVT.getScalarSizeInBits();
41861 unsigned NumElts = SrcVT.getVectorNumElements();
41863 // If we don't need the sign bits at all just return zero.
41864 if (OriginalDemandedBits.countr_zero() >= NumElts)
41865 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41867 // See if we only demand bits from the lower 128-bit vector.
41868 if (SrcVT.is256BitVector() &&
41869 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
41870 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
41871 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41874 // Only demand the vector elements of the sign bits we need.
41875 APInt KnownUndef, KnownZero;
41876 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
41877 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
41878 TLO, Depth + 1))
41879 return true;
41881 Known.Zero = KnownZero.zext(BitWidth);
41882 Known.Zero.setHighBits(BitWidth - NumElts);
41884 // MOVMSK only uses the MSB from each vector element.
41885 KnownBits KnownSrc;
41886 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
41887 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
41888 Depth + 1))
41889 return true;
41891 if (KnownSrc.One[SrcBits - 1])
41892 Known.One.setLowBits(NumElts);
41893 else if (KnownSrc.Zero[SrcBits - 1])
41894 Known.Zero.setLowBits(NumElts);
41896 // Attempt to avoid a multi-use op if we don't need anything from it.
41897 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
41898 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
41899 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41900 return false;
41902 case X86ISD::TESTP: {
41903 SDValue Op0 = Op.getOperand(0);
41904 SDValue Op1 = Op.getOperand(1);
41905 MVT OpVT = Op0.getSimpleValueType();
41906 assert((OpVT.getVectorElementType() == MVT::f32 ||
41907 OpVT.getVectorElementType() == MVT::f64) &&
41908 "Illegal vector type for X86ISD::TESTP");
41910 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
41911 KnownBits KnownSrc;
41912 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
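// If both operands are the same node and this is its only user, we may
// simplify it as if it were single-use.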
41913 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
41914 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
41915 AssumeSingleUse) ||
41916 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
41917 AssumeSingleUse);
41919 case X86ISD::BEXTR:
41920 case X86ISD::BEXTRI: {
41921 SDValue Op0 = Op.getOperand(0);
41922 SDValue Op1 = Op.getOperand(1);
41924 // Only the bottom 16 bits of the control bits are required.
41925 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
41926 // NOTE: SimplifyDemandedBits won't do this for constants.
41927 uint64_t Val1 = Cst1->getZExtValue();
41928 uint64_t MaskedVal1 = Val1 & 0xFFFF;
41929 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
41930 SDLoc DL(Op);
41931 return TLO.CombineTo(
41932 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
41933 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
41936 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
41937 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
41939 // If the length is 0, the result is 0.
41940 if (Length == 0) {
41941 Known.setAllZero();
41942 return false;
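// BEXTR extracts Length bits starting at bit Shift, so those are the only
// bits we demand from the source.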
41945 if ((Shift + Length) <= BitWidth) {
41946 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
41947 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
41948 return true;
41950 Known = Known.extractBits(Length, Shift);
41951 Known = Known.zextOrTrunc(BitWidth);
41952 return false;
41954 } else {
41955 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
41956 KnownBits Known1;
41957 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
41958 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
41959 return true;
41961 // If the length is 0, replace with 0.
41962 KnownBits LengthBits = Known1.extractBits(8, 8);
41963 if (LengthBits.isZero())
41964 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
41967 break;
41969 case X86ISD::PDEP: {
41970 SDValue Op0 = Op.getOperand(0);
41971 SDValue Op1 = Op.getOperand(1);
41973 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
41974 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
41976 // If the demanded bits have leading zeroes, we don't demand those from the
41977 // mask.
41978 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
41979 return true;
41981 // The number of possible 1s in the mask determines the number of LSBs of
41982 // operand 0 used. Undemanded bits from the mask don't matter so filter
41983 // them before counting.
41984 KnownBits Known2;
41985 uint64_t Count = (~Known.Zero & LoMask).popcount();
41986 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
41987 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
41988 return true;
41990 // Zeroes are retained from the mask, but not ones.
41991 Known.One.clearAllBits();
41992 // The result will have at least as many trailing zeros as the non-mask
41993 // operand since bits can only map to the same or higher bit position.
41994 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
41995 return false;
41999 return TargetLowering::SimplifyDemandedBitsForTargetNode(
42000 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
42003 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
42004 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
42005 SelectionDAG &DAG, unsigned Depth) const {
42006 int NumElts = DemandedElts.getBitWidth();
42007 unsigned Opc = Op.getOpcode();
42008 EVT VT = Op.getValueType();
42010 switch (Opc) {
42011 case X86ISD::PINSRB:
42012 case X86ISD::PINSRW: {
42013 // If we don't demand the inserted element, return the base vector.
42014 SDValue Vec = Op.getOperand(0);
42015 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
42016 MVT VecVT = Vec.getSimpleValueType();
42017 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
42018 !DemandedElts[CIdx->getZExtValue()])
42019 return Vec;
42020 break;
42022 case X86ISD::VSHLI: {
42023 // If we are only demanding sign bits then we can use the shift source
42024 // directly.
42025 SDValue Op0 = Op.getOperand(0);
42026 unsigned ShAmt = Op.getConstantOperandVal(1);
42027 unsigned BitWidth = DemandedBits.getBitWidth();
42028 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
42029 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
42030 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42031 return Op0;
42032 break;
42034 case X86ISD::VSRAI:
42035 // iff we only need the sign bit then we can use the source directly.
42036 // TODO: generalize where we only demand extended signbits.
42037 if (DemandedBits.isSignMask())
42038 return Op.getOperand(0);
42039 break;
42040 case X86ISD::PCMPGT:
42041 // icmp sgt(0, R) == ashr(R, BitWidth-1).
42042 // iff we only need the sign bit then we can use R directly.
42043 if (DemandedBits.isSignMask() &&
42044 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
42045 return Op.getOperand(1);
42046 break;
42047 case X86ISD::BLENDV: {
42048 // BLENDV: Cond (MSB) ? LHS : RHS
42049 SDValue Cond = Op.getOperand(0);
42050 SDValue LHS = Op.getOperand(1);
42051 SDValue RHS = Op.getOperand(2);
42053 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
42054 if (CondKnown.isNegative())
42055 return LHS;
42056 if (CondKnown.isNonNegative())
42057 return RHS;
42058 break;
42060 case X86ISD::ANDNP: {
42061 // ANDNP = (~LHS & RHS);
42062 SDValue LHS = Op.getOperand(0);
42063 SDValue RHS = Op.getOperand(1);
42065 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
42066 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
42068 // If every demanded bit is known zero on either the LHS or the RHS, then the
42069 // result bit always equals the RHS bit (a zero RHS bit gives a zero result,
42070 // and a zero LHS bit makes ~LHS one), so return RHS.
42071 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
42072 return RHS;
42073 break;
42077 APInt ShuffleUndef, ShuffleZero;
42078 SmallVector<int, 16> ShuffleMask;
42079 SmallVector<SDValue, 2> ShuffleOps;
42080 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
42081 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
42082 // If all the demanded elts are from one operand and are inline,
42083 // then we can use the operand directly.
42084 int NumOps = ShuffleOps.size();
42085 if (ShuffleMask.size() == (unsigned)NumElts &&
42086 llvm::all_of(ShuffleOps, [VT](SDValue V) {
42087 return VT.getSizeInBits() == V.getValueSizeInBits();
42088 })) {
42090 if (DemandedElts.isSubsetOf(ShuffleUndef))
42091 return DAG.getUNDEF(VT);
42092 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
42093 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
42095 // Bitmask that indicates which ops have only been accessed 'inline'.
42096 APInt IdentityOp = APInt::getAllOnes(NumOps);
42097 for (int i = 0; i != NumElts; ++i) {
42098 int M = ShuffleMask[i];
42099 if (!DemandedElts[i] || ShuffleUndef[i])
42100 continue;
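// Decode which shuffle operand and which lane within it this mask entry
// refers to; any element that isn't in place disqualifies the identity match.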
42101 int OpIdx = M / NumElts;
42102 int EltIdx = M % NumElts;
42103 if (M < 0 || EltIdx != i) {
42104 IdentityOp.clearAllBits();
42105 break;
42107 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
42108 if (IdentityOp == 0)
42109 break;
42111 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
42112 "Multiple identity shuffles detected");
42114 if (IdentityOp != 0)
42115 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
42119 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
42120 Op, DemandedBits, DemandedElts, DAG, Depth);
42123 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
42124 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
42125 bool PoisonOnly, unsigned Depth) const {
42126 unsigned EltsBits = Op.getScalarValueSizeInBits();
42127 unsigned NumElts = DemandedElts.getBitWidth();
42129 // TODO: Add more target shuffles.
42130 switch (Op.getOpcode()) {
42131 case X86ISD::PSHUFD:
42132 case X86ISD::VPERMILPI: {
42133 SmallVector<int, 8> Mask;
42134 DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask);
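// Map each demanded result element back to the source element it is
// shuffled from.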
42136 APInt DemandedSrcElts = APInt::getZero(NumElts);
42137 for (unsigned I = 0; I != NumElts; ++I)
42138 if (DemandedElts[I])
42139 DemandedSrcElts.setBit(Mask[I]);
42141 return DAG.isGuaranteedNotToBeUndefOrPoison(
42142 Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1);
42145 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
42146 Op, DemandedElts, DAG, PoisonOnly, Depth);
42149 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
42150 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
42151 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
42153 // TODO: Add more target shuffles.
42154 switch (Op.getOpcode()) {
42155 case X86ISD::PSHUFD:
42156 case X86ISD::VPERMILPI:
42157 return false;
42159 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
42160 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
42163 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
42164 const APInt &DemandedElts,
42165 APInt &UndefElts,
42166 const SelectionDAG &DAG,
42167 unsigned Depth) const {
42168 unsigned NumElts = DemandedElts.getBitWidth();
42169 unsigned Opc = Op.getOpcode();
42171 switch (Opc) {
42172 case X86ISD::VBROADCAST:
42173 case X86ISD::VBROADCAST_LOAD:
42174 UndefElts = APInt::getZero(NumElts);
42175 return true;
42178 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
42179 DAG, Depth);
42182 // Helper to peek through bitops/trunc/setcc to determine size of source vector.
42183 // Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
42184 static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
42185 bool AllowTruncate) {
42186 switch (Src.getOpcode()) {
42187 case ISD::TRUNCATE:
42188 if (!AllowTruncate)
42189 return false;
42190 [[fallthrough]];
42191 case ISD::SETCC:
42192 return Src.getOperand(0).getValueSizeInBits() == Size;
42193 case ISD::AND:
42194 case ISD::XOR:
42195 case ISD::OR:
42196 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
42197 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
42198 case ISD::SELECT:
42199 case ISD::VSELECT:
42200 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
42201 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
42202 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
42203 case ISD::BUILD_VECTOR:
42204 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
42205 ISD::isBuildVectorAllOnes(Src.getNode());
42207 return false;
42210 // Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
42211 static unsigned getAltBitOpcode(unsigned Opcode) {
42212 switch(Opcode) {
42213 case ISD::AND: return X86ISD::FAND;
42214 case ISD::OR: return X86ISD::FOR;
42215 case ISD::XOR: return X86ISD::FXOR;
42216 case X86ISD::ANDNP: return X86ISD::FANDN;
42218 llvm_unreachable("Unknown bitwise opcode");
42221 // Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
42222 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
42223 const SDLoc &DL) {
42224 EVT SrcVT = Src.getValueType();
42225 if (SrcVT != MVT::v4i1)
42226 return SDValue();
42228 switch (Src.getOpcode()) {
42229 case ISD::SETCC:
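// A v4i32 signed compare against zero only tests the sign bits, which is
// exactly what MOVMSKPS reads from the equivalent v4f32 value.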
42230 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
42231 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
42232 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
42233 SDValue Op0 = Src.getOperand(0);
42234 if (ISD::isNormalLoad(Op0.getNode()))
42235 return DAG.getBitcast(MVT::v4f32, Op0);
42236 if (Op0.getOpcode() == ISD::BITCAST &&
42237 Op0.getOperand(0).getValueType() == MVT::v4f32)
42238 return Op0.getOperand(0);
42240 break;
42241 case ISD::AND:
42242 case ISD::XOR:
42243 case ISD::OR: {
42244 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
42245 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
42246 if (Op0 && Op1)
42247 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
42248 Op1);
42249 break;
42252 return SDValue();
42255 // Helper to push sign extension of vXi1 SETCC result through bitops.
42256 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
42257 SDValue Src, const SDLoc &DL) {
42258 switch (Src.getOpcode()) {
42259 case ISD::SETCC:
42260 case ISD::TRUNCATE:
42261 case ISD::BUILD_VECTOR:
42262 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
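// For logic ops, push the sign extension through to both operands so the
// bitwise operation is performed in the wider type.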
42263 case ISD::AND:
42264 case ISD::XOR:
42265 case ISD::OR:
42266 return DAG.getNode(
42267 Src.getOpcode(), DL, SExtVT,
42268 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
42269 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
42270 case ISD::SELECT:
42271 case ISD::VSELECT:
42272 return DAG.getSelect(
42273 DL, SExtVT, Src.getOperand(0),
42274 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
42275 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
42277 llvm_unreachable("Unexpected node type for vXi1 sign extension");
42280 // Try to match patterns such as
42281 // (i16 bitcast (v16i1 x))
42282 // ->
42283 // (i16 movmsk (v16i8 sext (v16i1 x)))
42284 // before the illegal vector is scalarized on subtargets that don't have legal
42285 // vxi1 types.
42286 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
42287 const SDLoc &DL,
42288 const X86Subtarget &Subtarget) {
42289 EVT SrcVT = Src.getValueType();
42290 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
42291 return SDValue();
42293 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
42294 // legalization destroys the v4i32 type.
42295 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
42296 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
42297 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
42298 DAG.getBitcast(MVT::v4f32, V));
42299 return DAG.getZExtOrTrunc(V, DL, VT);
42303 // If the input is a truncate from v16i8, v32i8 or v64i8, go ahead and use a
42304 // movmskb even with avx512. This will be better than truncating to vXi1 and
42305 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
42306 // vpcmpeqb/vpcmpgtb.
42307 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
42308 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
42309 Src.getOperand(0).getValueType() == MVT::v32i8 ||
42310 Src.getOperand(0).getValueType() == MVT::v64i8);
42312 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
42313 // directly with vpmovmskb/vmovmskps/vmovmskpd.
42314 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
42315 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
42316 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
42317 EVT CmpVT = Src.getOperand(0).getValueType();
42318 EVT EltVT = CmpVT.getVectorElementType();
42319 if (CmpVT.getSizeInBits() <= 256 &&
42320 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
42321 PreferMovMsk = true;
42324 // With AVX512 vxi1 types are legal and we prefer using k-regs.
42325 // MOVMSK is supported in SSE2 or later.
42326 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
42327 return SDValue();
42329 // If the upper ops of a concatenation are undef, then try to bitcast the
42330 // lower op and extend.
42331 SmallVector<SDValue, 4> SubSrcOps;
42332 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
42333 SubSrcOps.size() >= 2) {
42334 SDValue LowerOp = SubSrcOps[0];
42335 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
42336 if (LowerOp.getOpcode() == ISD::SETCC &&
42337 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
42338 EVT SubVT = VT.getIntegerVT(
42339 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
42340 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
42341 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
42342 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
42347 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
42348 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
42349 // v8i16 and v16i16.
42350 // For these two cases, we can shuffle the upper element bytes to a
42351 // consecutive sequence at the start of the vector and treat the results as
42352 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
42353 // for v16i16 this is not the case, because the shuffle is expensive, so we
42354 // avoid sign-extending to this type entirely.
42355 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
42356 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
42357 MVT SExtVT;
42358 bool PropagateSExt = false;
42359 switch (SrcVT.getSimpleVT().SimpleTy) {
42360 default:
42361 return SDValue();
42362 case MVT::v2i1:
42363 SExtVT = MVT::v2i64;
42364 break;
42365 case MVT::v4i1:
42366 SExtVT = MVT::v4i32;
42367 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
42368 // sign-extend to a 256-bit operation to avoid truncation.
42369 if (Subtarget.hasAVX() &&
42370 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
42371 SExtVT = MVT::v4i64;
42372 PropagateSExt = true;
42374 break;
42375 case MVT::v8i1:
42376 SExtVT = MVT::v8i16;
42377 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
42378 // sign-extend to a 256-bit operation to match the compare.
42379 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
42380 // 256-bit because the shuffle is cheaper than sign extending the result of
42381 // the compare.
42382 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
42383 checkBitcastSrcVectorSize(Src, 512, true))) {
42384 SExtVT = MVT::v8i32;
42385 PropagateSExt = true;
42387 break;
42388 case MVT::v16i1:
42389 SExtVT = MVT::v16i8;
42390 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
42391 // it is not profitable to sign-extend to 256-bit because this will
42392 // require an extra cross-lane shuffle which is more expensive than
42393 // truncating the result of the compare to 128-bits.
42394 break;
42395 case MVT::v32i1:
42396 SExtVT = MVT::v32i8;
42397 break;
42398 case MVT::v64i1:
42399 // If we have AVX512F but not AVX512BW, and the input is truncated from
42400 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
42401 if (Subtarget.hasAVX512()) {
42402 if (Subtarget.hasBWI())
42403 return SDValue();
42404 SExtVT = MVT::v64i8;
42405 break;
42407 // Split if this is a <64 x i8> comparison result.
42408 if (checkBitcastSrcVectorSize(Src, 512, false)) {
42409 SExtVT = MVT::v64i8;
42410 break;
42412 return SDValue();
42415 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
42416 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
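// vXi8 results can use PMOVMSKB directly; other types go through MOVMSK,
// with v8i16 first widened and truncated down to v16i8.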
42418 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
42419 V = getPMOVMSKB(DL, V, DAG, Subtarget);
42420 } else {
42421 if (SExtVT == MVT::v8i16) {
42422 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
42423 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
42425 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
42428 EVT IntVT =
42429 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
42430 V = DAG.getZExtOrTrunc(V, DL, IntVT);
42431 return DAG.getBitcast(VT, V);
42434 // Convert a vXi1 constant build vector to the same width scalar integer.
42435 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
42436 EVT SrcVT = Op.getValueType();
42437 assert(SrcVT.getVectorElementType() == MVT::i1 &&
42438 "Expected a vXi1 vector");
42439 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
42440 "Expected a constant build vector");
42442 APInt Imm(SrcVT.getVectorNumElements(), 0);
42443 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
42444 SDValue In = Op.getOperand(Idx);
42445 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
42446 Imm.setBit(Idx);
42448 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
42449 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
42452 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
42453 TargetLowering::DAGCombinerInfo &DCI,
42454 const X86Subtarget &Subtarget) {
42455 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
42457 if (!DCI.isBeforeLegalizeOps())
42458 return SDValue();
42460 // Only do this if we have k-registers.
42461 if (!Subtarget.hasAVX512())
42462 return SDValue();
42464 EVT DstVT = N->getValueType(0);
42465 SDValue Op = N->getOperand(0);
42466 EVT SrcVT = Op.getValueType();
42468 if (!Op.hasOneUse())
42469 return SDValue();
42471 // Look for logic ops.
42472 if (Op.getOpcode() != ISD::AND &&
42473 Op.getOpcode() != ISD::OR &&
42474 Op.getOpcode() != ISD::XOR)
42475 return SDValue();
42477 // Make sure we have a bitcast between mask registers and a scalar type.
42478 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
42479 DstVT.isScalarInteger()) &&
42480 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
42481 SrcVT.isScalarInteger()))
42482 return SDValue();
42484 SDValue LHS = Op.getOperand(0);
42485 SDValue RHS = Op.getOperand(1);
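// If either operand is already a bitcast from the destination type, move the
// logic op across the bitcast and cast the other operand instead.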
42487 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
42488 LHS.getOperand(0).getValueType() == DstVT)
42489 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
42490 DAG.getBitcast(DstVT, RHS));
42492 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
42493 RHS.getOperand(0).getValueType() == DstVT)
42494 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
42495 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
42497 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
42498 // Most of these have to move a constant from the scalar domain anyway.
42499 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
42500 RHS = combinevXi1ConstantToInteger(RHS, DAG);
42501 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
42502 DAG.getBitcast(DstVT, LHS), RHS);
42505 return SDValue();
42508 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
42509 const X86Subtarget &Subtarget) {
42510 SDLoc DL(BV);
42511 unsigned NumElts = BV->getNumOperands();
42512 SDValue Splat = BV->getSplatValue();
42514 // Build MMX element from integer GPR or SSE float values.
42515 auto CreateMMXElement = [&](SDValue V) {
42516 if (V.isUndef())
42517 return DAG.getUNDEF(MVT::x86mmx);
42518 if (V.getValueType().isFloatingPoint()) {
42519 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
42520 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
42521 V = DAG.getBitcast(MVT::v2i64, V);
42522 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
42524 V = DAG.getBitcast(MVT::i32, V);
42525 } else {
42526 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
42528 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
42531 // Convert build vector ops to MMX data in the bottom elements.
42532 SmallVector<SDValue, 8> Ops;
42534 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42536 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
42537 if (Splat) {
42538 if (Splat.isUndef())
42539 return DAG.getUNDEF(MVT::x86mmx);
42541 Splat = CreateMMXElement(Splat);
42543 if (Subtarget.hasSSE1()) {
42544 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
42545 if (NumElts == 8)
42546 Splat = DAG.getNode(
42547 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
42548 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
42549 TLI.getPointerTy(DAG.getDataLayout())),
42550 Splat, Splat);
42552 // Use PSHUFW to repeat 16-bit elements.
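// Shuffle immediate 0x00 splats 16-bit element 0 to all lanes; 0x44 repeats
// elements 0 and 1 (the two halves of a single 32-bit element).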
42553 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
42554 return DAG.getNode(
42555 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
42556 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
42557 TLI.getPointerTy(DAG.getDataLayout())),
42558 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
42560 Ops.append(NumElts, Splat);
42561 } else {
42562 for (unsigned i = 0; i != NumElts; ++i)
42563 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
42566 // Use tree of PUNPCKLs to build up general MMX vector.
42567 while (Ops.size() > 1) {
42568 unsigned NumOps = Ops.size();
42569 unsigned IntrinOp =
42570 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
42571 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
42572 : Intrinsic::x86_mmx_punpcklbw));
42573 SDValue Intrin = DAG.getTargetConstant(
42574 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
42575 for (unsigned i = 0; i != NumOps; i += 2)
42576 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
42577 Ops[i], Ops[i + 1]);
42578 Ops.resize(NumOps / 2);
42581 return Ops[0];
42584 // Recursive function that attempts to find if a bool vector node was originally
42585 // a vector/float/double that got truncated/extended/bitcast to/from a scalar
42586 // integer. If so, replace the scalar ops with bool vector equivalents back down
42587 // the chain.
42588 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
42589 SelectionDAG &DAG,
42590 const X86Subtarget &Subtarget) {
42591 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42592 unsigned Opc = V.getOpcode();
42593 switch (Opc) {
42594 case ISD::BITCAST: {
42595 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
42596 SDValue Src = V.getOperand(0);
42597 EVT SrcVT = Src.getValueType();
42598 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
42599 return DAG.getBitcast(VT, Src);
42600 break;
42602 case ISD::TRUNCATE: {
42603 // If we find a suitable source, a truncated scalar becomes a subvector.
42604 SDValue Src = V.getOperand(0);
42605 EVT NewSrcVT =
42606 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
42607 if (TLI.isTypeLegal(NewSrcVT))
42608 if (SDValue N0 =
42609 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
42610 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
42611 DAG.getIntPtrConstant(0, DL));
42612 break;
42614 case ISD::ANY_EXTEND:
42615 case ISD::ZERO_EXTEND: {
42616 // If we find a suitable source, an extended scalar becomes a subvector.
42617 SDValue Src = V.getOperand(0);
42618 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
42619 Src.getScalarValueSizeInBits());
42620 if (TLI.isTypeLegal(NewSrcVT))
42621 if (SDValue N0 =
42622 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
42623 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42624 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
42625 : DAG.getConstant(0, DL, VT),
42626 N0, DAG.getIntPtrConstant(0, DL));
42627 break;
42629 case ISD::OR: {
42630 // If we find suitable sources, we can just move an OR to the vector domain.
42631 SDValue Src0 = V.getOperand(0);
42632 SDValue Src1 = V.getOperand(1);
42633 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
42634 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
42635 return DAG.getNode(Opc, DL, VT, N0, N1);
42636 break;
42638 case ISD::SHL: {
42639 // If we find a suitable source, a SHL becomes a KSHIFTL.
42640 SDValue Src0 = V.getOperand(0);
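// KSHIFT of a v8i1 mask needs the byte form (AVX512DQ), and v32i1/v64i1
// masks need AVX512BW.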
42641 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
42642 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
42643 break;
42645 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
42646 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
42647 return DAG.getNode(
42648 X86ISD::KSHIFTL, DL, VT, N0,
42649 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
42650 break;
42653 return SDValue();
42656 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
42657 TargetLowering::DAGCombinerInfo &DCI,
42658 const X86Subtarget &Subtarget) {
42659 SDValue N0 = N->getOperand(0);
42660 EVT VT = N->getValueType(0);
42661 EVT SrcVT = N0.getValueType();
42662 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42664 // Try to match patterns such as
42665 // (i16 bitcast (v16i1 x))
42666 // ->
42667 // (i16 movmsk (v16i8 sext (v16i1 x)))
42668 // before the setcc result is scalarized on subtargets that don't have legal
42669 // vxi1 types.
42670 if (DCI.isBeforeLegalize()) {
42671 SDLoc dl(N);
42672 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
42673 return V;
42675 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
42676 // type, widen both sides to avoid a trip through memory.
42677 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
42678 Subtarget.hasAVX512()) {
42679 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
42680 N0 = DAG.getBitcast(MVT::v8i1, N0);
42681 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
42682 DAG.getIntPtrConstant(0, dl));
42685 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
42686 // type, widen both sides to avoid a trip through memory.
42687 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
42688 Subtarget.hasAVX512()) {
42689 // Use zeros for the widening if we already have some zeroes. This can
42690 // allow SimplifyDemandedBits to remove scalar ANDs that may be
42691 // downstream of this.
42692 // FIXME: It might make sense to detect a concat_vectors with a mix of
42693 // zeroes and undef and turn it into insert_subvector for i1 vectors as
42694 // a separate combine. What we can't do is canonicalize the operands of
42695 // such a concat or we'll get into a loop with SimplifyDemandedBits.
42696 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
42697 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
42698 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
42699 SrcVT = LastOp.getValueType();
42700 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
42701 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
42702 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
42703 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
42704 N0 = DAG.getBitcast(MVT::i8, N0);
42705 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
42709 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
42710 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
42711 Ops[0] = N0;
42712 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
42713 N0 = DAG.getBitcast(MVT::i8, N0);
42714 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
42716 } else {
42717 // If we're bitcasting from iX to vXi1, see if the integer originally
42718 // began as a vXi1 and whether we can remove the bitcast entirely.
42719 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
42720 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
42721 if (SDValue V =
42722 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
42723 return V;
42727 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
42728 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
42729 // due to insert_subvector legalization on KNL. By promoting the copy to i16
42730 // we can help with known bits propagation from the vXi1 domain to the
42731 // scalar domain.
42732 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
42733 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42734 N0.getOperand(0).getValueType() == MVT::v16i1 &&
42735 isNullConstant(N0.getOperand(1)))
42736 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
42737 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
42739 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
42740 // and the vbroadcast_load are both integer or both fp. In some cases this
42741 // will remove the bitcast entirely.
42742 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
42743 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
42744 auto *BCast = cast<MemIntrinsicSDNode>(N0);
42745 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
42746 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
42747 // Don't swap i8/i16 since we don't have fp types of that size.
42748 if (MemSize >= 32) {
42749 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
42750 : MVT::getIntegerVT(MemSize);
42751 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
42752 : MVT::getIntegerVT(SrcVTSize);
42753 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
42755 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
42756 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
42757 SDValue ResNode =
42758 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
42759 MemVT, BCast->getMemOperand());
42760 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
42761 return DAG.getBitcast(VT, ResNode);
42765 // Since MMX types are special and don't usually play with other vector types,
42766 // it's better to handle them early to be sure we emit efficient code by
42767 // avoiding store-load conversions.
42768 if (VT == MVT::x86mmx) {
42769 // Detect MMX constant vectors.
42770 APInt UndefElts;
42771 SmallVector<APInt, 1> EltBits;
42772 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
42773 SDLoc DL(N0);
42774 // Handle zero-extension of i32 with MOVD.
42775 if (EltBits[0].countl_zero() >= 32)
42776 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
42777 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
42778 // Else, bitcast to a double.
42779 // TODO - investigate supporting sext 32-bit immediates on x86_64.
42780 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
42781 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
42784 // Detect bitcasts to x86mmx low word.
42785 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
42786 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
42787 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
42788 bool LowUndef = true, AllUndefOrZero = true;
42789 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
42790 SDValue Op = N0.getOperand(i);
42791 LowUndef &= Op.isUndef() || (i >= e/2);
42792 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
42794 if (AllUndefOrZero) {
42795 SDValue N00 = N0.getOperand(0);
42796 SDLoc dl(N00);
42797 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
42798 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
42799 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
42803 // Detect bitcasts of 64-bit build vectors and convert to a
42804 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
42805 // lowest element.
42806 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
42807 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
42808 SrcVT == MVT::v8i8))
42809 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
42811 // Detect bitcasts between element or subvector extraction to x86mmx.
42812 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
42813 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
42814 isNullConstant(N0.getOperand(1))) {
42815 SDValue N00 = N0.getOperand(0);
42816 if (N00.getValueType().is128BitVector())
42817 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
42818 DAG.getBitcast(MVT::v2i64, N00));
42821 // Detect bitcasts from FP_TO_SINT to x86mmx.
42822 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
42823 SDLoc DL(N0);
42824 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
42825 DAG.getUNDEF(MVT::v2i32));
42826 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
42827 DAG.getBitcast(MVT::v2i64, Res));
42831 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
42832 // most of these to scalar anyway.
42833 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
42834 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
42835 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
42836 return combinevXi1ConstantToInteger(N0, DAG);
42839 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
42840 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
42841 isa<ConstantSDNode>(N0)) {
42842 auto *C = cast<ConstantSDNode>(N0);
42843 if (C->isAllOnes())
42844 return DAG.getConstant(1, SDLoc(N0), VT);
42845 if (C->isZero())
42846 return DAG.getConstant(0, SDLoc(N0), VT);
42849 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
42850 // Turn it into a sign bit compare that produces a k-register. This avoids
42851 // a trip through a GPR.
42852 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
42853 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
42854 isPowerOf2_32(VT.getVectorNumElements())) {
42855 unsigned NumElts = VT.getVectorNumElements();
42856 SDValue Src = N0;
42858 // Peek through truncate.
42859 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
42860 Src = N0.getOperand(0);
42862 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
42863 SDValue MovmskIn = Src.getOperand(0);
42864 MVT MovmskVT = MovmskIn.getSimpleValueType();
42865 unsigned MovMskElts = MovmskVT.getVectorNumElements();
42867 // We allow extra bits of the movmsk to be used since they are known zero.
42868 // We can't convert a VPMOVMSKB without avx512bw.
42869 if (MovMskElts <= NumElts &&
42870 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
42871 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
42872 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
42873 SDLoc dl(N);
42874 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
42875 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
42876 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
42877 if (EVT(CmpVT) == VT)
42878 return Cmp;
42880 // Pad with zeroes up to original VT to replace the zeroes that were
42881 // being used from the MOVMSK.
42882 unsigned NumConcats = NumElts / MovMskElts;
42883 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
42884 Ops[0] = Cmp;
42885 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
42890 // Try to remove bitcasts from input and output of mask arithmetic to
42891 // remove GPR<->K-register crossings.
42892 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
42893 return V;
42895 // Convert a bitcasted integer logic operation that has one bitcasted
42896 // floating-point operand into a floating-point logic operation. This may
42897 // create a load of a constant, but that is cheaper than materializing the
42898 // constant in an integer register and transferring it to an SSE register or
42899 // transferring the SSE operand to integer register and back.
42900 unsigned FPOpcode;
42901 switch (N0.getOpcode()) {
42902 case ISD::AND: FPOpcode = X86ISD::FAND; break;
42903 case ISD::OR: FPOpcode = X86ISD::FOR; break;
42904 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
42905 default: return SDValue();
42908 // Check if we have a bitcast from another integer type as well.
42909 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
42910 (Subtarget.hasSSE2() && VT == MVT::f64) ||
42911 (Subtarget.hasFP16() && VT == MVT::f16) ||
42912 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
42913 TLI.isTypeLegal(VT))))
42914 return SDValue();
42916 SDValue LogicOp0 = N0.getOperand(0);
42917 SDValue LogicOp1 = N0.getOperand(1);
42918 SDLoc DL0(N0);
42920 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
42921 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
42922 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
42923 LogicOp0.getOperand(0).getValueType() == VT &&
42924 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
42925 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
42926 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
42927 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
42929 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
42930 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
42931 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
42932 LogicOp1.getOperand(0).getValueType() == VT &&
42933 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
42934 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
42935 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
42936 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
42939 return SDValue();
42942 // (mul (zext a), (sext b))
42943 static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
42944 SDValue &Op1) {
42945 Op0 = Mul.getOperand(0);
42946 Op1 = Mul.getOperand(1);
42948 // Canonicalize so that Op1 is the sign-extended operand.
42949 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
42950 std::swap(Op0, Op1);
42952 auto IsFreeTruncation = [](SDValue &Op) -> bool {
42953 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
42954 Op.getOpcode() == ISD::SIGN_EXTEND) &&
42955 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
42956 return true;
42958 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
42959 return (BV && BV->isConstant());
42962 // (dpbusd (zext a), (sext b)). The first operand must be an unsigned value,
42963 // so we check that Op0 is zero-extended (at most 8 active bits); Op1 must be
42964 // a signed value, so we just check its significant (sign) bits.
42965 if ((IsFreeTruncation(Op0) &&
42966 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
42967 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
42968 return true;
42970 return false;
42973 // Given an ABS node, detect the following pattern:
42974 // (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
42975 // This is useful as it is the input into a SAD pattern.
42976 static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
42977 SDValue AbsOp1 = Abs->getOperand(0);
42978 if (AbsOp1.getOpcode() != ISD::SUB)
42979 return false;
42981 Op0 = AbsOp1.getOperand(0);
42982 Op1 = AbsOp1.getOperand(1);
42984 // Check if the operands of the sub are zero-extended from vectors of i8.
42985 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
42986 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
42987 Op1.getOpcode() != ISD::ZERO_EXTEND ||
42988 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
42989 return false;
42991 return true;
42994 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
42995 unsigned &LogBias, const SDLoc &DL,
42996 const X86Subtarget &Subtarget) {
42997 // Extend or truncate to MVT::i8 first.
42998 MVT Vi8VT =
42999 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
43000 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
43001 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
43003 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
43004 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
43005 // The src A, B element type is i8, but the dst C element type is i32.
43006 // When we calculate the reduction stages we use the src vector type vXi8,
43007 // so we need a log-bias of 2 to avoid 2 extra stages.
43008 LogBias = 2;
43010 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
43011 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
43012 RegSize = std::max(512u, RegSize);
43014 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
43015 // fill in the missing vector elements with 0.
43016 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
43017 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
43018 Ops[0] = LHS;
43019 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
43020 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43021 Ops[0] = RHS;
43022 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43024 // Actually build the DotProduct, split as 256/512 bits for
43025 // AVXVNNI/AVX512VNNI.
43026 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43027 ArrayRef<SDValue> Ops) {
43028 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
43029 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
43031 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
43032 SDValue Zero = DAG.getConstant(0, DL, DpVT);
43034 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
43035 DpBuilder, false);
43038 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
43039 // to these zexts.
43040 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
43041 const SDValue &Zext1, const SDLoc &DL,
43042 const X86Subtarget &Subtarget) {
43043 // Find the appropriate width for the PSADBW.
43044 EVT InVT = Zext0.getOperand(0).getValueType();
43045 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
43047 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
43048 // fill in the missing vector elements with 0.
43049 unsigned NumConcat = RegSize / InVT.getSizeInBits();
43050 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
43051 Ops[0] = Zext0.getOperand(0);
43052 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
43053 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43054 Ops[0] = Zext1.getOperand(0);
43055 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43057 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
43058 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43059 ArrayRef<SDValue> Ops) {
43060 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
43061 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
43063 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
43064 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
43065 PSADBWBuilder);
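// Illustrative note on the PSADBW created above (not from the original
// source): PSADBW computes, per 64-bit lane, the sum of absolute differences
// of 8 unsigned byte pairs, e.g.
//   R[0] = |A[0]-B[0]| + |A[1]-B[1]| + ... + |A[7]-B[7]|
// which is why the add-reduction pyramid on top of it can skip
// log2(8) == 3 stages (see the 'Stages > 3' check in combineBasicSADPattern
// below).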
43068 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
43069 // PHMINPOSUW.
43070 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
43071 const X86Subtarget &Subtarget) {
43072 // Bail without SSE41.
43073 if (!Subtarget.hasSSE41())
43074 return SDValue();
43076 EVT ExtractVT = Extract->getValueType(0);
43077 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
43078 return SDValue();
43080 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
43081 ISD::NodeType BinOp;
43082 SDValue Src = DAG.matchBinOpReduction(
43083 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
43084 if (!Src)
43085 return SDValue();
43087 EVT SrcVT = Src.getValueType();
43088 EVT SrcSVT = SrcVT.getScalarType();
43089 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
43090 return SDValue();
43092 SDLoc DL(Extract);
43093 SDValue MinPos = Src;
43095 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
43096 while (SrcVT.getSizeInBits() > 128) {
43097 SDValue Lo, Hi;
43098 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
43099 SrcVT = Lo.getValueType();
43100 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
43102 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
43103 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
43104 "Unexpected value type");
43106 // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
43107 // to flip the value accordingly.
43108 SDValue Mask;
43109 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
43110 if (BinOp == ISD::SMAX)
43111 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
43112 else if (BinOp == ISD::SMIN)
43113 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
43114 else if (BinOp == ISD::UMAX)
43115 Mask = DAG.getAllOnesConstant(DL, SrcVT);
43117 if (Mask)
43118 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
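// Illustrative example (not from the original source): for a v8i16 UMAX
// reduction the all-ones mask turns each element x into ~x, and
// umin(~x, ~y) == ~umax(x, y), so PHMINPOSUW's unsigned-min search finds the
// maximum; the second XOR below undoes the flip. The SMAX/SMIN masks
// similarly remap signed order onto unsigned order before the UMIN.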
43120 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
43121 // shuffling each upper element down and inserting zeros. This means that the
43122 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
43123 // ready for the PHMINPOS.
43124 if (ExtractVT == MVT::i8) {
43125 SDValue Upper = DAG.getVectorShuffle(
43126 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
43127 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
43128 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
43131 // Perform the PHMINPOS on a v8i16 vector.
43132 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
43133 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
43134 MinPos = DAG.getBitcast(SrcVT, MinPos);
43136 if (Mask)
43137 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
43139 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
43140 DAG.getIntPtrConstant(0, DL));
43143 // Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
43144 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
43145 const X86Subtarget &Subtarget) {
43146 // Bail without SSE2.
43147 if (!Subtarget.hasSSE2())
43148 return SDValue();
43150 EVT ExtractVT = Extract->getValueType(0);
43151 unsigned BitWidth = ExtractVT.getSizeInBits();
43152 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
43153 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
43154 return SDValue();
43156 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
43157 ISD::NodeType BinOp;
43158 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
43159 if (!Match && ExtractVT == MVT::i1)
43160 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
43161 if (!Match)
43162 return SDValue();
43164 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element,
43165 // which we can't support here for now.
43166 if (Match.getScalarValueSizeInBits() != BitWidth)
43167 return SDValue();
43169 SDValue Movmsk;
43170 SDLoc DL(Extract);
43171 EVT MatchVT = Match.getValueType();
43172 unsigned NumElts = MatchVT.getVectorNumElements();
43173 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
43174 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43175 LLVMContext &Ctx = *DAG.getContext();
43177 if (ExtractVT == MVT::i1) {
43178 // Special case for (pre-legalization) vXi1 reductions.
43179 if (NumElts > 64 || !isPowerOf2_32(NumElts))
43180 return SDValue();
43181 if (Match.getOpcode() == ISD::SETCC) {
43182 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
43183 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
43184 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
43185 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
43186 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
43187 X86::CondCode X86CC;
43188 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
43189 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
43190 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
43191 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
43192 DAG, X86CC))
43193 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
43194 getSETCC(X86CC, V, DL, DAG));
43197 if (TLI.isTypeLegal(MatchVT)) {
43198 // If this is a legal AVX512 predicate type then we can just bitcast.
43199 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
43200 Movmsk = DAG.getBitcast(MovmskVT, Match);
43201 } else {
43202 // Use combineBitcastvxi1 to create the MOVMSK.
43203 while (NumElts > MaxElts) {
43204 SDValue Lo, Hi;
43205 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
43206 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
43207 NumElts /= 2;
43209 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
43210 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
43212 if (!Movmsk)
43213 return SDValue();
43214 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
43215 } else {
43216 // FIXME: Better handling of k-registers or 512-bit vectors?
43217 unsigned MatchSizeInBits = Match.getValueSizeInBits();
43218 if (!(MatchSizeInBits == 128 ||
43219 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
43220 return SDValue();
43222 // Make sure this isn't a vector of 1 element. The perf win from using
43223 // MOVMSK diminishes with fewer elements in the reduction, but it is
43224 // generally better to get the comparison over to the GPRs as soon as
43225 // possible to reduce the number of vector ops.
43226 if (Match.getValueType().getVectorNumElements() < 2)
43227 return SDValue();
43229 // Check that we are extracting a reduction of all sign bits.
43230 if (DAG.ComputeNumSignBits(Match) != BitWidth)
43231 return SDValue();
43233 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
43234 SDValue Lo, Hi;
43235 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
43236 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
43237 MatchSizeInBits = Match.getValueSizeInBits();
43240 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
43241 MVT MaskSrcVT;
43242 if (64 == BitWidth || 32 == BitWidth)
43243 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
43244 MatchSizeInBits / BitWidth);
43245 else
43246 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
43248 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
43249 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
43250 NumElts = MaskSrcVT.getVectorNumElements();
43252 assert((NumElts <= 32 || NumElts == 64) &&
43253 "Not expecting more than 64 elements");
43255 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
43256 if (BinOp == ISD::XOR) {
43257 // parity -> (PARITY(MOVMSK X))
43258 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
43259 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
43262 SDValue CmpC;
43263 ISD::CondCode CondCode;
43264 if (BinOp == ISD::OR) {
43265 // any_of -> MOVMSK != 0
43266 CmpC = DAG.getConstant(0, DL, CmpVT);
43267 CondCode = ISD::CondCode::SETNE;
43268 } else {
43269 // all_of -> MOVMSK == ((1 << NumElts) - 1)
43270 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
43271 DL, CmpVT);
43272 CondCode = ISD::CondCode::SETEQ;
43275 // The setcc produces an i8 of 0/1, so extend that to the result width and
43276 // negate to get the final 0/-1 mask value.
43277 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
43278 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
43279 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
43280 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
43281 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
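// Worked example for the MOVMSK predicate reduction above (illustrative,
// hypothetical values): for a reduction over a v16i8 compare result, MOVMSK
// packs the 16 sign bits into a GPR, and then:
//   any_of -> setne (movmsk, 0),       e.g. 0x0040 != 0x0000
//   all_of -> seteq (movmsk, 0xFFFF),  i.e. (1 << 16) - 1
//   parity -> PARITY(movmsk)
// For any_of/all_of, the 0/1 setcc result is negated (0 - zext) just above
// to produce the final 0/-1 value.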
43284 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
43285 const X86Subtarget &Subtarget) {
43286 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
43287 return SDValue();
43289 EVT ExtractVT = Extract->getValueType(0);
43290 // Verify the type we're extracting is i32, as the output element type of
43291 // vpdpbusd is i32.
43292 if (ExtractVT != MVT::i32)
43293 return SDValue();
43295 EVT VT = Extract->getOperand(0).getValueType();
43296 if (!isPowerOf2_32(VT.getVectorNumElements()))
43297 return SDValue();
43299 // Match shuffle + add pyramid.
43300 ISD::NodeType BinOp;
43301 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
43303 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
43304 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
43305 // before adding into the accumulator.
43306 // TODO:
43307 // We also need to verify that the multiply has at least 2x the number of bits
43308 // of the input. We shouldn't match
43309 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
43310 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
43311 // Root = Root.getOperand(0);
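// Illustrative note (not from the original source): VPDPBUSD multiplies
// unsigned bytes from one operand by signed bytes from the other, e.g.
// 200 (u8) * -3 (s8) = -600 as a signed i16 intermediate, and sums groups of
// four such products into each i32 accumulator lane.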
43313 // If there was a match, we want Root to be a mul.
43314 if (!Root || Root.getOpcode() != ISD::MUL)
43315 return SDValue();
43317 // Check whether we have an extend and mul pattern
43318 SDValue LHS, RHS;
43319 if (!detectExtMul(DAG, Root, LHS, RHS))
43320 return SDValue();
43322 // Create the dot product instruction.
43323 SDLoc DL(Extract);
43324 unsigned StageBias;
43325 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
43327 // If the original vector was wider than 4 elements, sum over the results
43328 // in the DP vector.
43329 unsigned Stages = Log2_32(VT.getVectorNumElements());
43330 EVT DpVT = DP.getValueType();
43332 if (Stages > StageBias) {
43333 unsigned DpElems = DpVT.getVectorNumElements();
43335 for (unsigned i = Stages - StageBias; i > 0; --i) {
43336 SmallVector<int, 16> Mask(DpElems, -1);
43337 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
43338 Mask[j] = MaskEnd + j;
43340 SDValue Shuffle =
43341 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
43342 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
43346 // Return the lowest ExtractSizeInBits bits.
43347 EVT ResVT =
43348 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
43349 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
43350 DP = DAG.getBitcast(ResVT, DP);
43351 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
43352 Extract->getOperand(1));
43355 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
43356 const X86Subtarget &Subtarget) {
43357 // PSADBW is only supported on SSE2 and up.
43358 if (!Subtarget.hasSSE2())
43359 return SDValue();
43361 EVT ExtractVT = Extract->getValueType(0);
43362 // Verify the type we're extracting is either i32 or i64.
43363 // FIXME: Could support other types, but this is what we have coverage for.
43364 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
43365 return SDValue();
43367 EVT VT = Extract->getOperand(0).getValueType();
43368 if (!isPowerOf2_32(VT.getVectorNumElements()))
43369 return SDValue();
43371 // Match shuffle + add pyramid.
43372 ISD::NodeType BinOp;
43373 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
43375 // The operand is expected to be zero extended from i8
43376 // (verified in detectZextAbsDiff).
43377 // In order to convert to i64 and above, an additional any/zero/sign
43378 // extend is expected.
43379 // The zero extend from 32 bits has no mathematical effect on the result.
43380 // Also, the sign extend is effectively a zero extend here
43381 // (it only extends the sign bit, which is zero).
43382 // So it is correct to skip the sign/zero extend instruction.
43383 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
43384 Root.getOpcode() == ISD::ZERO_EXTEND ||
43385 Root.getOpcode() == ISD::ANY_EXTEND))
43386 Root = Root.getOperand(0);
43388 // If there was a match, we want Root to be a select that is the root of an
43389 // abs-diff pattern.
43390 if (!Root || Root.getOpcode() != ISD::ABS)
43391 return SDValue();
43393 // Check whether we have an abs-diff pattern feeding into the select.
43394 SDValue Zext0, Zext1;
43395 if (!detectZextAbsDiff(Root, Zext0, Zext1))
43396 return SDValue();
43398 // Create the SAD instruction.
43399 SDLoc DL(Extract);
43400 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
43402 // If the original vector was wider than 8 elements, sum over the results
43403 // in the SAD vector.
43404 unsigned Stages = Log2_32(VT.getVectorNumElements());
43405 EVT SadVT = SAD.getValueType();
43406 if (Stages > 3) {
43407 unsigned SadElems = SadVT.getVectorNumElements();
43409 for(unsigned i = Stages - 3; i > 0; --i) {
43410 SmallVector<int, 16> Mask(SadElems, -1);
43411 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
43412 Mask[j] = MaskEnd + j;
43414 SDValue Shuffle =
43415 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
43416 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
43420 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
43421 // Return the lowest ExtractSizeInBits bits.
43422 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
43423 SadVT.getSizeInBits() / ExtractSizeInBits);
43424 SAD = DAG.getBitcast(ResVT, SAD);
43425 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
43426 Extract->getOperand(1));
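// Worked example for the SAD reduction above (illustrative, hypothetical
// types): reducing 64 absolute byte differences gives Stages = log2(64) = 6,
// and createPSADBW already summed groups of 8 bytes into v8i64 lanes, so
// only 6 - 3 = 3 shuffle+add rounds over the SAD vector remain before
// extracting element 0.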
43429 // Attempt to peek through a target shuffle and extract the scalar from the
43430 // source.
43431 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
43432 TargetLowering::DAGCombinerInfo &DCI,
43433 const X86Subtarget &Subtarget) {
43434 if (DCI.isBeforeLegalizeOps())
43435 return SDValue();
43437 SDLoc dl(N);
43438 SDValue Src = N->getOperand(0);
43439 SDValue Idx = N->getOperand(1);
43441 EVT VT = N->getValueType(0);
43442 EVT SrcVT = Src.getValueType();
43443 EVT SrcSVT = SrcVT.getVectorElementType();
43444 unsigned SrcEltBits = SrcSVT.getSizeInBits();
43445 unsigned NumSrcElts = SrcVT.getVectorNumElements();
43447 // Don't attempt this for boolean mask vectors or unknown extraction indices.
43448 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
43449 return SDValue();
43451 const APInt &IdxC = N->getConstantOperandAPInt(1);
43452 if (IdxC.uge(NumSrcElts))
43453 return SDValue();
43455 SDValue SrcBC = peekThroughBitcasts(Src);
43457 // Handle extract(bitcast(broadcast(scalar_value))).
43458 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
43459 SDValue SrcOp = SrcBC.getOperand(0);
43460 EVT SrcOpVT = SrcOp.getValueType();
43461 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
43462 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
43463 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
43464 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
43465 // TODO support non-zero offsets.
43466 if (Offset == 0) {
43467 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
43468 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
43469 return SrcOp;
43474 // If we're extracting a single element from a broadcast load and there are
43475 // no other users, just create a single load.
43476 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
43477 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
43478 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
43479 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
43480 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
43481 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
43482 MemIntr->getBasePtr(),
43483 MemIntr->getPointerInfo(),
43484 MemIntr->getOriginalAlign(),
43485 MemIntr->getMemOperand()->getFlags());
43486 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
43487 return Load;
43491 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
43492 // TODO: Move to DAGCombine?
43493 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
43494 SrcBC.getValueType().isInteger() &&
43495 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
43496 SrcBC.getScalarValueSizeInBits() ==
43497 SrcBC.getOperand(0).getValueSizeInBits()) {
43498 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
43499 if (IdxC.ult(Scale)) {
43500 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
43501 SDValue Scl = SrcBC.getOperand(0);
43502 EVT SclVT = Scl.getValueType();
43503 if (Offset) {
43504 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
43505 DAG.getShiftAmountConstant(Offset, SclVT, dl));
43507 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
43508 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
43509 return Scl;
43513 // Handle extract(truncate(x)) for 0'th index.
43514 // TODO: Treat this as a faux shuffle?
43515 // TODO: When can we use this for general indices?
43516 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
43517 (SrcVT.getSizeInBits() % 128) == 0) {
43518 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
43519 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
43520 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
43521 Idx);
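// Example (illustrative, hypothetical types): extracting element 0 of
// (trunc v8i32 %x to v8i16) becomes: take the low 128 bits of %x (v4i32),
// bitcast them to v8i16, and extract element 0 of that, so the full-vector
// truncate never has to be materialized.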
43524 // We can only legally extract other elements from 128-bit vectors and in
43525 // certain circumstances, depending on SSE-level.
43526 // TODO: Investigate float/double extraction if it will be just stored.
43527 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
43528 unsigned Idx) {
43529 EVT VecSVT = VecVT.getScalarType();
43530 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
43531 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
43532 VecSVT == MVT::i64)) {
43533 unsigned EltSizeInBits = VecSVT.getSizeInBits();
43534 unsigned NumEltsPerLane = 128 / EltSizeInBits;
43535 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
43536 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
43537 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
43538 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
43539 Idx &= (NumEltsPerLane - 1);
43541 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
43542 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
43543 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
43544 DAG.getBitcast(VecVT, Vec),
43545 DAG.getIntPtrConstant(Idx, dl));
43547 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
43548 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
43549 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
43550 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
43551 DAG.getTargetConstant(Idx, dl, MVT::i8));
43553 return SDValue();
43556 // Resolve the target shuffle inputs and mask.
43557 SmallVector<int, 16> Mask;
43558 SmallVector<SDValue, 2> Ops;
43559 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
43560 return SDValue();
43562 // Shuffle inputs must be the same size as the result.
43563 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
43564 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
43566 return SDValue();
43568 // Attempt to narrow/widen the shuffle mask to the correct size.
43569 if (Mask.size() != NumSrcElts) {
43570 if ((NumSrcElts % Mask.size()) == 0) {
43571 SmallVector<int, 16> ScaledMask;
43572 int Scale = NumSrcElts / Mask.size();
43573 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
43574 Mask = std::move(ScaledMask);
43575 } else if ((Mask.size() % NumSrcElts) == 0) {
43576 // Simplify Mask based on demanded element.
43577 int ExtractIdx = (int)IdxC.getZExtValue();
43578 int Scale = Mask.size() / NumSrcElts;
43579 int Lo = Scale * ExtractIdx;
43580 int Hi = Scale * (ExtractIdx + 1);
43581 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
43582 if (i < Lo || Hi <= i)
43583 Mask[i] = SM_SentinelUndef;
43585 SmallVector<int, 16> WidenedMask;
43586 while (Mask.size() > NumSrcElts &&
43587 canWidenShuffleElements(Mask, WidenedMask))
43588 Mask = std::move(WidenedMask);
43592 // If narrowing/widening failed, see if we can extract+zero-extend.
43593 int ExtractIdx;
43594 EVT ExtractVT;
43595 if (Mask.size() == NumSrcElts) {
43596 ExtractIdx = Mask[IdxC.getZExtValue()];
43597 ExtractVT = SrcVT;
43598 } else {
43599 unsigned Scale = Mask.size() / NumSrcElts;
43600 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
43601 return SDValue();
43602 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
43603 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
43604 return SDValue();
43605 ExtractIdx = Mask[ScaledIdx];
43606 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
43607 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
43608 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
43609 "Failed to widen vector type");
43612 // If the shuffle source element is undef/zero then we can just accept it.
43613 if (ExtractIdx == SM_SentinelUndef)
43614 return DAG.getUNDEF(VT);
43616 if (ExtractIdx == SM_SentinelZero)
43617 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
43618 : DAG.getConstant(0, dl, VT);
43620 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
43621 ExtractIdx = ExtractIdx % Mask.size();
43622 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
43623 return DAG.getZExtOrTrunc(V, dl, VT);
43625 return SDValue();
43628 /// Extracting a scalar FP value from vector element 0 is free, so extract each
43629 /// operand first, then perform the math as a scalar op.
43630 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
43631 const X86Subtarget &Subtarget) {
43632 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
43633 SDValue Vec = ExtElt->getOperand(0);
43634 SDValue Index = ExtElt->getOperand(1);
43635 EVT VT = ExtElt->getValueType(0);
43636 EVT VecVT = Vec.getValueType();
43638 // TODO: If this is a unary/expensive/expand op, allow extraction from a
43639 // non-zero element because the shuffle+scalar op will be cheaper?
43640 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
43641 return SDValue();
43643 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
43644 // extract, the condition code), so deal with those as a special-case.
43645 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
43646 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
43647 if (OpVT != MVT::f32 && OpVT != MVT::f64)
43648 return SDValue();
43650 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
43651 SDLoc DL(ExtElt);
43652 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
43653 Vec.getOperand(0), Index);
43654 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
43655 Vec.getOperand(1), Index);
43656 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
43659 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
43660 VT != MVT::f64)
43661 return SDValue();
43663 // Vector FP selects don't fit the pattern of FP math ops (because the
43664 // condition has a different type and we have to change the opcode), so deal
43665 // with those here.
43666 // FIXME: This is restricted to pre type legalization by ensuring the setcc
43667 // has i1 elements. If we loosen this we need to convert vector bool to a
43668 // scalar bool.
43669 if (Vec.getOpcode() == ISD::VSELECT &&
43670 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
43671 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
43672 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
43673 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
43674 SDLoc DL(ExtElt);
43675 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
43676 Vec.getOperand(0).getValueType().getScalarType(),
43677 Vec.getOperand(0), Index);
43678 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
43679 Vec.getOperand(1), Index);
43680 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
43681 Vec.getOperand(2), Index);
43682 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
43685 // TODO: This switch could include FNEG and the x86-specific FP logic ops
43686 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
43687 // missed load folding and fma+fneg combining.
43688 switch (Vec.getOpcode()) {
43689 case ISD::FMA: // Begin 3 operands
43690 case ISD::FMAD:
43691 case ISD::FADD: // Begin 2 operands
43692 case ISD::FSUB:
43693 case ISD::FMUL:
43694 case ISD::FDIV:
43695 case ISD::FREM:
43696 case ISD::FCOPYSIGN:
43697 case ISD::FMINNUM:
43698 case ISD::FMAXNUM:
43699 case ISD::FMINNUM_IEEE:
43700 case ISD::FMAXNUM_IEEE:
43701 case ISD::FMAXIMUM:
43702 case ISD::FMINIMUM:
43703 case X86ISD::FMAX:
43704 case X86ISD::FMIN:
43705 case ISD::FABS: // Begin 1 operand
43706 case ISD::FSQRT:
43707 case ISD::FRINT:
43708 case ISD::FCEIL:
43709 case ISD::FTRUNC:
43710 case ISD::FNEARBYINT:
43711 case ISD::FROUND:
43712 case ISD::FFLOOR:
43713 case X86ISD::FRCP:
43714 case X86ISD::FRSQRT: {
43715 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
43716 SDLoc DL(ExtElt);
43717 SmallVector<SDValue, 4> ExtOps;
43718 for (SDValue Op : Vec->ops())
43719 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
43720 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
43722 default:
43723 return SDValue();
43725 llvm_unreachable("All opcodes should return within switch");
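// e.g. (illustrative) for the generic scalarization above:
//   extractelt (fadd v4f32 X, Y), 0
//     --> fadd f32 (extractelt X, 0), (extractelt Y, 0)
// so the scalar FADD can use the values already sitting in element 0 of the
// vector registers without a separate vector add.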
43728 /// Try to convert a vector reduction sequence composed of binops and shuffles
43729 /// into horizontal ops.
43730 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
43731 const X86Subtarget &Subtarget) {
43732 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
43734 // We need at least SSE2 to do anything here.
43735 if (!Subtarget.hasSSE2())
43736 return SDValue();
43738 ISD::NodeType Opc;
43739 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
43740 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
43741 if (!Rdx)
43742 return SDValue();
43744 SDValue Index = ExtElt->getOperand(1);
43745 assert(isNullConstant(Index) &&
43746 "Reduction doesn't end in an extract from index 0");
43748 EVT VT = ExtElt->getValueType(0);
43749 EVT VecVT = Rdx.getValueType();
43750 if (VecVT.getScalarType() != VT)
43751 return SDValue();
43753 SDLoc DL(ExtElt);
43754 unsigned NumElts = VecVT.getVectorNumElements();
43755 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
43757 // Widen a v4i8/v8i8 vector to v16i8, with undef (or zero) upper elements.
43758 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
43759 if (V.getValueType() == MVT::v4i8) {
43760 if (ZeroExtend && Subtarget.hasSSE41()) {
43761 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
43762 DAG.getConstant(0, DL, MVT::v4i32),
43763 DAG.getBitcast(MVT::i32, V),
43764 DAG.getIntPtrConstant(0, DL));
43765 return DAG.getBitcast(MVT::v16i8, V);
43767 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
43768 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
43769 : DAG.getUNDEF(MVT::v4i8));
43771 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
43772 DAG.getUNDEF(MVT::v8i8));
43775 // vXi8 mul reduction - promote to vXi16 mul reduction.
43776 if (Opc == ISD::MUL) {
43777 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
43778 return SDValue();
43779 if (VecVT.getSizeInBits() >= 128) {
43780 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
43781 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
43782 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
43783 Lo = DAG.getBitcast(WideVT, Lo);
43784 Hi = DAG.getBitcast(WideVT, Hi);
43785 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
43786 while (Rdx.getValueSizeInBits() > 128) {
43787 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
43788 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
43790 } else {
43791 Rdx = WidenToV16I8(Rdx, false);
43792 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
43793 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
43795 if (NumElts >= 8)
43796 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
43797 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
43798 {4, 5, 6, 7, -1, -1, -1, -1}));
43799 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
43800 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
43801 {2, 3, -1, -1, -1, -1, -1, -1}));
43802 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
43803 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
43804 {1, -1, -1, -1, -1, -1, -1, -1}));
43805 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
43806 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43809 // vXi8 add reduction - sub 128-bit vector.
43810 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
43811 Rdx = WidenToV16I8(Rdx, true);
43812 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
43813 DAG.getConstant(0, DL, MVT::v16i8));
43814 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
43815 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43818 // Must be a >=128-bit vector with pow2 elements.
43819 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
43820 return SDValue();
43822 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
43823 if (VT == MVT::i8) {
43824 while (Rdx.getValueSizeInBits() > 128) {
43825 SDValue Lo, Hi;
43826 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
43827 VecVT = Lo.getValueType();
43828 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
43830 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
43832 SDValue Hi = DAG.getVectorShuffle(
43833 MVT::v16i8, DL, Rdx, Rdx,
43834 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
43835 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
43836 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
43837 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
43838 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
43839 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
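// Illustrative note (not from the original source): PSADBW against an
// all-zero vector computes sum(|x[i] - 0|) per 64-bit lane, i.e. the plain
// sum of the unsigned bytes, so after the single hi/lo shuffle+add above a
// v16i8 add reduction collapses into one PSADBW.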
43842 // See if we can use vXi8 PSADBW add reduction for larger zext types.
43843 // If the source vector values are 0-255, then we can use PSADBW to
43844 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
43845 // TODO: See if it's worth avoiding vXi16/i32 truncations?
43846 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
43847 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
43848 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
43849 Subtarget.hasAVX512())) {
43850 if (Rdx.getValueType() == MVT::v8i16) {
43851 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
43852 DAG.getUNDEF(MVT::v8i16));
43853 } else {
43854 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
43855 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
43856 if (ByteVT.getSizeInBits() < 128)
43857 Rdx = WidenToV16I8(Rdx, true);
43860 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
43861 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43862 ArrayRef<SDValue> Ops) {
43863 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
43864 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
43865 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
43866 };
43867 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
43868 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
43870 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
43871 while (Rdx.getValueSizeInBits() > 128) {
43872 SDValue Lo, Hi;
43873 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
43874 VecVT = Lo.getValueType();
43875 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
43877 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
43879 if (NumElts > 8) {
43880 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
43881 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
43884 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
43885 Rdx = DAG.getBitcast(VecVT, Rdx);
43886 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
43889 // Only use (F)HADD opcodes if they aren't microcoded or when minimizing codesize.
43890 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
43891 return SDValue();
43893 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
43895 // 256-bit horizontal instructions operate on 128-bit chunks rather than
43896 // across the whole vector, so we need an extract + hop preliminary stage.
43897 // This is the only step where the operands of the hop are not the same value.
43898 // TODO: We could extend this to handle 512-bit or even longer vectors.
43899 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
43900 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
43901 unsigned NumElts = VecVT.getVectorNumElements();
43902 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
43903 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
43904 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
43905 VecVT = Rdx.getValueType();
43907 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
43908 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
43909 return SDValue();
43911 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
43912 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
43913 for (unsigned i = 0; i != ReductionSteps; ++i)
43914 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
43916 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
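// Worked example for the horizontal-add reduction above (illustrative): a
// v4i32 add reduction takes ReductionSteps = log2(4) = 2 PHADDD steps:
//   step 1: [a0+a1, a2+a3, a0+a1, a2+a3]
//   step 2: [a0+a1+a2+a3, ...]
// and the scalar result is then read from element 0.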
43919 /// Detect vector gather/scatter index generation and convert it from being a
43920 /// bunch of shuffles and extracts into a somewhat faster sequence.
43921 /// For i686, the best sequence is apparently storing the value and loading
43922 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
43923 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
43924 TargetLowering::DAGCombinerInfo &DCI,
43925 const X86Subtarget &Subtarget) {
43926 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
43927 return NewOp;
43929 SDValue InputVector = N->getOperand(0);
43930 SDValue EltIdx = N->getOperand(1);
43931 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
43933 EVT SrcVT = InputVector.getValueType();
43934 EVT VT = N->getValueType(0);
43935 SDLoc dl(InputVector);
43936 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
43937 unsigned NumSrcElts = SrcVT.getVectorNumElements();
43938 unsigned NumEltBits = VT.getScalarSizeInBits();
43939 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43941 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
43942 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
43944 // Integer Constant Folding.
43945 if (CIdx && VT.isInteger()) {
43946 APInt UndefVecElts;
43947 SmallVector<APInt, 16> EltBits;
43948 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
43949 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
43950 EltBits, true, false)) {
43951 uint64_t Idx = CIdx->getZExtValue();
43952 if (UndefVecElts[Idx])
43953 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
43954 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
43957 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
43958 // Improves lowering of bool masks on Rust, which splits them into a byte array.
43959 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
43960 SDValue Src = peekThroughBitcasts(InputVector);
43961 if (Src.getValueType().getScalarType() == MVT::i1 &&
43962 TLI.isTypeLegal(Src.getValueType())) {
43963 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
43964 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
43965 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
43966 return DAG.getBitcast(VT, Sub);
43971 if (IsPextr) {
43972 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
43973 DCI))
43974 return SDValue(N, 0);
43976 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
43977 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
43978 InputVector.getOpcode() == X86ISD::PINSRW) &&
43979 InputVector.getOperand(2) == EltIdx) {
43980 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
43981 "Vector type mismatch");
43982 SDValue Scl = InputVector.getOperand(1);
43983 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
43984 return DAG.getZExtOrTrunc(Scl, dl, VT);
43987 // TODO - Remove this once we can handle the implicit zero-extension of
43988 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
43989 // combineBasicSADPattern.
43990 return SDValue();
43993 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
43994 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
43995 InputVector.getOpcode() == ISD::BITCAST &&
43996 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
43997 isNullConstant(EltIdx) && InputVector.hasOneUse())
43998 return DAG.getBitcast(VT, InputVector);
44000 // Detect mmx to i32 conversion through a v2i32 elt extract.
44001 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
44002 InputVector.getOpcode() == ISD::BITCAST &&
44003 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
44004 isNullConstant(EltIdx) && InputVector.hasOneUse())
44005 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
44006 InputVector.getOperand(0));
44008 // Check whether this extract is the root of a sum of absolute differences
44009 // pattern. This has to be done here because we really want it to happen
44010 // pre-legalization.
44011 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
44012 return SAD;
44014 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
44015 return VPDPBUSD;
44017 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
44018 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
44019 return Cmp;
44021 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
44022 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
44023 return MinMax;
44025 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
44026 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
44027 return V;
44029 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
44030 return V;
44032 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
44033 // and then testing the relevant element.
44035 // Note that we only combine extracts on the *same* result number, i.e.
44036 // t0 = merge_values a0, a1, a2, a3
44037 // i1 = extract_vector_elt t0, Constant:i64<2>
44038 // i1 = extract_vector_elt t0, Constant:i64<3>
44039 // but not
44040 // i1 = extract_vector_elt t0:1, Constant:i64<2>
44041 // since the latter would need its own MOVMSK.
44042 if (SrcVT.getScalarType() == MVT::i1) {
44043 bool IsVar = !CIdx;
44044 SmallVector<SDNode *, 16> BoolExtracts;
44045 unsigned ResNo = InputVector.getResNo();
44046 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
44047 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44048 Use->getOperand(0).getResNo() == ResNo &&
44049 Use->getValueType(0) == MVT::i1) {
44050 BoolExtracts.push_back(Use);
44051 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
44052 return true;
44054 return false;
44056 // TODO: Can we drop the oneuse check for constant extracts?
44057 if (all_of(InputVector->uses(), IsBoolExtract) &&
44058 (IsVar || BoolExtracts.size() > 1)) {
44059 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
44060 if (SDValue BC =
44061 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
44062 for (SDNode *Use : BoolExtracts) {
44063 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
44064 // Mask = 1 << MaskIdx
44065 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
44066 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
44067 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
44068 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
44069 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
44070 DCI.CombineTo(Use, Res);
44072 return SDValue(N, 0);
44077 // If this extract is from a loaded vector value and will be used as an
44078 // integer, that requires a potentially expensive XMM -> GPR transfer.
44079 // Additionally, if we can convert to a scalar integer load, that will likely
44080 // be folded into a subsequent integer op.
44081 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
44082 // to a single-use of the loaded vector. For the reasons above, we
44083 // expect this to be profitable even if it creates an extra load.
44084 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
44085 return Use->getOpcode() == ISD::STORE ||
44086 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
44087 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
44089 auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
44090 if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
44091 SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
44092 !LikelyUsedAsVector && LoadVec->isSimple()) {
44093 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44094 SDValue NewPtr =
44095 TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
44096 unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
44097 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
44098 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
44099 SDValue Load =
44100 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
44101 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
44102 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
44103 return Load;
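// Example (illustrative, hypothetical IR): extractelt (load <4 x i32>, %p), 2
// becomes a plain i32 load from %p + 8 with a correspondingly reduced
// alignment, avoiding the XMM -> GPR transfer when the element only feeds
// integer code.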
44106 return SDValue();
44109 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
44110 // This is more or less the reverse of combineBitcastvxi1.
44111 static SDValue combineToExtendBoolVectorInReg(
44112 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
44113 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
44114 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
44115 Opcode != ISD::ANY_EXTEND)
44116 return SDValue();
44117 if (!DCI.isBeforeLegalizeOps())
44118 return SDValue();
44119 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
44120 return SDValue();
44122 EVT SVT = VT.getScalarType();
44123 EVT InSVT = N0.getValueType().getScalarType();
44124 unsigned EltSizeInBits = SVT.getSizeInBits();
44126 // Input type must be extending a bool vector (bit-casted from a scalar
44127 // integer) to legal integer types.
44128 if (!VT.isVector())
44129 return SDValue();
44130 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
44131 return SDValue();
44132 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
44133 return SDValue();
44135 SDValue N00 = N0.getOperand(0);
44136 EVT SclVT = N00.getValueType();
44137 if (!SclVT.isScalarInteger())
44138 return SDValue();
44140 SDValue Vec;
44141 SmallVector<int> ShuffleMask;
44142 unsigned NumElts = VT.getVectorNumElements();
44143 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
44145 // Broadcast the scalar integer to the vector elements.
44146 if (NumElts > EltSizeInBits) {
44147 // If the scalar integer is greater than the vector element size, then we
44148 // must split it down into sub-sections for broadcasting. For example:
44149 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
44150 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
44151 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
44152 unsigned Scale = NumElts / EltSizeInBits;
44153 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
44154 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
44155 Vec = DAG.getBitcast(VT, Vec);
44157 for (unsigned i = 0; i != Scale; ++i)
44158 ShuffleMask.append(EltSizeInBits, i);
44159 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
44160 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
44161 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
44162 // If we have register broadcast instructions, use the scalar size as the
44163 // element type for the shuffle. Then cast to the wider element type. The
44164 // widened bits won't be used, and this might allow the use of a broadcast
44165 // load.
44166 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
44167 unsigned Scale = EltSizeInBits / NumElts;
44168 EVT BroadcastVT =
44169 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
44170 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
44171 ShuffleMask.append(NumElts * Scale, 0);
44172 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
44173 Vec = DAG.getBitcast(VT, Vec);
44174 } else {
44175 // For a smaller scalar integer, we can simply any-extend it to the vector
44176 // element size (we don't care about the upper bits) and broadcast it to all
44177 // elements.
44178 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
44179 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
44180 ShuffleMask.append(NumElts, 0);
44181 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
44184 // Now, mask the relevant bit in each element.
44185 SmallVector<SDValue, 32> Bits;
44186 for (unsigned i = 0; i != NumElts; ++i) {
44187 int BitIdx = (i % EltSizeInBits);
44188 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
44189 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
44191 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
44192 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
44194 // Compare against the bitmask and extend the result.
44195 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
44196 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
44197 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
44199 // For SEXT, this is now done, otherwise shift the result down for
44200 // zero-extension.
44201 if (Opcode == ISD::SIGN_EXTEND)
44202 return Vec;
44203 return DAG.getNode(ISD::SRL, DL, VT, Vec,
44204 DAG.getConstant(EltSizeInBits - 1, DL, VT));
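// Worked example for the bool-vector extend above (illustrative, hypothetical
// types): zext (v8i1 bitcast (i8 %b)) to v8i16 broadcasts %b to every lane,
// ANDs lane i with (1 << i), compares equal against that same bit mask to get
// 0/-1 per lane, and for ZERO_EXTEND shifts right by 15 so each lane ends up
// as 0/1.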
44207 /// If a vector select has an operand that is -1 or 0, try to simplify the
44208 /// select to a bitwise logic operation.
44209 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
44210 static SDValue
44211 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
44212 TargetLowering::DAGCombinerInfo &DCI,
44213 const X86Subtarget &Subtarget) {
44214 SDValue Cond = N->getOperand(0);
44215 SDValue LHS = N->getOperand(1);
44216 SDValue RHS = N->getOperand(2);
44217 EVT VT = LHS.getValueType();
44218 EVT CondVT = Cond.getValueType();
44219 SDLoc DL(N);
44220 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44222 if (N->getOpcode() != ISD::VSELECT)
44223 return SDValue();
44225 assert(CondVT.isVector() && "Vector select expects a vector selector!");
44227 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
44228 // TODO: Can we assert that both operands are not zeros (because that should
44229 // get simplified at node creation time)?
44230 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
44231 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
44233 // If both inputs are 0/undef, create a complete zero vector.
44234 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
44235 if (TValIsAllZeros && FValIsAllZeros) {
44236 if (VT.isFloatingPoint())
44237 return DAG.getConstantFP(0.0, DL, VT);
44238 return DAG.getConstant(0, DL, VT);
44241 // To use the condition operand as a bitwise mask, it must have elements that
44242 // are the same size as the select elements. I.e., the condition operand must
44243 // have already been promoted from the IR select condition type <N x i1>.
44244 // Don't check if the types themselves are equal because that excludes
44245 // vector floating-point selects.
44246 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
44247 return SDValue();
44249 // Try to invert the condition if true value is not all 1s and false value is
44250 // not all 0s. Only do this if the condition has one use.
44251 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
44252 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
44253 // Check if the selector will be produced by CMPP*/PCMP*.
44254 Cond.getOpcode() == ISD::SETCC &&
44255 // Check if SETCC has already been promoted.
44256 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
44257 CondVT) {
44258 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
44260 if (TValIsAllZeros || FValIsAllOnes) {
44261 SDValue CC = Cond.getOperand(2);
44262 ISD::CondCode NewCC = ISD::getSetCCInverse(
44263 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
44264 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
44265 NewCC);
44266 std::swap(LHS, RHS);
44267 TValIsAllOnes = FValIsAllOnes;
44268 FValIsAllZeros = TValIsAllZeros;
44272 // Cond value must be 'sign splat' to be converted to a logical op.
44273 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
44274 return SDValue();
44276 // vselect Cond, 111..., 000... -> Cond
44277 if (TValIsAllOnes && FValIsAllZeros)
44278 return DAG.getBitcast(VT, Cond);
44280 if (!TLI.isTypeLegal(CondVT))
44281 return SDValue();
44283 // vselect Cond, 111..., X -> or Cond, X
44284 if (TValIsAllOnes) {
44285 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
44286 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
44287 return DAG.getBitcast(VT, Or);
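// e.g. (illustrative) vselect (setcc v4i32 X, Y, eq), all-ones, Z
//        --> or (pcmpeqd X, Y), Z
// since the 0/-1 compare result already acts as a full bitwise mask.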
44290 // vselect Cond, X, 000... -> and Cond, X
44291 if (FValIsAllZeros) {
44292 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
44293 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
44294 return DAG.getBitcast(VT, And);
44297 // vselect Cond, 000..., X -> andn Cond, X
44298 if (TValIsAllZeros) {
44299 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
44300 SDValue AndN;
44301 // The canonical form differs for i1 vectors: X86ISD::ANDNP is not used.
44302 if (CondVT.getScalarType() == MVT::i1)
44303 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
44304 CastRHS);
44305 else
44306 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
44307 return DAG.getBitcast(VT, AndN);
44310 return SDValue();
44313 /// If both arms of a vector select are concatenated vectors, split the select,
44314 /// and concatenate the result to eliminate a wide (256-bit) vector instruction:
44315 /// vselect Cond, (concat T0, T1), (concat F0, F1) -->
44316 /// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
44317 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
44318 const X86Subtarget &Subtarget) {
44319 unsigned Opcode = N->getOpcode();
44320 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
44321 return SDValue();
44323 // TODO: Split 512-bit vectors too?
44324 EVT VT = N->getValueType(0);
44325 if (!VT.is256BitVector())
44326 return SDValue();
44328 // TODO: Split as long as any 2 of the 3 operands are concatenated?
44329 SDValue Cond = N->getOperand(0);
44330 SDValue TVal = N->getOperand(1);
44331 SDValue FVal = N->getOperand(2);
44332 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
44333 !isFreeToSplitVector(TVal.getNode(), DAG) ||
44334 !isFreeToSplitVector(FVal.getNode(), DAG))
44335 return SDValue();
44337 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
44338 ArrayRef<SDValue> Ops) {
44339 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
44341 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
44342 makeBlend, /*CheckBWI*/ false);
44345 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
44346 SDValue Cond = N->getOperand(0);
44347 SDValue LHS = N->getOperand(1);
44348 SDValue RHS = N->getOperand(2);
44349 SDLoc DL(N);
44351 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
44352 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
44353 if (!TrueC || !FalseC)
44354 return SDValue();
44356 // Don't do this for crazy integer types.
44357 EVT VT = N->getValueType(0);
44358 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
44359 return SDValue();
44361 // We're going to use the condition bit in math or logic ops. We could allow
44362 // this with a wider condition value (post-legalization it becomes an i8),
44363 // but if nothing is creating selects that late, it doesn't matter.
44364 if (Cond.getValueType() != MVT::i1)
44365 return SDValue();
44367 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
44368 // 3, 5, or 9 with i32/i64, so those get transformed too.
44369 // TODO: For constants that overflow or do not differ by power-of-2 or small
44370 // multiplier, convert to 'and' + 'add'.
44371 const APInt &TrueVal = TrueC->getAPIntValue();
44372 const APInt &FalseVal = FalseC->getAPIntValue();
44374 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
44375 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
44376 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
44377 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44378 if (CC == ISD::SETEQ || CC == ISD::SETNE)
44379 return SDValue();
44382 bool OV;
44383 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
44384 if (OV)
44385 return SDValue();
44387 APInt AbsDiff = Diff.abs();
44388 if (AbsDiff.isPowerOf2() ||
44389 ((VT == MVT::i32 || VT == MVT::i64) &&
44390 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
44392 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
44393 // of the condition can usually be folded into a compare predicate, but even
44394 // without that, the sequence should be cheaper than a CMOV alternative.
44395 if (TrueVal.slt(FalseVal)) {
44396 Cond = DAG.getNOT(DL, Cond, MVT::i1);
44397 std::swap(TrueC, FalseC);
44400 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
44401 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
44403 // Multiply condition by the difference if non-one.
44404 if (!AbsDiff.isOne())
44405 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
44407 // Add the base if non-zero.
44408 if (!FalseC->isZero())
44409 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
44411 return R;
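// Worked example for the transform above (illustrative): select i1 %c, 7, 2
// has AbsDiff == 5, so it becomes (zext %c to i32) * 5 + 2, where the
// multiply by 5 and the add can both be folded into a single LEA.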
44414 return SDValue();
44417 /// If this is a *dynamic* select (non-constant condition) and we can match
44418 /// this node with one of the variable blend instructions, restructure the
44419 /// condition so that blends can use the high (sign) bit of each element.
44420 /// This function will also call SimplifyDemandedBits on already created
44421 /// BLENDV to perform additional simplifications.
44422 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
44423 TargetLowering::DAGCombinerInfo &DCI,
44424 const X86Subtarget &Subtarget) {
44425 SDValue Cond = N->getOperand(0);
44426 if ((N->getOpcode() != ISD::VSELECT &&
44427 N->getOpcode() != X86ISD::BLENDV) ||
44428 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
44429 return SDValue();
44431 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44432 unsigned BitWidth = Cond.getScalarValueSizeInBits();
44433 EVT VT = N->getValueType(0);
44435 // We can only handle the cases where VSELECT is directly legal on the
44436 // subtarget. We custom lower VSELECT nodes with constant conditions and
44437 // this makes it hard to see whether a dynamic VSELECT will correctly
44438 // lower, so we both check the operation's status and explicitly handle the
44439 // cases where a *dynamic* blend will fail even though a constant-condition
44440 // blend could be custom lowered.
44441 // FIXME: We should find a better way to handle this class of problems.
44442 // Potentially, we should combine constant-condition vselect nodes
44443 // pre-legalization into shuffles and not mark as many types as custom
44444 // lowered.
44445 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
44446 return SDValue();
44447 // FIXME: We don't support i16-element blends currently. We could and
44448 // should support them by making *all* the bits in the condition be set
44449 // rather than just the high bit and using an i8-element blend.
44450 if (VT.getVectorElementType() == MVT::i16)
44451 return SDValue();
44452 // Dynamic blending was only available from SSE4.1 onward.
44453 if (VT.is128BitVector() && !Subtarget.hasSSE41())
44454 return SDValue();
44455 // Byte blends are only available in AVX2.
44456 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
44457 return SDValue();
44458 // There are no 512-bit blend instructions that use sign bits.
44459 if (VT.is512BitVector())
44460 return SDValue();
44462 // Don't optimize before the condition has been transformed to a legal type
44463 // and don't ever optimize vector selects that map to AVX512 mask-registers.
44464 if (BitWidth < 8 || BitWidth > 64)
44465 return SDValue();
44467 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
44468 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
44469 UI != UE; ++UI)
44470 if ((UI->getOpcode() != ISD::VSELECT &&
44471 UI->getOpcode() != X86ISD::BLENDV) ||
44472 UI.getOperandNo() != 0)
44473 return false;
44475 return true;
44478 APInt DemandedBits(APInt::getSignMask(BitWidth));
44480 if (OnlyUsedAsSelectCond(Cond)) {
44481 KnownBits Known;
44482 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
44483 !DCI.isBeforeLegalizeOps());
44484 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
44485 return SDValue();
44487 // If we changed the computation somewhere in the DAG, this change will
44488 // affect all users of Cond. Update all the nodes so that we do not use
44489 // the generic VSELECT anymore. Otherwise, we may perform wrong
44490 // optimizations as we messed with the actual expectation for the vector
44491 // boolean values.
44492 for (SDNode *U : Cond->uses()) {
44493 if (U->getOpcode() == X86ISD::BLENDV)
44494 continue;
44496 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
44497 Cond, U->getOperand(1), U->getOperand(2));
44498 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
44499 DCI.AddToWorklist(U);
44501 DCI.CommitTargetLoweringOpt(TLO);
44502 return SDValue(N, 0);
44505 // Otherwise we can still at least try to simplify multiple use bits.
44506 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
44507 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
44508 N->getOperand(1), N->getOperand(2));
44510 return SDValue();
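// Illustrative sketch (standalone scalar model with an assumed name, not part
// of the combine above): variable blends such as (P)BLENDV select each result
// element purely from the sign bit of the corresponding condition element,
// which is why the combine only demands APInt::getSignMask(BitWidth) of Cond.
static int blendvElementSketch(int CondElt, int IfSet, int IfClear) {
  return CondElt < 0 ? IfSet : IfClear; // sign bit set -> first value operand
}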
44513 // Try to match:
44514 // (or (and (M, (sub 0, X)), (pandn M, X)))
44515 // which is a special case of:
44516 // (select M, (sub 0, X), X)
44517 // Per:
44518 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
44519 // We know that, if fNegate is 0 or 1:
44520 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
44522 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
44523 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
44524 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
44525 // This lets us transform our vselect to:
44526 // (add (xor X, M), (and M, 1))
44527 // And further to:
44528 // (sub (xor X, M), M)
44529 static SDValue combineLogicBlendIntoConditionalNegate(
44530 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
44531 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
44532 EVT MaskVT = Mask.getValueType();
44533 assert(MaskVT.isInteger() &&
44534 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
44535 "Mask must be zero/all-bits");
44537 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
44538 return SDValue();
44539 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
44540 return SDValue();
44542 auto IsNegV = [](SDNode *N, SDValue V) {
44543 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
44544 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
44547 SDValue V;
44548 if (IsNegV(Y.getNode(), X))
44549 V = X;
44550 else if (IsNegV(X.getNode(), Y))
44551 V = Y;
44552 else
44553 return SDValue();
44555 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
44556 SDValue SubOp2 = Mask;
44558 // If the negate was on the false side of the select, then
44559 // the operands of the SUB need to be swapped. PR 27251.
44560 // This is because the pattern being matched above is
44561 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
44562 // but if the pattern matched was
44563 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
44564 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
44565 // pattern also needs to be a negation of the replacement pattern above.
44566 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
44567 // sub accomplishes the negation of the replacement pattern.
44568 if (V == Y)
44569 std::swap(SubOp1, SubOp2);
44571 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
44572 return DAG.getBitcast(VT, Res);
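// Illustrative sketch (standalone, assumed name): the conditional-negate
// identity used by the combine above, per scalar lane, for a mask M that is
// either 0 or all-ones (-1), assuming two's complement (INT_MIN aside).
static int conditionalNegateSketch(int X, int M) {
  // M == 0  : (X ^ 0) - 0     == X
  // M == -1 : (X ^ -1) + 1    == ~X + 1 == -X
  return (X ^ M) - M; // matches the emitted sub(xor(X, M), M)
}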
44575 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG,
44576 const X86Subtarget &Subtarget) {
44577 if (!Subtarget.hasAVX512())
44578 return SDValue();
44579 if (N->getOpcode() != ISD::VSELECT)
44580 return SDValue();
44582 SDLoc DL(N);
44583 SDValue Cond = N->getOperand(0);
44584 SDValue LHS = N->getOperand(1);
44585 SDValue RHS = N->getOperand(2);
44587 if (canCombineAsMaskOperation(LHS, Subtarget))
44588 return SDValue();
44590 if (!canCombineAsMaskOperation(RHS, Subtarget))
44591 return SDValue();
44593 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
44594 return SDValue();
44596 // Commute LHS and RHS to create opportunity to select mask instruction.
44597 // (vselect M, L, R) -> (vselect ~M, R, L)
44598 ISD::CondCode NewCC =
44599 ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
44600 Cond.getOperand(0).getValueType());
44601 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0),
44602 Cond.getOperand(1), NewCC);
44603 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
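// Illustrative sketch (standalone, assumed name): the commutation above is
// just the scalar identity select(C, L, R) == select(!C, R, L); inverting the
// SETCC condition code plays the role of !C so the mask-foldable operand ends
// up in the LHS slot.
static int commuteSelectSketch(bool C, int L, int R) {
  return !C ? R : L; // same value as (C ? L : R)
}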
44606 /// Do target-specific dag combines on SELECT and VSELECT nodes.
44607 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
44608 TargetLowering::DAGCombinerInfo &DCI,
44609 const X86Subtarget &Subtarget) {
44610 SDLoc DL(N);
44611 SDValue Cond = N->getOperand(0);
44612 SDValue LHS = N->getOperand(1);
44613 SDValue RHS = N->getOperand(2);
44615 // Try simplification again because we use this function to optimize
44616 // BLENDV nodes that are not handled by the generic combiner.
44617 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
44618 return V;
44620 // When AVX512 is available, the LHS operand of a select instruction can be
44621 // folded into a masked instruction, while the RHS operand can't. Commute the
44622 // LHS and RHS of the select instruction to create the opportunity for
44623 // folding.
44624 if (SDValue V = commuteSelect(N, DAG, Subtarget))
44625 return V;
44627 EVT VT = LHS.getValueType();
44628 EVT CondVT = Cond.getValueType();
44629 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44630 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
44632 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
44633 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
44634 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
44635 if (CondVT.isVector() && CondVT.isInteger() &&
44636 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
44637 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
44638 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
44639 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
44640 DL, DAG, Subtarget))
44641 return V;
44643 // Convert vselects with constant condition into shuffles.
44644 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
44645 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
44646 SmallVector<int, 64> Mask;
44647 if (createShuffleMaskFromVSELECT(Mask, Cond,
44648 N->getOpcode() == X86ISD::BLENDV))
44649 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
44652 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
44653 // by forcing the unselected elements to zero.
44654 // TODO: Can we handle more shuffles with this?
44655 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
44656 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
44657 LHS.hasOneUse() && RHS.hasOneUse()) {
44658 MVT SimpleVT = VT.getSimpleVT();
44659 SmallVector<SDValue, 1> LHSOps, RHSOps;
44660 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
44661 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
44662 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
44663 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
44664 int NumElts = VT.getVectorNumElements();
44665 for (int i = 0; i != NumElts; ++i) {
44666 // getConstVector sets negative shuffle mask values as undef, so ensure
44667 // we hardcode SM_SentinelZero values to zero (0x80).
44668 if (CondMask[i] < NumElts) {
44669 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
44670 RHSMask[i] = 0x80;
44671 } else {
44672 LHSMask[i] = 0x80;
44673 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
44676 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
44677 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
44678 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
44679 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
44680 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
44684 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
44685 // instructions match the semantics of the common C idiom x<y?x:y but not
44686 // x<=y?x:y, because of how they handle negative zero (which can be
44687 // ignored in unsafe-math mode).
44688 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
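  // Illustrative sketch (local model with an assumed name, not used by the
  // transform itself): SSE MINSS computes "a < b ? a : b", returning the second
  // operand on ties and NaNs, so "x <= y ? x : y" only matches it when signed
  // zeros don't matter, e.g. sseMinModel(+0.0, -0.0) == -0.0 while
  // (+0.0 <= -0.0 ? +0.0 : -0.0) == +0.0; the NoSignedZerosFPMath checks below
  // guard exactly that case.
  auto sseMinModel = [](double A, double B) { return A < B ? A : B; };
  (void)sseMinModel; // model only; the real transform emits X86ISD::FMIN/FMAX.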
44689 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
44690 VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) &&
44691 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
44692 (Subtarget.hasSSE2() ||
44693 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
44694 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44696 unsigned Opcode = 0;
44697 // Check for x CC y ? x : y.
44698 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
44699 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
44700 switch (CC) {
44701 default: break;
44702 case ISD::SETULT:
44703 // Converting this to a min would handle NaNs incorrectly, and swapping
44704 // the operands would cause it to handle comparisons between positive
44705 // and negative zero incorrectly.
44706 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
44707 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44708 !(DAG.isKnownNeverZeroFloat(LHS) ||
44709 DAG.isKnownNeverZeroFloat(RHS)))
44710 break;
44711 std::swap(LHS, RHS);
44713 Opcode = X86ISD::FMIN;
44714 break;
44715 case ISD::SETOLE:
44716 // Converting this to a min would handle comparisons between positive
44717 // and negative zero incorrectly.
44718 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44719 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
44720 break;
44721 Opcode = X86ISD::FMIN;
44722 break;
44723 case ISD::SETULE:
44724 // Converting this to a min would handle both negative zeros and NaNs
44725 // incorrectly, but we can swap the operands to fix both.
44726 std::swap(LHS, RHS);
44727 [[fallthrough]];
44728 case ISD::SETOLT:
44729 case ISD::SETLT:
44730 case ISD::SETLE:
44731 Opcode = X86ISD::FMIN;
44732 break;
44734 case ISD::SETOGE:
44735 // Converting this to a max would handle comparisons between positive
44736 // and negative zero incorrectly.
44737 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44738 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
44739 break;
44740 Opcode = X86ISD::FMAX;
44741 break;
44742 case ISD::SETUGT:
44743 // Converting this to a max would handle NaNs incorrectly, and swapping
44744 // the operands would cause it to handle comparisons between positive
44745 // and negative zero incorrectly.
44746 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
44747 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44748 !(DAG.isKnownNeverZeroFloat(LHS) ||
44749 DAG.isKnownNeverZeroFloat(RHS)))
44750 break;
44751 std::swap(LHS, RHS);
44753 Opcode = X86ISD::FMAX;
44754 break;
44755 case ISD::SETUGE:
44756 // Converting this to a max would handle both negative zeros and NaNs
44757 // incorrectly, but we can swap the operands to fix both.
44758 std::swap(LHS, RHS);
44759 [[fallthrough]];
44760 case ISD::SETOGT:
44761 case ISD::SETGT:
44762 case ISD::SETGE:
44763 Opcode = X86ISD::FMAX;
44764 break;
44766 // Check for x CC y ? y : x -- a min/max with reversed arms.
44767 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
44768 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
44769 switch (CC) {
44770 default: break;
44771 case ISD::SETOGE:
44772 // Converting this to a min would handle comparisons between positive
44773 // and negative zero incorrectly, and swapping the operands would
44774 // cause it to handle NaNs incorrectly.
44775 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44776 !(DAG.isKnownNeverZeroFloat(LHS) ||
44777 DAG.isKnownNeverZeroFloat(RHS))) {
44778 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44779 break;
44780 std::swap(LHS, RHS);
44782 Opcode = X86ISD::FMIN;
44783 break;
44784 case ISD::SETUGT:
44785 // Converting this to a min would handle NaNs incorrectly.
44786 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44787 break;
44788 Opcode = X86ISD::FMIN;
44789 break;
44790 case ISD::SETUGE:
44791 // Converting this to a min would handle both negative zeros and NaNs
44792 // incorrectly, but we can swap the operands to fix both.
44793 std::swap(LHS, RHS);
44794 [[fallthrough]];
44795 case ISD::SETOGT:
44796 case ISD::SETGT:
44797 case ISD::SETGE:
44798 Opcode = X86ISD::FMIN;
44799 break;
44801 case ISD::SETULT:
44802 // Converting this to a max would handle NaNs incorrectly.
44803 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44804 break;
44805 Opcode = X86ISD::FMAX;
44806 break;
44807 case ISD::SETOLE:
44808 // Converting this to a max would handle comparisons between positive
44809 // and negative zero incorrectly, and swapping the operands would
44810 // cause it to handle NaNs incorrectly.
44811 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
44812 !DAG.isKnownNeverZeroFloat(LHS) &&
44813 !DAG.isKnownNeverZeroFloat(RHS)) {
44814 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
44815 break;
44816 std::swap(LHS, RHS);
44818 Opcode = X86ISD::FMAX;
44819 break;
44820 case ISD::SETULE:
44821 // Converting this to a max would handle both negative zeros and NaNs
44822 // incorrectly, but we can swap the operands to fix both.
44823 std::swap(LHS, RHS);
44824 [[fallthrough]];
44825 case ISD::SETOLT:
44826 case ISD::SETLT:
44827 case ISD::SETLE:
44828 Opcode = X86ISD::FMAX;
44829 break;
44833 if (Opcode)
44834 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
44837 // Some mask scalar intrinsics rely on checking if only one bit is set
44838 // and implement it in C code like this:
44839 // A[0] = (U & 1) ? A[0] : W[0];
44840 // This creates some redundant instructions that break pattern matching.
44841 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
44842 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
44843 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
44844 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44845 SDValue AndNode = Cond.getOperand(0);
44846 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
44847 isNullConstant(Cond.getOperand(1)) &&
44848 isOneConstant(AndNode.getOperand(1))) {
44849 // LHS and RHS swapped due to
44850 // setcc outputting 1 when AND resulted in 0 and vice versa.
44851 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
44852 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
44856 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
44857 // lowering on KNL. In this case we convert it to
44858 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
44859 // The same applies to all vectors of i8 and i16 elements without BWI.
44860 // Make sure we extend these even before type legalization gets a chance to
44861 // split wide vectors.
44862 // Since SKX these selects have a proper lowering.
44863 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
44864 CondVT.getVectorElementType() == MVT::i1 &&
44865 (VT.getVectorElementType() == MVT::i8 ||
44866 VT.getVectorElementType() == MVT::i16)) {
44867 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
44868 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
44871 // AVX512 - Extend select with zero to merge with target shuffle.
44872 // select(mask, extract_subvector(shuffle(x)), zero) -->
44873 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
44874 // TODO - support non target shuffles as well.
44875 if (Subtarget.hasAVX512() && CondVT.isVector() &&
44876 CondVT.getVectorElementType() == MVT::i1) {
44877 auto SelectableOp = [&TLI](SDValue Op) {
44878 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44879 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
44880 isNullConstant(Op.getOperand(1)) &&
44881 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
44882 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
44885 bool SelectableLHS = SelectableOp(LHS);
44886 bool SelectableRHS = SelectableOp(RHS);
44887 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
44888 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
44890 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
44891 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
44892 : RHS.getOperand(0).getValueType();
44893 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
44894 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
44895 VT.getSizeInBits());
44896 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
44897 VT.getSizeInBits());
44898 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
44899 DAG.getUNDEF(SrcCondVT), Cond,
44900 DAG.getIntPtrConstant(0, DL));
44901 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
44902 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
44906 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
44907 return V;
44909 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
44910 Cond.hasOneUse()) {
44911 EVT CondVT = Cond.getValueType();
44912 SDValue Cond0 = Cond.getOperand(0);
44913 SDValue Cond1 = Cond.getOperand(1);
44914 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
44916 // Canonicalize min/max:
44917 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
44918 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
44919 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
44920 // the need for an extra compare against zero. e.g.
44921 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
44922 // subl %esi, %edi
44923 // testl %edi, %edi
44924 // movl $0, %eax
44925 // cmovgl %edi, %eax
44926 // =>
44927 // xorl %eax, %eax
44928 // subl %esi, %edi
44929 // cmovsl %eax, %edi
44931 // We can also canonicalize
44932 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
44933 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
44934 // This allows the use of a test instruction for the compare.
44935 if (LHS == Cond0 && RHS == Cond1) {
44936 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
44937 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
44938 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
44939 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
44940 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
44942 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
44943 ISD::CondCode NewCC = ISD::SETUGE;
44944 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
44945 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
44949 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
44950 // fold eq + gt/lt nested selects into ge/le selects
44951 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
44952 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
44953 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
44954 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
44955 // .. etc ..
44956 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
44957 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
44958 SDValue InnerSetCC = RHS.getOperand(0);
44959 ISD::CondCode InnerCC =
44960 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
44961 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
44962 Cond0 == InnerSetCC.getOperand(0) &&
44963 Cond1 == InnerSetCC.getOperand(1)) {
44964 ISD::CondCode NewCC;
44965 switch (CC == ISD::SETEQ ? InnerCC : CC) {
44966 case ISD::SETGT: NewCC = ISD::SETGE; break;
44967 case ISD::SETLT: NewCC = ISD::SETLE; break;
44968 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
44969 case ISD::SETULT: NewCC = ISD::SETULE; break;
44970 default: NewCC = ISD::SETCC_INVALID; break;
44972 if (NewCC != ISD::SETCC_INVALID) {
44973 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
44974 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
44980 // Check if the first operand is all zeros and Cond type is vXi1.
44981 // If this is an AVX512 target, we can improve the use of zero masking by
44982 // swapping the operands and inverting the condition.
44983 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
44984 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
44985 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
44986 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
44987 // Invert the cond to not(cond) : xor(op,allones)=not(op)
44988 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
44989 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
44990 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
44993 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
44994 // get split by legalization.
44995 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
44996 CondVT.getVectorElementType() == MVT::i1 &&
44997 TLI.isTypeLegal(VT.getScalarType())) {
44998 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
44999 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
45000 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
45001 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
45002 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
45006 // Early exit check
45007 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
45008 return SDValue();
45010 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
45011 return V;
45013 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
45014 return V;
45016 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
45017 return V;
45019 // select(~Cond, X, Y) -> select(Cond, Y, X)
45020 if (CondVT.getScalarType() != MVT::i1) {
45021 if (SDValue CondNot = IsNOT(Cond, DAG))
45022 return DAG.getNode(N->getOpcode(), DL, VT,
45023 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
45025 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
45026 // signbit.
45027 if (Cond.getOpcode() == X86ISD::PCMPGT &&
45028 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
45029 Cond.hasOneUse()) {
45030 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
45031 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
45032 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
45036 // Try to optimize vXi1 selects if both operands are either all constants or
45037 // bitcasts from scalar integer type. In that case we can convert the operands
45038 // to integer and use an integer select which will be converted to a CMOV.
45039 // We need to take a little bit of care to avoid creating an i64 type after
45040 // type legalization.
45041 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
45042 VT.getVectorElementType() == MVT::i1 &&
45043 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
45044 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
45045 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
45046 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
45047 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
45049 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
45050 LHS.getOperand(0).getValueType() == IntVT)) &&
45051 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
45052 RHS.getOperand(0).getValueType() == IntVT))) {
45053 if (LHSIsConst)
45054 LHS = combinevXi1ConstantToInteger(LHS, DAG);
45055 else
45056 LHS = LHS.getOperand(0);
45058 if (RHSIsConst)
45059 RHS = combinevXi1ConstantToInteger(RHS, DAG);
45060 else
45061 RHS = RHS.getOperand(0);
45063 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
45064 return DAG.getBitcast(VT, Select);
45069 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
45070 // single bits, then invert the predicate and swap the select operands.
45071 // This can be lowered using a vector shift bit-hack rather than a mask and compare.
45072 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
45073 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
45074 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
45075 Cond.getOperand(0).getOpcode() == ISD::AND &&
45076 isNullOrNullSplat(Cond.getOperand(1)) &&
45077 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
45078 Cond.getOperand(0).getValueType() == VT) {
45079 // The 'and' mask must be composed of power-of-2 constants.
45080 SDValue And = Cond.getOperand(0);
45081 auto *C = isConstOrConstSplat(And.getOperand(1));
45082 if (C && C->getAPIntValue().isPowerOf2()) {
45083 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
45084 SDValue NotCond =
45085 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
45086 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
45089 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
45090 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
45091 // 16-bit lacks a proper blendv.
45092 unsigned EltBitWidth = VT.getScalarSizeInBits();
45093 bool CanShiftBlend =
45094 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
45095 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
45096 (Subtarget.hasXOP()));
45097 if (CanShiftBlend &&
45098 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
45099 return C->getAPIntValue().isPowerOf2();
45100 })) {
45101 // Create a left-shift constant to get the mask bits over to the sign-bit.
45102 SDValue Mask = And.getOperand(1);
45103 SmallVector<int, 32> ShlVals;
45104 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
45105 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
45106 ShlVals.push_back(EltBitWidth - 1 -
45107 MaskVal->getAPIntValue().exactLogBase2());
45109 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
45110 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
45111 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
45112 SDValue NewCond =
45113 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
45114 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
45118 return SDValue();
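// Illustrative sketch (standalone, assumed name): the vector shift bit-hack
// used at the end of combineSelect above, shown per scalar lane. With C a
// single power-of-2 bit, "(X & C) == 0 ? L : R" can be rewritten by shifting
// that bit into the sign position and selecting on "< 0" (32-bit lanes and
// two's complement assumed).
static int maskBitSelectSketch(unsigned X, unsigned C, int L, int R) {
  unsigned Log2C = 0;
  while ((C >> Log2C) != 1u) // C is assumed to be a power of two.
    ++Log2C;
  int Shifted = (int)(X << (31u - Log2C)); // mask bit becomes the sign bit
  return Shifted < 0 ? R : L;              // bit set -> original "!= 0" arm
}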
45121 /// Combine:
45122 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
45123 /// to:
45124 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
45125 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
45126 /// Note that this is only legal for some op/cc combinations.
45127 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
45128 SelectionDAG &DAG,
45129 const X86Subtarget &Subtarget) {
45130 // This combine only operates on CMP-like nodes.
45131 if (!(Cmp.getOpcode() == X86ISD::CMP ||
45132 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
45133 return SDValue();
45135 // Can't replace the cmp if it has more uses than the one we're looking at.
45136 // FIXME: We would like to be able to handle this, but would need to make sure
45137 // all uses were updated.
45138 if (!Cmp.hasOneUse())
45139 return SDValue();
45141 // This only applies to variations of the common case:
45142 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
45143 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
45144 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
45145 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
45146 // Using the proper condcodes (see below), overflow is checked for.
45148 // FIXME: We can generalize both constraints:
45149 // - XOR/OR/AND (if they were made to survive AtomicExpand)
45150 // - LHS != 1
45151 // if the result is compared.
45153 SDValue CmpLHS = Cmp.getOperand(0);
45154 SDValue CmpRHS = Cmp.getOperand(1);
45155 EVT CmpVT = CmpLHS.getValueType();
45157 if (!CmpLHS.hasOneUse())
45158 return SDValue();
45160 unsigned Opc = CmpLHS.getOpcode();
45161 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
45162 return SDValue();
45164 SDValue OpRHS = CmpLHS.getOperand(2);
45165 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
45166 if (!OpRHSC)
45167 return SDValue();
45169 APInt Addend = OpRHSC->getAPIntValue();
45170 if (Opc == ISD::ATOMIC_LOAD_SUB)
45171 Addend = -Addend;
45173 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
45174 if (!CmpRHSC)
45175 return SDValue();
45177 APInt Comparison = CmpRHSC->getAPIntValue();
45178 APInt NegAddend = -Addend;
45180 // See if we can adjust the CC to make the comparison match the negated
45181 // addend.
45182 if (Comparison != NegAddend) {
45183 APInt IncComparison = Comparison + 1;
45184 if (IncComparison == NegAddend) {
45185 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
45186 Comparison = IncComparison;
45187 CC = X86::COND_AE;
45188 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
45189 Comparison = IncComparison;
45190 CC = X86::COND_L;
45193 APInt DecComparison = Comparison - 1;
45194 if (DecComparison == NegAddend) {
45195 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
45196 Comparison = DecComparison;
45197 CC = X86::COND_A;
45198 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
45199 Comparison = DecComparison;
45200 CC = X86::COND_LE;
45205 // If the addend is the negation of the comparison value, then we can do
45206 // a full comparison by emitting the atomic arithmetic as a locked sub.
45207 if (Comparison == NegAddend) {
45208 // The CC is fine, but we need to rewrite the LHS of the comparison as an
45209 // atomic sub.
45210 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
45211 auto AtomicSub = DAG.getAtomic(
45212 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
45213 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
45214 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
45215 AN->getMemOperand());
45216 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
45217 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
45218 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
45219 return LockOp;
45222 // We can handle comparisons with zero in a number of cases by manipulating
45223 // the CC used.
45224 if (!Comparison.isZero())
45225 return SDValue();
45227 if (CC == X86::COND_S && Addend == 1)
45228 CC = X86::COND_LE;
45229 else if (CC == X86::COND_NS && Addend == 1)
45230 CC = X86::COND_G;
45231 else if (CC == X86::COND_G && Addend == -1)
45232 CC = X86::COND_GE;
45233 else if (CC == X86::COND_LE && Addend == -1)
45234 CC = X86::COND_L;
45235 else
45236 return SDValue();
45238 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
45239 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
45240 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
45241 return LockOp;
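// Illustrative sketch (standalone, assumed name): the Addend == 1,
// compare-with-zero equivalence reused above. For any X where X + 1 does not
// overflow (the real code relies on the x86 flag definitions to cover the
// boundary), "x < 0" holds exactly when "x + 1 <= 0" holds, so COND_S on the
// CMP can become COND_LE on the LOCKed add itself.
static bool atomicAddFlagSketch(long long X) {
  return (X < 0) == (X + 1 <= 0);
}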
45244 // Check whether a boolean test is testing a boolean value generated by
45245 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
45246 // code.
45248 // Simplify the following patterns:
45249 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
45250 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
45251 // to (Op EFLAGS Cond)
45253 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
45254 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
45255 // to (Op EFLAGS !Cond)
45257 // where Op could be BRCOND or CMOV.
45259 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
45260 // This combine only operates on CMP-like nodes.
45261 if (!(Cmp.getOpcode() == X86ISD::CMP ||
45262 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
45263 return SDValue();
45265 // Quit if not used as a boolean value.
45266 if (CC != X86::COND_E && CC != X86::COND_NE)
45267 return SDValue();
45269 // Check CMP operands. One of them should be 0 or 1 and the other should be
45270 // a SetCC or a value extended from it.
45271 SDValue Op1 = Cmp.getOperand(0);
45272 SDValue Op2 = Cmp.getOperand(1);
45274 SDValue SetCC;
45275 const ConstantSDNode* C = nullptr;
45276 bool needOppositeCond = (CC == X86::COND_E);
45277 bool checkAgainstTrue = false; // Is it a comparison against 1?
45279 if ((C = dyn_cast<ConstantSDNode>(Op1)))
45280 SetCC = Op2;
45281 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
45282 SetCC = Op1;
45283 else // Quit if neither operand is a constant.
45284 return SDValue();
45286 if (C->getZExtValue() == 1) {
45287 needOppositeCond = !needOppositeCond;
45288 checkAgainstTrue = true;
45289 } else if (C->getZExtValue() != 0)
45290 // Quit if the constant is neither 0 nor 1.
45291 return SDValue();
45293 bool truncatedToBoolWithAnd = false;
45294 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
45295 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
45296 SetCC.getOpcode() == ISD::TRUNCATE ||
45297 SetCC.getOpcode() == ISD::AND) {
45298 if (SetCC.getOpcode() == ISD::AND) {
45299 int OpIdx = -1;
45300 if (isOneConstant(SetCC.getOperand(0)))
45301 OpIdx = 1;
45302 if (isOneConstant(SetCC.getOperand(1)))
45303 OpIdx = 0;
45304 if (OpIdx < 0)
45305 break;
45306 SetCC = SetCC.getOperand(OpIdx);
45307 truncatedToBoolWithAnd = true;
45308 } else
45309 SetCC = SetCC.getOperand(0);
45312 switch (SetCC.getOpcode()) {
45313 case X86ISD::SETCC_CARRY:
45314 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
45315 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
45316 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
45317 // truncated to i1 using 'and'.
45318 if (checkAgainstTrue && !truncatedToBoolWithAnd)
45319 break;
45320 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
45321 "Invalid use of SETCC_CARRY!");
45322 [[fallthrough]];
45323 case X86ISD::SETCC:
45324 // Set the condition code or opposite one if necessary.
45325 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
45326 if (needOppositeCond)
45327 CC = X86::GetOppositeBranchCondition(CC);
45328 return SetCC.getOperand(1);
45329 case X86ISD::CMOV: {
45330 // Check whether the false/true values are canonical, i.e. 0 or 1.
45331 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
45332 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
45333 // Quit if true value is not a constant.
45334 if (!TVal)
45335 return SDValue();
45336 // Quit if false value is not a constant.
45337 if (!FVal) {
45338 SDValue Op = SetCC.getOperand(0);
45339 // Skip 'zext' or 'trunc' node.
45340 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
45341 Op.getOpcode() == ISD::TRUNCATE)
45342 Op = Op.getOperand(0);
45343 // A special case for rdrand/rdseed, where 0 is set if false cond is
45344 // found.
45345 if ((Op.getOpcode() != X86ISD::RDRAND &&
45346 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
45347 return SDValue();
45349 // Quit if false value is not the constant 0 or 1.
45350 bool FValIsFalse = true;
45351 if (FVal && FVal->getZExtValue() != 0) {
45352 if (FVal->getZExtValue() != 1)
45353 return SDValue();
45354 // If FVal is 1, opposite cond is needed.
45355 needOppositeCond = !needOppositeCond;
45356 FValIsFalse = false;
45358 // Quit if TVal is not the constant opposite of FVal.
45359 if (FValIsFalse && TVal->getZExtValue() != 1)
45360 return SDValue();
45361 if (!FValIsFalse && TVal->getZExtValue() != 0)
45362 return SDValue();
45363 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
45364 if (needOppositeCond)
45365 CC = X86::GetOppositeBranchCondition(CC);
45366 return SetCC.getOperand(3);
45370 return SDValue();
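// Illustrative sketch (standalone, assumed name): the 0/1 boolean identities
// behind the fold above. For a SETCC result B known to be 0 or 1,
// (CMP B, 1) with COND_E is just B and (CMP B, 0) with COND_E is !B, so the
// outer BRCOND/CMOV/SETCC can consume the original EFLAGS with the same or
// the opposite condition code.
static bool boolTestSketch(bool B) {
  bool CmpOneEq = ((B ? 1 : 0) == 1);  // equivalent to B
  bool CmpZeroEq = ((B ? 1 : 0) == 0); // equivalent to !B
  return CmpOneEq == B && CmpZeroEq == !B;
}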
45373 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
45374 /// Match:
45375 /// (X86or (X86setcc) (X86setcc))
45376 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
45377 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
45378 X86::CondCode &CC1, SDValue &Flags,
45379 bool &isAnd) {
45380 if (Cond->getOpcode() == X86ISD::CMP) {
45381 if (!isNullConstant(Cond->getOperand(1)))
45382 return false;
45384 Cond = Cond->getOperand(0);
45387 isAnd = false;
45389 SDValue SetCC0, SetCC1;
45390 switch (Cond->getOpcode()) {
45391 default: return false;
45392 case ISD::AND:
45393 case X86ISD::AND:
45394 isAnd = true;
45395 [[fallthrough]];
45396 case ISD::OR:
45397 case X86ISD::OR:
45398 SetCC0 = Cond->getOperand(0);
45399 SetCC1 = Cond->getOperand(1);
45400 break;
45403 // Make sure we have SETCC nodes, using the same flags value.
45404 if (SetCC0.getOpcode() != X86ISD::SETCC ||
45405 SetCC1.getOpcode() != X86ISD::SETCC ||
45406 SetCC0->getOperand(1) != SetCC1->getOperand(1))
45407 return false;
45409 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
45410 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
45411 Flags = SetCC0->getOperand(1);
45412 return true;
45415 // When legalizing carry, we create carries via add X, -1.
45416 // If that comes from an actual carry, via setcc, we use the
45417 // carry directly.
45418 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
45419 if (EFLAGS.getOpcode() == X86ISD::ADD) {
45420 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
45421 bool FoundAndLSB = false;
45422 SDValue Carry = EFLAGS.getOperand(0);
45423 while (Carry.getOpcode() == ISD::TRUNCATE ||
45424 Carry.getOpcode() == ISD::ZERO_EXTEND ||
45425 (Carry.getOpcode() == ISD::AND &&
45426 isOneConstant(Carry.getOperand(1)))) {
45427 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
45428 Carry = Carry.getOperand(0);
45430 if (Carry.getOpcode() == X86ISD::SETCC ||
45431 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
45432 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
45433 uint64_t CarryCC = Carry.getConstantOperandVal(0);
45434 SDValue CarryOp1 = Carry.getOperand(1);
45435 if (CarryCC == X86::COND_B)
45436 return CarryOp1;
45437 if (CarryCC == X86::COND_A) {
45438 // Try to convert COND_A into COND_B in an attempt to facilitate
45439 // materializing "setb reg".
45441 // Do not flip "e > c", where "c" is a constant, because the Cmp
45442 // instruction cannot take an immediate as its first operand.
45444 if (CarryOp1.getOpcode() == X86ISD::SUB &&
45445 CarryOp1.getNode()->hasOneUse() &&
45446 CarryOp1.getValueType().isInteger() &&
45447 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
45448 SDValue SubCommute =
45449 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
45450 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
45451 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
45454 // If this is a check of the z flag of an add with 1, switch to the
45455 // C flag.
45456 if (CarryCC == X86::COND_E &&
45457 CarryOp1.getOpcode() == X86ISD::ADD &&
45458 isOneConstant(CarryOp1.getOperand(1)))
45459 return CarryOp1;
45460 } else if (FoundAndLSB) {
45461 SDLoc DL(Carry);
45462 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
45463 if (Carry.getOpcode() == ISD::SRL) {
45464 BitNo = Carry.getOperand(1);
45465 Carry = Carry.getOperand(0);
45467 return getBT(Carry, BitNo, DL, DAG);
45472 return SDValue();
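// Illustrative sketch (standalone, assumed name): why "add X, -1" reproduces
// a carry that originally came from a setcc. For a materialized boolean X in
// {0, 1}, the unsigned addition X + 0xFFFFFFFF wraps (sets CF) exactly when
// X == 1, so the setcc's flag can be consumed directly instead of being
// rebuilt from the add.
static bool carryThroughAddSketch(unsigned X) {
  bool CarryOut = (X + 0xFFFFFFFFu) < X; // unsigned wraparound == carry flag
  return CarryOut == (X == 1u);
}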
45475 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
45476 /// to avoid the inversion.
45477 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
45478 SelectionDAG &DAG,
45479 const X86Subtarget &Subtarget) {
45480 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
45481 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
45482 EFLAGS.getOpcode() != X86ISD::TESTP)
45483 return SDValue();
45485 // PTEST/TESTP sets EFLAGS as:
45486 // TESTZ: ZF = (Op0 & Op1) == 0
45487 // TESTC: CF = (~Op0 & Op1) == 0
45488 // TESTNZC: ZF == 0 && CF == 0
45489 MVT VT = EFLAGS.getSimpleValueType();
45490 SDValue Op0 = EFLAGS.getOperand(0);
45491 SDValue Op1 = EFLAGS.getOperand(1);
45492 MVT OpVT = Op0.getSimpleValueType();
45494 // TEST*(~X,Y) == TEST*(X,Y)
45495 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
45496 X86::CondCode InvCC;
45497 switch (CC) {
45498 case X86::COND_B:
45499 // testc -> testz.
45500 InvCC = X86::COND_E;
45501 break;
45502 case X86::COND_AE:
45503 // !testc -> !testz.
45504 InvCC = X86::COND_NE;
45505 break;
45506 case X86::COND_E:
45507 // testz -> testc.
45508 InvCC = X86::COND_B;
45509 break;
45510 case X86::COND_NE:
45511 // !testz -> !testc.
45512 InvCC = X86::COND_AE;
45513 break;
45514 case X86::COND_A:
45515 case X86::COND_BE:
45516 // testnzc -> testnzc (no change).
45517 InvCC = CC;
45518 break;
45519 default:
45520 InvCC = X86::COND_INVALID;
45521 break;
45524 if (InvCC != X86::COND_INVALID) {
45525 CC = InvCC;
45526 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
45527 DAG.getBitcast(OpVT, NotOp0), Op1);
45531 if (CC == X86::COND_B || CC == X86::COND_AE) {
45532 // TESTC(X,~X) == TESTC(X,-1)
45533 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
45534 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
45535 SDLoc DL(EFLAGS);
45536 return DAG.getNode(
45537 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
45538 DAG.getBitcast(OpVT,
45539 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
45544 if (CC == X86::COND_E || CC == X86::COND_NE) {
45545 // TESTZ(X,~Y) == TESTC(Y,X)
45546 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
45547 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
45548 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
45549 DAG.getBitcast(OpVT, NotOp1), Op0);
45552 if (Op0 == Op1) {
45553 SDValue BC = peekThroughBitcasts(Op0);
45554 EVT BCVT = BC.getValueType();
45556 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
45557 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
45558 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
45559 DAG.getBitcast(OpVT, BC.getOperand(0)),
45560 DAG.getBitcast(OpVT, BC.getOperand(1)));
45563 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
45564 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
45565 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
45566 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
45567 DAG.getBitcast(OpVT, BC.getOperand(0)),
45568 DAG.getBitcast(OpVT, BC.getOperand(1)));
45571 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
45572 // to more efficiently extract the sign bits and compare that.
45573 // TODO: Handle TESTC with comparison inversion.
45574 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
45575 // TESTP/MOVMSK combines to make sure it's never worse than PTEST?
45576 if (BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
45577 unsigned EltBits = BCVT.getScalarSizeInBits();
45578 if (DAG.ComputeNumSignBits(BC) == EltBits) {
45579 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
45580 APInt SignMask = APInt::getSignMask(EltBits);
45581 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45582 if (SDValue Res =
45583 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
45584 // For vXi16 cases we need to use pmovmskb and extract every other
45585 // sign bit.
45586 SDLoc DL(EFLAGS);
45587 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
45588 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
45589 MVT FloatVT =
45590 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
45591 Res = DAG.getBitcast(FloatVT, Res);
45592 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
45593 } else if (EltBits == 16) {
45594 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
45595 Res = DAG.getBitcast(MovmskVT, Res);
45596 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
45597 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
45598 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
45599 } else {
45600 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
45602 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
45603 DAG.getConstant(0, DL, MVT::i32));
45609 // TESTZ(-1,X) == TESTZ(X,X)
45610 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
45611 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
45613 // TESTZ(X,-1) == TESTZ(X,X)
45614 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
45615 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
45617 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
45618 // TODO: Add COND_NE handling?
45619 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
45620 SDValue Src0 = peekThroughBitcasts(Op0);
45621 SDValue Src1 = peekThroughBitcasts(Op1);
45622 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
45623 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
45624 peekThroughBitcasts(Src0.getOperand(1)), true);
45625 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
45626 peekThroughBitcasts(Src1.getOperand(1)), true);
45627 if (Src0 && Src1) {
45628 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
45629 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
45630 DAG.getBitcast(OpVT2, Src0),
45631 DAG.getBitcast(OpVT2, Src1));
45637 return SDValue();
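// Illustrative sketch (standalone scalar model, assumed name): PTEST/TESTP
// compute ZF = ((Op0 & Op1) == 0) and CF = ((~Op0 & Op1) == 0). Substituting
// ~Op0 for Op0 therefore swaps the roles of the two flags, which is why the
// combine above can absorb a NOT of Op0 by exchanging COND_E/COND_NE with
// COND_B/COND_AE.
static bool ptestInvertSketch(unsigned Op0, unsigned Op1) {
  bool ZF = (Op0 & Op1) == 0;            // "testz" condition on (Op0, Op1)
  bool CF = (~Op0 & Op1) == 0;           // "testc" condition on (Op0, Op1)
  bool ZFOfNot = ((~Op0) & Op1) == 0;    // "testz" condition on (~Op0, Op1)
  bool CFOfNot = ((~(~Op0)) & Op1) == 0; // "testc" condition on (~Op0, Op1)
  return ZFOfNot == CF && CFOfNot == ZF;
}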
45640 // Attempt to simplify the MOVMSK input based on the comparison type.
45641 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
45642 SelectionDAG &DAG,
45643 const X86Subtarget &Subtarget) {
45644 // Handle eq/ne against zero (any_of).
45645 // Handle eq/ne against -1 (all_of).
45646 if (!(CC == X86::COND_E || CC == X86::COND_NE))
45647 return SDValue();
45648 if (EFLAGS.getValueType() != MVT::i32)
45649 return SDValue();
45650 unsigned CmpOpcode = EFLAGS.getOpcode();
45651 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
45652 return SDValue();
45653 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
45654 if (!CmpConstant)
45655 return SDValue();
45656 const APInt &CmpVal = CmpConstant->getAPIntValue();
45658 SDValue CmpOp = EFLAGS.getOperand(0);
45659 unsigned CmpBits = CmpOp.getValueSizeInBits();
45660 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
45662 // Peek through any truncate.
45663 if (CmpOp.getOpcode() == ISD::TRUNCATE)
45664 CmpOp = CmpOp.getOperand(0);
45666 // Bail if we don't find a MOVMSK.
45667 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
45668 return SDValue();
45670 SDValue Vec = CmpOp.getOperand(0);
45671 MVT VecVT = Vec.getSimpleValueType();
45672 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
45673 "Unexpected MOVMSK operand");
45674 unsigned NumElts = VecVT.getVectorNumElements();
45675 unsigned NumEltBits = VecVT.getScalarSizeInBits();
45677 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
45678 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
45679 NumElts <= CmpBits && CmpVal.isMask(NumElts);
45680 if (!IsAnyOf && !IsAllOf)
45681 return SDValue();
45683 // TODO: Check more combining cases.
45684 // Here we check the number of uses of the CMP operand to decide whether to
45685 // combine. Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))"
45686 // combines are exercised by tests, and both fit this one-use constraint.
45687 bool IsOneUse = CmpOp.getNode()->hasOneUse();
45689 // See if we can peek through to a vector with a wider element type, if the
45690 // signbits extend down to all the sub-elements as well.
45691 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
45692 // potential SimplifyDemandedBits/Elts cases.
45693 // If we looked through a truncate that discarded bits, we can't do this
45694 // transform.
45695 // FIXME: We could do this transform for truncates that discarded bits by
45696 // inserting an AND mask between the new MOVMSK and the CMP.
45697 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
45698 SDValue BC = peekThroughBitcasts(Vec);
45699 MVT BCVT = BC.getSimpleValueType();
45700 unsigned BCNumElts = BCVT.getVectorNumElements();
45701 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
45702 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
45703 BCNumEltBits > NumEltBits &&
45704 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
45705 SDLoc DL(EFLAGS);
45706 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
45707 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
45708 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
45709 DAG.getConstant(CmpMask, DL, MVT::i32));
45713 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
45714 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
45715 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
45716 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
45717 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
45718 SmallVector<SDValue> Ops;
45719 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
45720 Ops.size() == 2) {
45721 SDLoc DL(EFLAGS);
45722 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
45723 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
45724 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
45725 DAG.getBitcast(SubVT, Ops[0]),
45726 DAG.getBitcast(SubVT, Ops[1]));
45727 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
45728 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
45729 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
45730 DAG.getConstant(CmpMask, DL, MVT::i32));
45734 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
45735 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
45736 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
45737 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
45738 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
45739 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
45740 SDValue BC = peekThroughBitcasts(Vec);
45741 // Ensure MOVMSK was testing every signbit of BC.
45742 if (BC.getValueType().getVectorNumElements() <= NumElts) {
45743 if (BC.getOpcode() == X86ISD::PCMPEQ) {
45744 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
45745 BC.getOperand(0), BC.getOperand(1));
45746 V = DAG.getBitcast(TestVT, V);
45747 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
45749 // Check for 256-bit split vector cases.
45750 if (BC.getOpcode() == ISD::AND &&
45751 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
45752 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
45753 SDValue LHS = BC.getOperand(0);
45754 SDValue RHS = BC.getOperand(1);
45755 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
45756 LHS.getOperand(0), LHS.getOperand(1));
45757 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
45758 RHS.getOperand(0), RHS.getOperand(1));
45759 LHS = DAG.getBitcast(TestVT, LHS);
45760 RHS = DAG.getBitcast(TestVT, RHS);
45761 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
45762 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
45767 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
45768 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
45769 // sign bits prior to the comparison with zero unless we know that
45770 // the vXi16 splats the sign bit down to the lower i8 half.
45771 // TODO: Handle all_of patterns.
45772 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
45773 SDValue VecOp0 = Vec.getOperand(0);
45774 SDValue VecOp1 = Vec.getOperand(1);
45775 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
45776 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
45777 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
45778 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
45779 SDLoc DL(EFLAGS);
45780 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
45781 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45782 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
45783 if (!SignExt0) {
45784 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
45785 DAG.getConstant(0xAAAA, DL, MVT::i16));
45787 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
45788 DAG.getConstant(0, DL, MVT::i16));
45790 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
45791 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
45792 if (CmpBits >= 16 && Subtarget.hasInt256() &&
45793 (IsAnyOf || (SignExt0 && SignExt1))) {
45794 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
45795 SDLoc DL(EFLAGS);
45796 SDValue Result = peekThroughBitcasts(Src);
45797 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
45798 Result.getValueType().getVectorNumElements() <= NumElts) {
45799 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
45800 Result.getOperand(0), Result.getOperand(1));
45801 V = DAG.getBitcast(MVT::v4i64, V);
45802 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
45804 Result = DAG.getBitcast(MVT::v32i8, Result);
45805 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45806 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
45807 if (!SignExt0 || !SignExt1) {
45808 assert(IsAnyOf &&
45809 "Only perform v16i16 signmasks for any_of patterns");
45810 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
45811 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
45813 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
45814 DAG.getConstant(CmpMask, DL, MVT::i32));
45819 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
45820 // Since we peek through a bitcast, we need to be careful if the base vector
45821 // type has smaller elements than the MOVMSK type. In that case, even if
45822 // all the elements are demanded by the shuffle mask, only the "high"
45823 // elements which have highbits that align with highbits in the MOVMSK vec
45824 // elements are actually demanded. A simplification of spurious operations
45825 // on the "low" elements take place during other simplifications.
45827 // For example:
45828 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
45829 // demanded, the result can change because we are swapping elements around.
45831 // To address this, we check that we can scale the shuffle mask to MOVMSK
45832 // element width (this will ensure "high" elements match). It's slightly overly
45833 // conservative, but fine for an edge case fold.
45834 SmallVector<int, 32> ShuffleMask, ScaledMaskUnused;
45835 SmallVector<SDValue, 2> ShuffleInputs;
45836 if (NumElts <= CmpBits &&
45837 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
45838 ShuffleMask, DAG) &&
45839 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
45840 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
45841 scaleShuffleElements(ShuffleMask, NumElts, ScaledMaskUnused)) {
45842 unsigned NumShuffleElts = ShuffleMask.size();
45843 APInt DemandedElts = APInt::getZero(NumShuffleElts);
45844 for (int M : ShuffleMask) {
45845 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
45846 DemandedElts.setBit(M);
45848 if (DemandedElts.isAllOnes()) {
45849 SDLoc DL(EFLAGS);
45850 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
45851 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45852 Result =
45853 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
45854 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
45855 EFLAGS.getOperand(1));
45859 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
45860 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
45861 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
45862 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
45863 // iff every element is referenced.
45864 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
45865 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
45866 (NumEltBits == 32 || NumEltBits == 64)) {
45867 SDLoc DL(EFLAGS);
45868 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
45869 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
45870 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
45871 SDValue LHS = Vec;
45872 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
45873 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
45874 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
45875 DAG.getBitcast(FloatVT, LHS),
45876 DAG.getBitcast(FloatVT, RHS));
45879 return SDValue();
45882 /// Optimize an EFLAGS definition used according to the condition code \p CC
45883 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
45884 /// uses of chain values.
45885 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
45886 SelectionDAG &DAG,
45887 const X86Subtarget &Subtarget) {
45888 if (CC == X86::COND_B)
45889 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
45890 return Flags;
45892 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
45893 return R;
45895 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
45896 return R;
45898 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
45899 return R;
45901 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
45904 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
45905 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
45906 TargetLowering::DAGCombinerInfo &DCI,
45907 const X86Subtarget &Subtarget) {
45908 SDLoc DL(N);
45910 SDValue FalseOp = N->getOperand(0);
45911 SDValue TrueOp = N->getOperand(1);
45912 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
45913 SDValue Cond = N->getOperand(3);
45915 // cmov X, X, ?, ? --> X
45916 if (TrueOp == FalseOp)
45917 return TrueOp;
45919 // Try to simplify the EFLAGS and condition code operands.
45920 // We can't always do this as FCMOV only supports a subset of X86 conditions.
45921 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
45922 if (!(FalseOp.getValueType() == MVT::f80 ||
45923 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
45924 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
45925 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
45926 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
45927 Flags};
45928 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
45932 // If this is a select between two integer constants, try to do some
45933 // optimizations. Note that the operands are ordered the opposite of SELECT
45934 // operands.
45935 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
45936 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
45937 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
45938 // larger than FalseC (the false value).
45939 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
45940 CC = X86::GetOppositeBranchCondition(CC);
45941 std::swap(TrueC, FalseC);
45942 std::swap(TrueOp, FalseOp);
45945 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
45946 // This is efficient for any integer data type (including i8/i16) and
45947 // shift amount.
45948 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
45949 Cond = getSETCC(CC, Cond, DL, DAG);
45951 // Zero extend the condition if needed.
45952 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
45954 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
45955 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
45956 DAG.getConstant(ShAmt, DL, MVT::i8));
45957 return Cond;
45960 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
45961 // for any integer data type, including i8/i16.
45962 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
45963 Cond = getSETCC(CC, Cond, DL, DAG);
45965 // Zero extend the condition if needed.
45966 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
45967 FalseC->getValueType(0), Cond);
45968 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
45969 SDValue(FalseC, 0));
45970 return Cond;
45973 // Optimize cases that will turn into an LEA instruction. This requires
45974 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
45975 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
45976 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
45977 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
45978 "Implicit constant truncation");
45980 bool isFastMultiplier = false;
45981 if (Diff.ult(10)) {
45982 switch (Diff.getZExtValue()) {
45983 default: break;
45984 case 1: // result = add base, cond
45985 case 2: // result = lea base( , cond*2)
45986 case 3: // result = lea base(cond, cond*2)
45987 case 4: // result = lea base( , cond*4)
45988 case 5: // result = lea base(cond, cond*4)
45989 case 8: // result = lea base( , cond*8)
45990 case 9: // result = lea base(cond, cond*8)
45991 isFastMultiplier = true;
45992 break;
45996 if (isFastMultiplier) {
45997 Cond = getSETCC(CC, Cond, DL, DAG);
45998 // Zero extend the condition if needed.
45999 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
46000 Cond);
46001 // Scale the condition by the difference.
46002 if (Diff != 1)
46003 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
46004 DAG.getConstant(Diff, DL, Cond.getValueType()));
46006 // Add the base if non-zero.
46007 if (FalseC->getAPIntValue() != 0)
46008 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
46009 SDValue(FalseC, 0));
46010 return Cond;
46016 // Handle these cases:
46017 // (select (x != c), e, c) -> (select (x != c), e, x),
46018 // (select (x == c), c, e) -> (select (x == c), x, e)
46019 // where c is an integer constant, and the "select" is the combination
46020 // of CMOV and CMP.
46022 // The rationale for this change is that the conditional-move from a constant
46023 // needs two instructions, whereas a conditional-move from a register needs
46024 // only one instruction.
46026 // CAVEAT: Replacing a constant with a symbolic value may obscure some
46027 // instruction-combining opportunities, so this optimization needs to be
46028 // postponed as late as possible.
46030 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
46031 // the DCI.xxxx conditions are provided to postpone the optimization as
46032 // late as possible.
46034 ConstantSDNode *CmpAgainst = nullptr;
46035 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
46036 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
46037 !isa<ConstantSDNode>(Cond.getOperand(0))) {
46039 if (CC == X86::COND_NE &&
46040 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
46041 CC = X86::GetOppositeBranchCondition(CC);
46042 std::swap(TrueOp, FalseOp);
46045 if (CC == X86::COND_E &&
46046 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
46047 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
46048 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
46049 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46054 // Transform:
46056 // (cmov 1 T (uge T 2))
46058 // to:
46060 // (adc T 0 (sub T 1))
46061 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
46062 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
46063 SDValue Cond0 = Cond.getOperand(0);
46064 if (Cond0.getOpcode() == ISD::TRUNCATE)
46065 Cond0 = Cond0.getOperand(0);
46066 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
46067 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
46068 EVT CondVT = Cond->getValueType(0);
46069 EVT OuterVT = N->getValueType(0);
46070 // Subtract 1 and generate a carry.
46071 SDValue NewSub =
46072 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
46073 DAG.getConstant(1, DL, CondVT));
46074 SDValue EFLAGS(NewSub.getNode(), 1);
46075 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
46076 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
46080 // Fold and/or of setcc's to double CMOV:
46081 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
46082 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
46084 // This combine lets us generate:
46085 // cmovcc1 (jcc1 if we don't have CMOV)
46086 // cmovcc2 (same)
46087 // instead of:
46088 // setcc1
46089 // setcc2
46090 // and/or
46091 // cmovne (jne if we don't have CMOV)
46092 // When we can't use the CMOV instruction, it might increase branch
46093 // mispredicts.
46094 // When we can use CMOV, or when there is no mispredict, this improves
46095 // throughput and reduces register pressure.
46097 if (CC == X86::COND_NE) {
46098 SDValue Flags;
46099 X86::CondCode CC0, CC1;
46100 bool isAndSetCC;
46101 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
46102 if (isAndSetCC) {
46103 std::swap(FalseOp, TrueOp);
46104 CC0 = X86::GetOppositeBranchCondition(CC0);
46105 CC1 = X86::GetOppositeBranchCondition(CC1);
46108 SDValue LOps[] = {FalseOp, TrueOp,
46109 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
46110 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
46111 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
46112 Flags};
46113 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46114 return CMOV;
46118 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
46119 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
46120 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
46121 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
46122 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
46123 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
46124 SDValue Add = TrueOp;
46125 SDValue Const = FalseOp;
46126 // Canonicalize the condition code for easier matching and output.
46127 if (CC == X86::COND_E)
46128 std::swap(Add, Const);
46130 // We might have replaced the constant in the cmov with the LHS of the
46131 // compare. If so change it to the RHS of the compare.
46132 if (Const == Cond.getOperand(0))
46133 Const = Cond.getOperand(1);
46135 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
46136 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
46137 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
46138 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
46139 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
46140 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
46141 EVT VT = N->getValueType(0);
46142 // This should constant fold.
46143 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
46144 SDValue CMov =
46145 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
46146 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
46147 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
46151 return SDValue();
46154 /// Different mul shrinking modes.
46155 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
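/// Return true if the operands of the vXi32 multiply \p N are known, via their
/// computed sign bits and positivity, to fit in 8 or 16 bits, and report the
/// matching ShrinkMode in \p Mode.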
46157 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
46158 EVT VT = N->getOperand(0).getValueType();
46159 if (VT.getScalarSizeInBits() != 32)
46160 return false;
46162 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
46163 unsigned SignBits[2] = {1, 1};
46164 bool IsPositive[2] = {false, false};
46165 for (unsigned i = 0; i < 2; i++) {
46166 SDValue Opd = N->getOperand(i);
46168 SignBits[i] = DAG.ComputeNumSignBits(Opd);
46169 IsPositive[i] = DAG.SignBitIsZero(Opd);
46172 bool AllPositive = IsPositive[0] && IsPositive[1];
46173 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
46174 // When ranges are from -128 ~ 127, use MULS8 mode.
46175 if (MinSignBits >= 25)
46176 Mode = ShrinkMode::MULS8;
46177 // When ranges are from 0 ~ 255, use MULU8 mode.
46178 else if (AllPositive && MinSignBits >= 24)
46179 Mode = ShrinkMode::MULU8;
46180 // When ranges are from -32768 ~ 32767, use MULS16 mode.
46181 else if (MinSignBits >= 17)
46182 Mode = ShrinkMode::MULS16;
46183 // When ranges are from 0 ~ 65535, use MULU16 mode.
46184 else if (AllPositive && MinSignBits >= 16)
46185 Mode = ShrinkMode::MULU16;
46186 else
46187 return false;
46188 return true;
46191 /// When the operands of vector mul are extended from smaller size values,
46192 /// like i8 and i16, the type of mul may be shrunk to generate more
46193 /// efficient code. Two typical patterns are handled:
46194 /// Pattern1:
46195 /// %2 = sext/zext <N x i8> %1 to <N x i32>
46196 /// %4 = sext/zext <N x i8> %3 to <N x i32>
46197 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
46198 /// %5 = mul <N x i32> %2, %4
46200 /// Pattern2:
46201 /// %2 = zext/sext <N x i16> %1 to <N x i32>
46202 /// %4 = zext/sext <N x i16> %3 to <N x i32>
46203 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
46204 /// %5 = mul <N x i32> %2, %4
46206 /// There are four mul shrinking modes:
46207 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
46208 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
46209 /// generate pmullw+sext32 for it (MULS8 mode).
46210 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
46211 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
46212 /// generate pmullw+zext32 for it (MULU8 mode).
46213 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
46214 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
46215 /// generate pmullw+pmulhw for it (MULS16 mode).
46216 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
46217 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
46218 /// generate pmullw+pmulhuw for it (MULU16 mode).
46219 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
46220 const X86Subtarget &Subtarget) {
46221 // Check for legality
46222 // pmullw/pmulhw are not available before SSE2.
46223 if (!Subtarget.hasSSE2())
46224 return SDValue();
46226 // Check for profitability
46227 // pmulld is supported since SSE41. It is better to use pmulld
46228 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
46229 // the expansion.
46230 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
46231 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
46232 return SDValue();
46234 ShrinkMode Mode;
46235 if (!canReduceVMulWidth(N, DAG, Mode))
46236 return SDValue();
46238 SDLoc DL(N);
46239 SDValue N0 = N->getOperand(0);
46240 SDValue N1 = N->getOperand(1);
46241 EVT VT = N->getOperand(0).getValueType();
46242 unsigned NumElts = VT.getVectorNumElements();
46243 if ((NumElts % 2) != 0)
46244 return SDValue();
46246 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
46248 // Shrink the operands of mul.
46249 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
46250 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
46252 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
46253 // lower part is needed.
46254 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
46255 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
46256 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
46257 : ISD::SIGN_EXTEND,
46258 DL, VT, MulLo);
46260 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
46261 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
46262 // the higher part is also needed.
46263 SDValue MulHi =
46264 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
46265 ReducedVT, NewN0, NewN1);
46267 // Repack the lower part and higher part result of mul into a wider
46268 // result.
46269 // Generate shuffle functioning as punpcklwd.
46270 SmallVector<int, 16> ShuffleMask(NumElts);
46271 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
46272 ShuffleMask[2 * i] = i;
46273 ShuffleMask[2 * i + 1] = i + NumElts;
46275 SDValue ResLo =
46276 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
46277 ResLo = DAG.getBitcast(ResVT, ResLo);
46278 // Generate shuffle functioning as punpckhwd.
46279 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
46280 ShuffleMask[2 * i] = i + NumElts / 2;
46281 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
46283 SDValue ResHi =
46284 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
46285 ResHi = DAG.getBitcast(ResVT, ResHi);
46286 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
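/// Lower a multiply by selected constant amounts (11, 13, 19, 21, 22, 23, 26,
/// 28, 29, 37, 41, 73, and powers of 2 plus 2/4/8) into short sequences of
/// MUL_IMM by 3/5/9, shifts and adds/subs that map well onto LEA and SHL.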
46289 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
46290 EVT VT, const SDLoc &DL) {
46292 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
46293 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
46294 DAG.getConstant(Mult, DL, VT));
46295 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
46296 DAG.getConstant(Shift, DL, MVT::i8));
46297 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
46298 N->getOperand(0));
46299 return Result;
46302 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
46303 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
46304 DAG.getConstant(Mul1, DL, VT));
46305 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
46306 DAG.getConstant(Mul2, DL, VT));
46307 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
46308 N->getOperand(0));
46309 return Result;
46312 switch (MulAmt) {
46313 default:
46314 break;
46315 case 11:
46316 // mul x, 11 => add ((shl (mul x, 5), 1), x)
46317 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
46318 case 21:
46319 // mul x, 21 => add ((shl (mul x, 5), 2), x)
46320 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
46321 case 41:
46322 // mul x, 41 => add ((shl (mul x, 5), 3), x)
46323 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
46324 case 22:
46325 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
46326 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
46327 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
46328 case 19:
46329 // mul x, 19 => add ((shl (mul x, 9), 1), x)
46330 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
46331 case 37:
46332 // mul x, 37 => add ((shl (mul x, 9), 2), x)
46333 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
46334 case 73:
46335 // mul x, 73 => add ((shl (mul x, 9), 3), x)
46336 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
46337 case 13:
46338 // mul x, 13 => add ((shl (mul x, 3), 2), x)
46339 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
46340 case 23:
46341 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
46342 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
46343 case 26:
46344 // mul x, 26 => add ((mul (mul x, 5), 5), x)
46345 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
46346 case 28:
46347 // mul x, 28 => add ((mul (mul x, 9), 3), x)
46348 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
46349 case 29:
46350 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
46351 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
46352 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
46355 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
46356 // followed by a single LEA.
46357 // First check if this is a sum of two powers of 2 because that's easy. Then
46358 // count the trailing zeros up to the first set bit.
46359 // TODO: We can do this even without LEA at a cost of two shifts and an add.
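// For example, MulAmt = 20 = 16 + 4 becomes (X << 4) + (X << 2), and the
// second shift can be folded into an LEA scale.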
46360 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
46361 unsigned ScaleShift = llvm::countr_zero(MulAmt);
46362 if (ScaleShift >= 1 && ScaleShift < 4) {
46363 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
46364 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46365 DAG.getConstant(ShiftAmt, DL, MVT::i8));
46366 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46367 DAG.getConstant(ScaleShift, DL, MVT::i8));
46368 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
46372 return SDValue();
46375 // If both operands have at most 16 significant bits and the upper 17 bits of
46376 // either operand are zero (or can safely be made zero), then we can use
46377 // PMADDWD, which is always at least as quick as PMULLD, except on KNL.
46378 static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
46379 const X86Subtarget &Subtarget) {
46380 if (!Subtarget.hasSSE2())
46381 return SDValue();
46383 if (Subtarget.isPMADDWDSlow())
46384 return SDValue();
46386 EVT VT = N->getValueType(0);
46388 // Only support vXi32 vectors.
46389 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
46390 return SDValue();
46392 // Make sure the type is legal or can split/widen to a legal type.
46393 // With AVX512 but without BWI, we would need to split v32i16.
46394 unsigned NumElts = VT.getVectorNumElements();
46395 if (NumElts == 1 || !isPowerOf2_32(NumElts))
46396 return SDValue();
46398 // With AVX512 but without BWI, we would need to split v32i16.
46399 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
46400 return SDValue();
46402 SDValue N0 = N->getOperand(0);
46403 SDValue N1 = N->getOperand(1);
46405 // If we are zero/sign extending two steps without SSE4.1, it's better to
46406 // reduce the vmul width instead.
46407 if (!Subtarget.hasSSE41() &&
46408 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
46409 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
46410 (N1.getOpcode() == ISD::ZERO_EXTEND &&
46411 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
46412 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
46413 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
46414 (N1.getOpcode() == ISD::SIGN_EXTEND &&
46415 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
46416 return SDValue();
46418 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
46419 // the vmul width instead.
46420 if (!Subtarget.hasSSE41() &&
46421 (N0.getOpcode() == ISD::SIGN_EXTEND &&
46422 N0.getOperand(0).getValueSizeInBits() > 128) &&
46423 (N1.getOpcode() == ISD::SIGN_EXTEND &&
46424 N1.getOperand(0).getValueSizeInBits() > 128))
46425 return SDValue();
46427 // Sign bits must extend down to the lowest i16.
46428 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
46429 DAG.ComputeMaxSignificantBits(N0) > 16)
46430 return SDValue();
46432 // At least one of the elements must be zero in the upper 17 bits, or can be
46433 // safely made zero without altering the final result.
46434 auto GetZeroableOp = [&](SDValue Op) {
46435 APInt Mask17 = APInt::getHighBitsSet(32, 17);
46436 if (DAG.MaskedValueIsZero(Op, Mask17))
46437 return Op;
46438 // Mask off upper 16-bits of sign-extended constants.
46439 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
46440 return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
46441 DAG.getConstant(0xFFFF, SDLoc(N), VT));
46442 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
46443 SDValue Src = Op.getOperand(0);
46444 // Convert sext(vXi16) to zext(vXi16).
46445 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
46446 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
46447 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
46448 // which will expand the extension.
46449 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
46450 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
46451 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
46452 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
46455 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
46456 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
46457 N->isOnlyUserOf(Op.getNode())) {
46458 SDValue Src = Op.getOperand(0);
46459 if (Src.getScalarValueSizeInBits() == 16)
46460 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
46462 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
46463 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
46464 N->isOnlyUserOf(Op.getNode())) {
46465 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
46466 Op.getOperand(1));
46468 return SDValue();
46470 SDValue ZeroN0 = GetZeroableOp(N0);
46471 SDValue ZeroN1 = GetZeroableOp(N1);
46472 if (!ZeroN0 && !ZeroN1)
46473 return SDValue();
46474 N0 = ZeroN0 ? ZeroN0 : N0;
46475 N1 = ZeroN1 ? ZeroN1 : N1;
46477 // Use SplitOpsAndApply to handle AVX splitting.
46478 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46479 ArrayRef<SDValue> Ops) {
46480 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
46481 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
46482 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
46483 DAG.getBitcast(OpVT, Ops[0]),
46484 DAG.getBitcast(OpVT, Ops[1]));
46486 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
46487 PMADDWDBuilder);
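/// Attempt to replace a vXi64 multiply with PMULDQ (when the upper 32 bits of
/// both operands are known sign bits, on SSE4.1) or PMULUDQ (when the upper
/// 32 bits of both operands are known zero).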
46490 static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
46491 const X86Subtarget &Subtarget) {
46492 if (!Subtarget.hasSSE2())
46493 return SDValue();
46495 EVT VT = N->getValueType(0);
46497 // Only support vXi64 vectors.
46498 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
46499 VT.getVectorNumElements() < 2 ||
46500 !isPowerOf2_32(VT.getVectorNumElements()))
46501 return SDValue();
46503 SDValue N0 = N->getOperand(0);
46504 SDValue N1 = N->getOperand(1);
46506 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
46507 // 32 bits. We can lower with this if the sign bits stretch that far.
46508 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
46509 DAG.ComputeNumSignBits(N1) > 32) {
46510 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46511 ArrayRef<SDValue> Ops) {
46512 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
46514 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
46515 PMULDQBuilder, /*CheckBWI*/false);
46518 // If the upper bits are zero we can use a single pmuludq.
46519 APInt Mask = APInt::getHighBitsSet(64, 32);
46520 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
46521 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46522 ArrayRef<SDValue> Ops) {
46523 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
46525 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
46526 PMULUDQBuilder, /*CheckBWI*/false);
46529 return SDValue();
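/// Combine ISD::MUL: first try the PMADDWD/PMULDQ/width-reduction folds for
/// vectors, then (guarded by MulConstantOptimization) decompose multiplies by
/// constants into cheaper shift/LEA-style sequences.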
46532 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
46533 TargetLowering::DAGCombinerInfo &DCI,
46534 const X86Subtarget &Subtarget) {
46535 EVT VT = N->getValueType(0);
46537 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
46538 return V;
46540 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
46541 return V;
46543 if (DCI.isBeforeLegalize() && VT.isVector())
46544 return reduceVMULWidth(N, DAG, Subtarget);
46546 // Optimize a single multiply with constant into two operations in order to
46547 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
46548 if (!MulConstantOptimization)
46549 return SDValue();
46551 // An imul is usually smaller than the alternative sequence.
46552 if (DAG.getMachineFunction().getFunction().hasMinSize())
46553 return SDValue();
46555 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
46556 return SDValue();
46558 if (VT != MVT::i64 && VT != MVT::i32 &&
46559 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
46560 return SDValue();
46562 ConstantSDNode *CNode = isConstOrConstSplat(
46563 N->getOperand(1), /*AllowUndefs*/ true, /*AllowTrunc*/ false);
46564 const APInt *C = nullptr;
46565 if (!CNode) {
46566 if (VT.isVector())
46567 if (auto *RawC = getTargetConstantFromNode(N->getOperand(1)))
46568 if (auto *SplatC = RawC->getSplatValue())
46569 C = &(SplatC->getUniqueInteger());
46571 if (!C || C->getBitWidth() != VT.getScalarSizeInBits())
46572 return SDValue();
46573 } else {
46574 C = &(CNode->getAPIntValue());
46577 if (isPowerOf2_64(C->getZExtValue()))
46578 return SDValue();
46580 int64_t SignMulAmt = C->getSExtValue();
46581 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
46582 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
46584 SDLoc DL(N);
46585 SDValue NewMul = SDValue();
46586 if (VT == MVT::i64 || VT == MVT::i32) {
46587 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
46588 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
46589 DAG.getConstant(AbsMulAmt, DL, VT));
46590 if (SignMulAmt < 0)
46591 NewMul =
46592 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
46594 return NewMul;
46597 uint64_t MulAmt1 = 0;
46598 uint64_t MulAmt2 = 0;
46599 if ((AbsMulAmt % 9) == 0) {
46600 MulAmt1 = 9;
46601 MulAmt2 = AbsMulAmt / 9;
46602 } else if ((AbsMulAmt % 5) == 0) {
46603 MulAmt1 = 5;
46604 MulAmt2 = AbsMulAmt / 5;
46605 } else if ((AbsMulAmt % 3) == 0) {
46606 MulAmt1 = 3;
46607 MulAmt2 = AbsMulAmt / 3;
46610 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
46611 if (MulAmt2 &&
46612 (isPowerOf2_64(MulAmt2) ||
46613 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
46615 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
46616 N->use_begin()->getOpcode() == ISD::ADD))
46617 // If the second multiplier is a power of 2, issue it first. We want the multiply
46618 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
46619 // use is an add. Only do this for positive multiply amounts since the
46620 // negate would prevent it from being used as an address mode anyway.
46621 std::swap(MulAmt1, MulAmt2);
46623 if (isPowerOf2_64(MulAmt1))
46624 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46625 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
46626 else
46627 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
46628 DAG.getConstant(MulAmt1, DL, VT));
46630 if (isPowerOf2_64(MulAmt2))
46631 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
46632 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
46633 else
46634 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
46635 DAG.getConstant(MulAmt2, DL, VT));
46637 // Negate the result.
46638 if (SignMulAmt < 0)
46639 NewMul =
46640 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
46641 } else if (!Subtarget.slowLEA())
46642 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
46644 if (!NewMul) {
46645 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
46646 assert(C->getZExtValue() != 0 &&
46647 C->getZExtValue() != maxUIntN(VT.getScalarSizeInBits()) &&
46648 "Both cases that could cause potential overflows should have "
46649 "already been handled.");
46650 if (isPowerOf2_64(AbsMulAmt - 1)) {
46651 // (mul x, 2^N + 1) => (add (shl x, N), x)
46652 NewMul = DAG.getNode(
46653 ISD::ADD, DL, VT, N->getOperand(0),
46654 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46655 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
46656 // To negate, subtract the number from zero
46657 if (SignMulAmt < 0)
46658 NewMul =
46659 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
46660 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
46661 // (mul x, 2^N - 1) => (sub (shl x, N), x)
46662 NewMul =
46663 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46664 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
46665 // To negate, reverse the operands of the subtract.
46666 if (SignMulAmt < 0)
46667 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
46668 else
46669 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
46670 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
46671 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
46672 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
46673 NewMul =
46674 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46675 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
46676 NewMul = DAG.getNode(
46677 ISD::ADD, DL, VT, NewMul,
46678 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
46679 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
46680 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
46681 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
46682 NewMul =
46683 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46684 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
46685 NewMul = DAG.getNode(
46686 ISD::SUB, DL, VT, NewMul,
46687 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
46688 } else if (SignMulAmt >= 0 && VT.isVector() &&
46689 Subtarget.fastImmVectorShift()) {
46690 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
46691 uint64_t ShiftAmt1;
46692 std::optional<unsigned> Opc;
46693 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
46694 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
46695 Opc = ISD::ADD;
46696 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
46697 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
46698 Opc = ISD::SUB;
46701 if (Opc) {
46702 SDValue Shift1 =
46703 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46704 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
46705 SDValue Shift2 =
46706 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
46707 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
46708 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
46713 return NewMul;
46716 // Try to form a MULHU or MULHS node by looking for
46717 // (srl (mul ext, ext), 16)
46718 // TODO: This is X86 specific because we want to be able to handle wide types
46719 // before type legalization. But we can only do it if the vector will be
46720 // legalized via widening/splitting. Type legalization can't handle promotion
46721 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
46722 // combiner.
46723 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
46724 const X86Subtarget &Subtarget) {
46725 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
46726 "SRL or SRA node is required here!");
46727 SDLoc DL(N);
46729 if (!Subtarget.hasSSE2())
46730 return SDValue();
46732 // The operation feeding into the shift must be a multiply.
46733 SDValue ShiftOperand = N->getOperand(0);
46734 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
46735 return SDValue();
46737 // Input type should be at least vXi32.
46738 EVT VT = N->getValueType(0);
46739 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
46740 return SDValue();
46742 // Need a shift by 16.
46743 APInt ShiftAmt;
46744 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
46745 ShiftAmt != 16)
46746 return SDValue();
46748 SDValue LHS = ShiftOperand.getOperand(0);
46749 SDValue RHS = ShiftOperand.getOperand(1);
46751 unsigned ExtOpc = LHS.getOpcode();
46752 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
46753 RHS.getOpcode() != ExtOpc)
46754 return SDValue();
46756 // Peek through the extends.
46757 LHS = LHS.getOperand(0);
46758 RHS = RHS.getOperand(0);
46760 // Ensure the input types match.
46761 EVT MulVT = LHS.getValueType();
46762 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
46763 return SDValue();
46765 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
46766 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
46768 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
46769 return DAG.getNode(ExtOpc, DL, VT, Mulh);
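/// Combine ISD::SHL: fold (shl (and (setcc_c), c1), c2) into
/// (and setcc_c, (c1 << c2)) when the shifted mask provably preserves the
/// semantics of the underlying SETCC_CARRY pattern.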
46772 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
46773 SDValue N0 = N->getOperand(0);
46774 SDValue N1 = N->getOperand(1);
46775 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
46776 EVT VT = N0.getValueType();
46778 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
46779 // since the result of setcc_c is all zeros or all ones.
46780 if (VT.isInteger() && !VT.isVector() &&
46781 N1C && N0.getOpcode() == ISD::AND &&
46782 N0.getOperand(1).getOpcode() == ISD::Constant) {
46783 SDValue N00 = N0.getOperand(0);
46784 APInt Mask = N0.getConstantOperandAPInt(1);
46785 Mask <<= N1C->getAPIntValue();
46786 bool MaskOK = false;
46787 // We can handle cases concerning bit-widening nodes containing setcc_c if
46788 // we carefully interrogate the mask to make sure the transform is
46789 // semantics-preserving.
46790 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
46791 // of the underlying setcc_c operation if the setcc_c was zero extended.
46792 // Consider the following example:
46793 // zext(setcc_c) -> i32 0x0000FFFF
46794 // c1 -> i32 0x0000FFFF
46795 // c2 -> i32 0x00000001
46796 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
46797 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
46798 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
46799 MaskOK = true;
46800 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
46801 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
46802 MaskOK = true;
46803 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
46804 N00.getOpcode() == ISD::ANY_EXTEND) &&
46805 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
46806 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
46808 if (MaskOK && Mask != 0) {
46809 SDLoc DL(N);
46810 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
46814 return SDValue();
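/// Combine ISD::SRA: try to form MULHS/MULHU via combineShiftToPMULH, then
/// rewrite (sra (shl X, C1), C2) as a SIGN_EXTEND_INREG of a narrower type
/// followed by an adjusted shift when C1 is one of [56,48,32,24,16].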
46817 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
46818 const X86Subtarget &Subtarget) {
46819 SDValue N0 = N->getOperand(0);
46820 SDValue N1 = N->getOperand(1);
46821 EVT VT = N0.getValueType();
46822 unsigned Size = VT.getSizeInBits();
46824 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
46825 return V;
46827 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
46828 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
46829 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
46830 // depending on sign of (SarConst - [56,48,32,24,16])
46832 // sexts on X86 are MOVs. The MOVs have the same code size
46833 // as the SHIFTs above (only a SHIFT by 1 has smaller code size).
46834 // However, the MOVs have two advantages over a SHIFT:
46835 // 1. MOVs can write to a register that differs from the source.
46836 // 2. MOVs accept memory operands.
46838 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
46839 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
46840 N0.getOperand(1).getOpcode() != ISD::Constant)
46841 return SDValue();
46843 SDValue N00 = N0.getOperand(0);
46844 SDValue N01 = N0.getOperand(1);
46845 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
46846 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
46847 EVT CVT = N1.getValueType();
46849 if (SarConst.isNegative())
46850 return SDValue();
46852 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
46853 unsigned ShiftSize = SVT.getSizeInBits();
46854 // Skip types without a corresponding sext/zext and ShlConst values
46855 // that are not one of [56,48,32,24,16].
46856 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
46857 continue;
46858 SDLoc DL(N);
46859 SDValue NN =
46860 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
46861 SarConst = SarConst - (Size - ShiftSize);
46862 if (SarConst == 0)
46863 return NN;
46864 if (SarConst.isNegative())
46865 return DAG.getNode(ISD::SHL, DL, VT, NN,
46866 DAG.getConstant(-SarConst, DL, CVT));
46867 return DAG.getNode(ISD::SRA, DL, VT, NN,
46868 DAG.getConstant(SarConst, DL, CVT));
46870 return SDValue();
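/// Combine ISD::SRL: try to form MULHS/MULHU via combineShiftToPMULH, then,
/// on the last DAG combine only, commute srl (and X, C1), C2 into
/// and (srl X, C2), (C1 >> C2) when that shrinks the mask constant.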
46873 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
46874 TargetLowering::DAGCombinerInfo &DCI,
46875 const X86Subtarget &Subtarget) {
46876 SDValue N0 = N->getOperand(0);
46877 SDValue N1 = N->getOperand(1);
46878 EVT VT = N0.getValueType();
46880 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
46881 return V;
46883 // Only do this on the last DAG combine as it can interfere with other
46884 // combines.
46885 if (!DCI.isAfterLegalizeDAG())
46886 return SDValue();
46888 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
46889 // TODO: This is a generic DAG combine that became an x86-only combine to
46890 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
46891 // and-not ('andn').
46892 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
46893 return SDValue();
46895 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
46896 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
46897 if (!ShiftC || !AndC)
46898 return SDValue();
46900 // If we can shrink the constant mask below 8 bits or 32 bits, then this
46901 // transform should reduce code size. It may also enable secondary transforms
46902 // from improved known-bits analysis or instruction selection.
46903 APInt MaskVal = AndC->getAPIntValue();
46905 // If this can be matched by a zero extend, don't optimize.
46906 if (MaskVal.isMask()) {
46907 unsigned TO = MaskVal.countr_one();
46908 if (TO >= 8 && isPowerOf2_32(TO))
46909 return SDValue();
46912 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
46913 unsigned OldMaskSize = MaskVal.getSignificantBits();
46914 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
46915 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
46916 (OldMaskSize > 32 && NewMaskSize <= 32)) {
46917 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
46918 SDLoc DL(N);
46919 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
46920 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
46921 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
46923 return SDValue();
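/// Attempt to push shuffles through horizontal add/sub and pack nodes,
/// folding HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) style patterns into
/// SHUFFLE(HOP(...)) to reduce lane-crossing shuffles.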
46926 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
46927 const X86Subtarget &Subtarget) {
46928 unsigned Opcode = N->getOpcode();
46929 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
46931 SDLoc DL(N);
46932 EVT VT = N->getValueType(0);
46933 SDValue N0 = N->getOperand(0);
46934 SDValue N1 = N->getOperand(1);
46935 EVT SrcVT = N0.getValueType();
46937 SDValue BC0 =
46938 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
46939 SDValue BC1 =
46940 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
46942 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
46943 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
46944 // truncation trees that help us avoid lane crossing shuffles.
46945 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
46946 // TODO: We don't handle vXf64 shuffles yet.
46947 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
46948 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
46949 SmallVector<SDValue> ShuffleOps;
46950 SmallVector<int> ShuffleMask, ScaledMask;
46951 SDValue Vec = peekThroughBitcasts(BCSrc);
46952 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
46953 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
46954 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
46955 // shuffle to a v4X64 width - we can probably relax this in the future.
46956 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
46957 ShuffleOps[0].getValueType().is256BitVector() &&
46958 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
46959 SDValue Lo, Hi;
46960 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
46961 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
46962 Lo = DAG.getBitcast(SrcVT, Lo);
46963 Hi = DAG.getBitcast(SrcVT, Hi);
46964 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
46965 Res = DAG.getBitcast(ShufVT, Res);
46966 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
46967 return DAG.getBitcast(VT, Res);
46973 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
46974 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
46975 // If either/both ops are a shuffle that can scale to v2x64,
46976 // then see if we can perform this as a v4x32 post shuffle.
46977 SmallVector<SDValue> Ops0, Ops1;
46978 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
46979 bool IsShuf0 =
46980 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
46981 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
46982 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
46983 bool IsShuf1 =
46984 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
46985 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
46986 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
46987 if (IsShuf0 || IsShuf1) {
46988 if (!IsShuf0) {
46989 Ops0.assign({BC0});
46990 ScaledMask0.assign({0, 1});
46992 if (!IsShuf1) {
46993 Ops1.assign({BC1});
46994 ScaledMask1.assign({0, 1});
46997 SDValue LHS, RHS;
46998 int PostShuffle[4] = {-1, -1, -1, -1};
46999 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
47000 if (M < 0)
47001 return true;
47002 Idx = M % 2;
47003 SDValue Src = Ops[M / 2];
47004 if (!LHS || LHS == Src) {
47005 LHS = Src;
47006 return true;
47008 if (!RHS || RHS == Src) {
47009 Idx += 2;
47010 RHS = Src;
47011 return true;
47013 return false;
47015 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
47016 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
47017 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
47018 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
47019 LHS = DAG.getBitcast(SrcVT, LHS);
47020 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
47021 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
47022 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
47023 Res = DAG.getBitcast(ShufVT, Res);
47024 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
47025 return DAG.getBitcast(VT, Res);
47030 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
47031 if (VT.is256BitVector() && Subtarget.hasInt256()) {
47032 SmallVector<int> Mask0, Mask1;
47033 SmallVector<SDValue> Ops0, Ops1;
47034 SmallVector<int, 2> ScaledMask0, ScaledMask1;
47035 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
47036 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
47037 !Ops0.empty() && !Ops1.empty() &&
47038 all_of(Ops0,
47039 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47040 all_of(Ops1,
47041 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47042 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
47043 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
47044 SDValue Op00 = peekThroughBitcasts(Ops0.front());
47045 SDValue Op10 = peekThroughBitcasts(Ops1.front());
47046 SDValue Op01 = peekThroughBitcasts(Ops0.back());
47047 SDValue Op11 = peekThroughBitcasts(Ops1.back());
47048 if ((Op00 == Op11) && (Op01 == Op10)) {
47049 std::swap(Op10, Op11);
47050 ShuffleVectorSDNode::commuteMask(ScaledMask1);
47052 if ((Op00 == Op10) && (Op01 == Op11)) {
47053 const int Map[4] = {0, 2, 1, 3};
47054 SmallVector<int, 4> ShuffleMask(
47055 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
47056 Map[ScaledMask1[1]]});
47057 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
47058 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
47059 DAG.getBitcast(SrcVT, Op01));
47060 Res = DAG.getBitcast(ShufVT, Res);
47061 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
47062 return DAG.getBitcast(VT, Res);
47067 return SDValue();
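/// Combine X86ISD::PACKSS/PACKUS: constant fold the pack, reuse the
/// shuffle-through-HOP fold, fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)),
/// merge with truncates on AVX512, and fold PACK(EXTEND,EXTEND) to CONCAT.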
47070 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
47071 TargetLowering::DAGCombinerInfo &DCI,
47072 const X86Subtarget &Subtarget) {
47073 unsigned Opcode = N->getOpcode();
47074 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
47075 "Unexpected pack opcode");
47077 EVT VT = N->getValueType(0);
47078 SDValue N0 = N->getOperand(0);
47079 SDValue N1 = N->getOperand(1);
47080 unsigned NumDstElts = VT.getVectorNumElements();
47081 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
47082 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
47083 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
47084 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
47085 "Unexpected PACKSS/PACKUS input type");
47087 bool IsSigned = (X86ISD::PACKSS == Opcode);
47089 // Constant Folding.
47090 APInt UndefElts0, UndefElts1;
47091 SmallVector<APInt, 32> EltBits0, EltBits1;
47092 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
47093 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
47094 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
47095 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
47096 unsigned NumLanes = VT.getSizeInBits() / 128;
47097 unsigned NumSrcElts = NumDstElts / 2;
47098 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
47099 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
47101 APInt Undefs(NumDstElts, 0);
47102 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
47103 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
47104 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
47105 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
47106 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
47107 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
47109 if (UndefElts[SrcIdx]) {
47110 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
47111 continue;
47114 APInt &Val = EltBits[SrcIdx];
47115 if (IsSigned) {
47116 // PACKSS: Truncate signed value with signed saturation.
47117 // Source values less than dst minint are saturated to minint.
47118 // Source values greater than dst maxint are saturated to maxint.
47119 if (Val.isSignedIntN(DstBitsPerElt))
47120 Val = Val.trunc(DstBitsPerElt);
47121 else if (Val.isNegative())
47122 Val = APInt::getSignedMinValue(DstBitsPerElt);
47123 else
47124 Val = APInt::getSignedMaxValue(DstBitsPerElt);
47125 } else {
47126 // PACKUS: Truncate signed value with unsigned saturation.
47127 // Source values less than zero are saturated to zero.
47128 // Source values greater than dst maxuint are saturated to maxuint.
47129 if (Val.isIntN(DstBitsPerElt))
47130 Val = Val.trunc(DstBitsPerElt);
47131 else if (Val.isNegative())
47132 Val = APInt::getZero(DstBitsPerElt);
47133 else
47134 Val = APInt::getAllOnes(DstBitsPerElt);
47136 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
47140 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
47143 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
47144 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
47145 return V;
47147 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
47148 // Currently limit this to allsignbits cases only.
47149 if (IsSigned &&
47150 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
47151 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
47152 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
47153 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
47154 if (Not0 && Not1) {
47155 SDLoc DL(N);
47156 MVT SrcVT = N0.getSimpleValueType();
47157 SDValue Pack =
47158 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
47159 DAG.getBitcast(SrcVT, Not1));
47160 return DAG.getNOT(DL, Pack, VT);
47164 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
47165 // truncate to create a larger truncate.
47166 if (Subtarget.hasAVX512() &&
47167 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
47168 N0.getOperand(0).getValueType() == MVT::v8i32) {
47169 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
47170 (!IsSigned &&
47171 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
47172 if (Subtarget.hasVLX())
47173 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
47175 // Widen input to v16i32 so we can truncate that.
47176 SDLoc dl(N);
47177 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
47178 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
47179 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
47183 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
47184 if (VT.is128BitVector()) {
47185 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
47186 SDValue Src0, Src1;
47187 if (N0.getOpcode() == ExtOpc &&
47188 N0.getOperand(0).getValueType().is64BitVector() &&
47189 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
47190 Src0 = N0.getOperand(0);
47192 if (N1.getOpcode() == ExtOpc &&
47193 N1.getOperand(0).getValueType().is64BitVector() &&
47194 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
47195 Src1 = N1.getOperand(0);
47197 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
47198 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
47199 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
47200 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
47201 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
47204 // Try again with pack(*_extend_vector_inreg, undef).
47205 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
47206 : ISD::ZERO_EXTEND_VECTOR_INREG;
47207 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
47208 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
47209 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
47210 DAG);
47213 // Attempt to combine as shuffle.
47214 SDValue Op(N, 0);
47215 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47216 return Res;
47218 return SDValue();
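/// Combine X86ISD::HADD/HSUB/FHADD/FHSUB: when horizontal ops are not
/// preferred on this target, merge HOP(HOP'(X,X),HOP'(Y,Y)) into a single
/// HOP' plus PSHUFD permutes, then try the shuffle-through-HOP fold.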
47221 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
47222 TargetLowering::DAGCombinerInfo &DCI,
47223 const X86Subtarget &Subtarget) {
47224 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
47225 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
47226 "Unexpected horizontal add/sub opcode");
47228 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
47229 MVT VT = N->getSimpleValueType(0);
47230 SDValue LHS = N->getOperand(0);
47231 SDValue RHS = N->getOperand(1);
47233 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
47234 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
47235 LHS.getOpcode() == RHS.getOpcode() &&
47236 LHS.getValueType() == RHS.getValueType() &&
47237 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
47238 SDValue LHS0 = LHS.getOperand(0);
47239 SDValue LHS1 = LHS.getOperand(1);
47240 SDValue RHS0 = RHS.getOperand(0);
47241 SDValue RHS1 = RHS.getOperand(1);
47242 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
47243 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
47244 SDLoc DL(N);
47245 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
47246 LHS0.isUndef() ? LHS1 : LHS0,
47247 RHS0.isUndef() ? RHS1 : RHS0);
47248 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
47249 Res = DAG.getBitcast(ShufVT, Res);
47250 SDValue NewLHS =
47251 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
47252 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
47253 SDValue NewRHS =
47254 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
47255 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
47256 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
47257 DAG.getBitcast(VT, NewRHS));
47262 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
47263 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
47264 return V;
47266 return SDValue();
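/// Combine X86ISD::VSHL/VSRL/VSRA (shift amount taken from a vector operand):
/// fold shifts of zero, lower constant shift amounts to the immediate shift
/// opcodes, and simplify demanded vector elements.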
47269 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
47270 TargetLowering::DAGCombinerInfo &DCI,
47271 const X86Subtarget &Subtarget) {
47272 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
47273 X86ISD::VSRL == N->getOpcode()) &&
47274 "Unexpected shift opcode");
47275 EVT VT = N->getValueType(0);
47276 SDValue N0 = N->getOperand(0);
47277 SDValue N1 = N->getOperand(1);
47279 // Shift zero -> zero.
47280 if (ISD::isBuildVectorAllZeros(N0.getNode()))
47281 return DAG.getConstant(0, SDLoc(N), VT);
47283 // Detect constant shift amounts.
47284 APInt UndefElts;
47285 SmallVector<APInt, 32> EltBits;
47286 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
47287 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
47288 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
47289 EltBits[0].getZExtValue(), DAG);
47292 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47293 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
47294 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
47295 return SDValue(N, 0);
47297 return SDValue();
47300 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
47301 TargetLowering::DAGCombinerInfo &DCI,
47302 const X86Subtarget &Subtarget) {
47303 unsigned Opcode = N->getOpcode();
47304 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
47305 X86ISD::VSRLI == Opcode) &&
47306 "Unexpected shift opcode");
47307 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
47308 EVT VT = N->getValueType(0);
47309 SDValue N0 = N->getOperand(0);
47310 SDValue N1 = N->getOperand(1);
47311 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
47312 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
47313 "Unexpected value type");
47314 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
47316 // (shift undef, X) -> 0
47317 if (N0.isUndef())
47318 return DAG.getConstant(0, SDLoc(N), VT);
47320 // Out of range logical bit shifts are guaranteed to be zero.
47321 // Out of range arithmetic bit shifts splat the sign bit.
47322 unsigned ShiftVal = N->getConstantOperandVal(1);
47323 if (ShiftVal >= NumBitsPerElt) {
47324 if (LogicalShift)
47325 return DAG.getConstant(0, SDLoc(N), VT);
47326 ShiftVal = NumBitsPerElt - 1;
47329 // (shift X, 0) -> X
47330 if (!ShiftVal)
47331 return N0;
47333 // (shift 0, C) -> 0
47334 if (ISD::isBuildVectorAllZeros(N0.getNode()))
47335 // N0 is all zeros or undef. We guarantee that the bits shifted into the
47336 // result are all zeros, not undef.
47337 return DAG.getConstant(0, SDLoc(N), VT);
47339 // (VSRAI -1, C) -> -1
47340 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
47341 // N0 is all ones or undef. We guarantee that the bits shifted into the
47342 // result are all ones, not undef.
47343 return DAG.getConstant(-1, SDLoc(N), VT);
47345 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
47346 unsigned NewShiftVal = Amt0 + Amt1;
47347 if (NewShiftVal >= NumBitsPerElt) {
47348 // Out of range logical bit shifts are guaranteed to be zero.
47349 // Out of range arithmetic bit shifts splat the sign bit.
47350 if (LogicalShift)
47351 return DAG.getConstant(0, SDLoc(N), VT);
47352 NewShiftVal = NumBitsPerElt - 1;
47354 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
47355 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
47358 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
47359 if (Opcode == N0.getOpcode())
47360 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
47362 // (shl (add X, X), C) -> (shl X, (C + 1))
47363 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
47364 N0.getOperand(0) == N0.getOperand(1))
47365 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
47367 // We can decode 'whole byte' logical bit shifts as shuffles.
47368 if (LogicalShift && (ShiftVal % 8) == 0) {
47369 SDValue Op(N, 0);
47370 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47371 return Res;
47374 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
47375 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
47376 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
47377 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
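// i.e. splat bit 0 of each 64-bit element using only 32-bit ops: duplicate the
// low dword of each element, then shl/sra that dword copy by 31.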
47378 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
47379 N0.getOpcode() == X86ISD::PSHUFD &&
47380 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
47381 N0->hasOneUse()) {
47382 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
47383 if (BC.getOpcode() == X86ISD::VSHLI &&
47384 BC.getScalarValueSizeInBits() == 64 &&
47385 BC.getConstantOperandVal(1) == 63) {
47386 SDLoc DL(N);
47387 SDValue Src = BC.getOperand(0);
47388 Src = DAG.getBitcast(VT, Src);
47389 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
47390 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
47391 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
47392 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
47393 return Src;
47397 auto TryConstantFold = [&](SDValue V) {
47398 APInt UndefElts;
47399 SmallVector<APInt, 32> EltBits;
47400 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits))
47401 return SDValue();
47402 assert(EltBits.size() == VT.getVectorNumElements() &&
47403 "Unexpected shift value type");
47404 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
47405 // created an undef input due to no input bits being demanded, but the user
47406 // still expects 0 in the other bits.
47407 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
47408 APInt &Elt = EltBits[i];
47409 if (UndefElts[i])
47410 Elt = 0;
47411 else if (X86ISD::VSHLI == Opcode)
47412 Elt <<= ShiftVal;
47413 else if (X86ISD::VSRAI == Opcode)
47414 Elt.ashrInPlace(ShiftVal);
47415 else
47416 Elt.lshrInPlace(ShiftVal);
47418 // Reset undef elements since they were zeroed above.
47419 UndefElts = 0;
47420 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
47423 // Constant Folding.
47424 if (N->isOnlyUserOf(N0.getNode())) {
47425 if (SDValue C = TryConstantFold(N0))
47426 return C;
47428 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
47429 // Don't break NOT patterns.
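// (a vector NOT is xor(X, all-ones); shifting the all-ones constant here would
// stop the xor from being recognised as a NOT later on).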
47430 SDValue BC = peekThroughOneUseBitcasts(N0);
47431 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
47432 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
47433 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
47434 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
47435 SDLoc DL(N);
47436 SDValue LHS = DAG.getNode(Opcode, DL, VT,
47437 DAG.getBitcast(VT, BC.getOperand(0)), N1);
47438 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
47443 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47444 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
47445 DCI))
47446 return SDValue(N, 0);
47448 return SDValue();
47451 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
47452 TargetLowering::DAGCombinerInfo &DCI,
47453 const X86Subtarget &Subtarget) {
47454 EVT VT = N->getValueType(0);
47455 unsigned Opcode = N->getOpcode();
47456 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
47457 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
47458 Opcode == ISD::INSERT_VECTOR_ELT) &&
47459 "Unexpected vector insertion");
47461 SDValue Vec = N->getOperand(0);
47462 SDValue Scl = N->getOperand(1);
47463 SDValue Idx = N->getOperand(2);
47465 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
47466 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
47467 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
47469 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
47470 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
47471 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47472 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
47473 APInt::getAllOnes(NumBitsPerElt), DCI))
47474 return SDValue(N, 0);
47477 // Attempt to combine insertion patterns to a shuffle.
47478 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
47479 SDValue Op(N, 0);
47480 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47481 return Res;
47484 return SDValue();
47487 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
47488 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
47489 /// OR -> CMPNEQSS.
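/// e.g. a scalar 'oeq' compare is otherwise lowered to UCOMIS* followed by
/// SETE, SETNP and an AND; CMPEQSS computes the same ordered-equal predicate
/// in one instruction.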
47490 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
47491 TargetLowering::DAGCombinerInfo &DCI,
47492 const X86Subtarget &Subtarget) {
47493 unsigned opcode;
47495 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
47496 // we're requiring SSE2 for both.
47497 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
47498 SDValue N0 = N->getOperand(0);
47499 SDValue N1 = N->getOperand(1);
47500 SDValue CMP0 = N0.getOperand(1);
47501 SDValue CMP1 = N1.getOperand(1);
47502 SDLoc DL(N);
47504 // The SETCCs should both refer to the same CMP.
47505 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
47506 return SDValue();
47508 SDValue CMP00 = CMP0->getOperand(0);
47509 SDValue CMP01 = CMP0->getOperand(1);
47510 EVT VT = CMP00.getValueType();
47512 if (VT == MVT::f32 || VT == MVT::f64 ||
47513 (VT == MVT::f16 && Subtarget.hasFP16())) {
47514 bool ExpectingFlags = false;
47515 // Check for any users that want flags:
47516 for (const SDNode *U : N->uses()) {
47517 if (ExpectingFlags)
47518 break;
47520 switch (U->getOpcode()) {
47521 default:
47522 case ISD::BR_CC:
47523 case ISD::BRCOND:
47524 case ISD::SELECT:
47525 ExpectingFlags = true;
47526 break;
47527 case ISD::CopyToReg:
47528 case ISD::SIGN_EXTEND:
47529 case ISD::ZERO_EXTEND:
47530 case ISD::ANY_EXTEND:
47531 break;
47535 if (!ExpectingFlags) {
47536 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
47537 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
47539 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
47540 X86::CondCode tmp = cc0;
47541 cc0 = cc1;
47542 cc1 = tmp;
47545 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
47546 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
47547 // FIXME: need symbolic constants for these magic numbers.
47548 // See X86ATTInstPrinter.cpp:printSSECC().
47549 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
47550 if (Subtarget.hasAVX512()) {
47551 SDValue FSetCC =
47552 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
47553 DAG.getTargetConstant(x86cc, DL, MVT::i8));
47554 // Need to fill with zeros to ensure the bitcast will produce zeroes
47555 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
47556 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
47557 DAG.getConstant(0, DL, MVT::v16i1),
47558 FSetCC, DAG.getIntPtrConstant(0, DL));
47559 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
47560 N->getSimpleValueType(0));
47562 SDValue OnesOrZeroesF =
47563 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
47564 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
47566 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
47567 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
47569 if (is64BitFP && !Subtarget.is64Bit()) {
47570 // On a 32-bit target, we cannot bitcast the 64-bit float to a
47571 // 64-bit integer, since that's not a legal type. Since
47572 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
47573 // bits, but can do this little dance to extract the lowest 32 bits
47574 // and work with those going forward.
47575 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
47576 OnesOrZeroesF);
47577 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
47578 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
47579 Vector32, DAG.getIntPtrConstant(0, DL));
47580 IntVT = MVT::i32;
47583 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
47584 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
47585 DAG.getConstant(1, DL, IntVT));
47586 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
47587 ANDed);
47588 return OneBitOfTruth;
47593 return SDValue();
47596 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
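/// ANDNP computes ~X & Y directly, so folding the explicit NOT away avoids
/// materializing the all-ones constant for the XOR.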
47597 static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
47598 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
47600 MVT VT = N->getSimpleValueType(0);
47601 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
47602 return SDValue();
47604 SDValue X, Y;
47605 SDValue N0 = N->getOperand(0);
47606 SDValue N1 = N->getOperand(1);
47608 if (SDValue Not = IsNOT(N0, DAG)) {
47609 X = Not;
47610 Y = N1;
47611 } else if (SDValue Not = IsNOT(N1, DAG)) {
47612 X = Not;
47613 Y = N0;
47614 } else
47615 return SDValue();
47617 X = DAG.getBitcast(VT, X);
47618 Y = DAG.getBitcast(VT, Y);
47619 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
47622 /// Try to fold:
47623 /// and (vector_shuffle<Z,...,Z>
47624 /// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
47625 /// ->
47626 /// andnp (vector_shuffle<Z,...,Z>
47627 /// (insert_vector_elt undef, X, Z), undef), Y
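/// i.e. drop the explicit NOT of the inserted scalar and let ANDNP's built-in
/// inversion of its first operand provide it after the splat.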
47628 static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
47629 const X86Subtarget &Subtarget) {
47630 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
47632 EVT VT = N->getValueType(0);
47633 // Do not split 256- and 512-bit vectors with SSE2, as they overwrite the
47634 // original value and require extra moves.
47635 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
47636 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
47637 return SDValue();
47639 auto GetNot = [&DAG](SDValue V) {
47640 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
47641 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
47642 // end-users are ISD::AND, including cases such as
47643 // (and(extract_vector_element(SVN), Y)).
47644 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
47645 !SVN->getOperand(1).isUndef()) {
47646 return SDValue();
47648 SDValue IVEN = SVN->getOperand(0);
47649 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
47650 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
47651 return SDValue();
47652 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
47653 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
47654 return SDValue();
47655 SDValue Src = IVEN.getOperand(1);
47656 if (SDValue Not = IsNOT(Src, DAG)) {
47657 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
47658 SDValue NotIVEN =
47659 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
47660 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
47661 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
47662 SVN->getOperand(1), SVN->getMask());
47664 return SDValue();
47667 SDValue X, Y;
47668 SDValue N0 = N->getOperand(0);
47669 SDValue N1 = N->getOperand(1);
47671 if (SDValue Not = GetNot(N0)) {
47672 X = Not;
47673 Y = N1;
47674 } else if (SDValue Not = GetNot(N1)) {
47675 X = Not;
47676 Y = N0;
47677 } else
47678 return SDValue();
47680 X = DAG.getBitcast(VT, X);
47681 Y = DAG.getBitcast(VT, Y);
47682 SDLoc DL(N);
47683 // We do not split for SSE at all, but we need to split vectors for AVX1 and
47684 // AVX2.
47685 if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
47686 SDValue LoX, HiX;
47687 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
47688 SDValue LoY, HiY;
47689 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
47690 EVT SplitVT = LoX.getValueType();
47691 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
47692 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
47693 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
47695 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
47698 // Try to widen AND, OR and XOR nodes to VT in order to remove casts around
47699 // logical operations, like in the example below.
47700 // or (and (truncate x), (truncate y)),
47701 // (xor (truncate z), build_vector (constants))
47702 // Given a target type \p VT, we generate
47703 // or (and x, y), (xor z, zext(build_vector (constants)))
47704 // where x, y and z are of type \p VT. We can do so if each operand is either
47705 // a truncate from VT or can itself be recursively promoted; the second
47706 // operand may also be a vector of constants.
47707 static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
47708 unsigned Depth) {
47709 // Limit recursion to avoid excessive compile times.
47710 if (Depth >= SelectionDAG::MaxRecursionDepth)
47711 return SDValue();
47713 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
47714 N->getOpcode() != ISD::OR)
47715 return SDValue();
47717 SDValue N0 = N->getOperand(0);
47718 SDValue N1 = N->getOperand(1);
47719 SDLoc DL(N);
47721 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47722 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
47723 return SDValue();
47725 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
47726 N0 = NN0;
47727 else {
47728 // The Left side has to be a trunc.
47729 if (N0.getOpcode() != ISD::TRUNCATE)
47730 return SDValue();
47732 // The type of the truncated inputs.
47733 if (N0.getOperand(0).getValueType() != VT)
47734 return SDValue();
47736 N0 = N0.getOperand(0);
47739 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
47740 N1 = NN1;
47741 else {
47742 // The right side has to be a 'trunc' or a constant vector.
47743 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
47744 N1.getOperand(0).getValueType() == VT;
47745 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
47746 return SDValue();
47748 if (RHSTrunc)
47749 N1 = N1.getOperand(0);
47750 else
47751 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
47754 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
47757 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
47758 // register. In most cases we actually compare or select YMM-sized registers
47759 // and mixing the two types creates horrible code. This method optimizes
47760 // some of the transition sequences.
47761 // Even with AVX-512 this is still useful for removing casts around logical
47762 // operations on vXi1 mask types.
47763 static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
47764 const X86Subtarget &Subtarget) {
47765 EVT VT = N->getValueType(0);
47766 assert(VT.isVector() && "Expected vector type");
47768 SDLoc DL(N);
47769 assert((N->getOpcode() == ISD::ANY_EXTEND ||
47770 N->getOpcode() == ISD::ZERO_EXTEND ||
47771 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
47773 SDValue Narrow = N->getOperand(0);
47774 EVT NarrowVT = Narrow.getValueType();
47776 // Generate the wide operation.
47777 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
47778 if (!Op)
47779 return SDValue();
47780 switch (N->getOpcode()) {
47781 default: llvm_unreachable("Unexpected opcode");
47782 case ISD::ANY_EXTEND:
47783 return Op;
47784 case ISD::ZERO_EXTEND:
47785 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
47786 case ISD::SIGN_EXTEND:
47787 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
47788 Op, DAG.getValueType(NarrowVT));
47792 static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
47793 unsigned FPOpcode;
47794 switch (Opcode) {
47795 default: llvm_unreachable("Unexpected input node for FP logic conversion");
47796 case ISD::AND: FPOpcode = X86ISD::FAND; break;
47797 case ISD::OR: FPOpcode = X86ISD::FOR; break;
47798 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
47800 return FPOpcode;
47803 /// If both input operands of a logic op are being cast from floating-point
47804 /// types or FP compares, try to convert this into a floating-point logic node
47805 /// to avoid unnecessary moves from SSE to integer registers.
47806 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
47807 TargetLowering::DAGCombinerInfo &DCI,
47808 const X86Subtarget &Subtarget) {
47809 EVT VT = N->getValueType(0);
47810 SDValue N0 = N->getOperand(0);
47811 SDValue N1 = N->getOperand(1);
47812 SDLoc DL(N);
47814 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
47815 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
47816 return SDValue();
47818 SDValue N00 = N0.getOperand(0);
47819 SDValue N10 = N1.getOperand(0);
47820 EVT N00Type = N00.getValueType();
47821 EVT N10Type = N10.getValueType();
47823 // Ensure that both types are the same and are legal scalar fp types.
47824 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
47825 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
47826 (Subtarget.hasFP16() && N00Type == MVT::f16)))
47827 return SDValue();
47829 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
47830 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
47831 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
47832 return DAG.getBitcast(VT, FPLogic);
47835 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
47836 !N1.hasOneUse())
47837 return SDValue();
47839 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
47840 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
47842 // The vector ISA for FP predicates is incomplete before AVX, so converting
47843 // COMIS* to CMPS* may not be a win before AVX.
47844 if (!Subtarget.hasAVX() &&
47845 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
47846 return SDValue();
47848 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
47849 // and vector logic:
47850 // logic (setcc N00, N01), (setcc N10, N11) -->
47851 // extelt (logic (setcc (s2v N00), (s2v N01)), (setcc (s2v N10), (s2v N11))), 0
47852 unsigned NumElts = 128 / N00Type.getSizeInBits();
47853 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
47854 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47855 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
47856 SDValue N01 = N0.getOperand(1);
47857 SDValue N11 = N1.getOperand(1);
47858 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
47859 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
47860 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
47861 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
47862 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
47863 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
47864 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
47865 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
47868 // Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
47869 // to reduce XMM->GPR traffic.
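// e.g. (or (movmsk X), (movmsk Y)) -> (movmsk (or X, Y)): one MOVMSK instead
// of two, with the bitwise op staying in the vector domain.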
47870 static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
47871 unsigned Opc = N->getOpcode();
47872 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
47873 "Unexpected bit opcode");
47875 SDValue N0 = N->getOperand(0);
47876 SDValue N1 = N->getOperand(1);
47878 // Both operands must be single use MOVMSK.
47879 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
47880 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
47881 return SDValue();
47883 SDValue Vec0 = N0.getOperand(0);
47884 SDValue Vec1 = N1.getOperand(0);
47885 EVT VecVT0 = Vec0.getValueType();
47886 EVT VecVT1 = Vec1.getValueType();
47888 // Both MOVMSK operands must be from vectors of the same size and same element
47889 // size, but it's OK for them to differ between fp and int.
47890 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
47891 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
47892 return SDValue();
47894 SDLoc DL(N);
47895 unsigned VecOpc =
47896 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
47897 SDValue Result =
47898 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
47899 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47902 // Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
47903 // NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
47904 // handles in InstCombine.
47905 static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
47906 unsigned Opc = N->getOpcode();
47907 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
47908 "Unexpected bit opcode");
47910 SDValue N0 = N->getOperand(0);
47911 SDValue N1 = N->getOperand(1);
47912 EVT VT = N->getValueType(0);
47914 // Both operands must be single use.
47915 if (!N0.hasOneUse() || !N1.hasOneUse())
47916 return SDValue();
47918 // Search for matching shifts.
47919 SDValue BC0 = peekThroughOneUseBitcasts(N0);
47920 SDValue BC1 = peekThroughOneUseBitcasts(N1);
47922 unsigned BCOpc = BC0.getOpcode();
47923 EVT BCVT = BC0.getValueType();
47924 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
47925 return SDValue();
47927 switch (BCOpc) {
47928 case X86ISD::VSHLI:
47929 case X86ISD::VSRLI:
47930 case X86ISD::VSRAI: {
47931 if (BC0.getOperand(1) != BC1.getOperand(1))
47932 return SDValue();
47934 SDLoc DL(N);
47935 SDValue BitOp =
47936 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
47937 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
47938 return DAG.getBitcast(VT, Shift);
47942 return SDValue();
47945 // Attempt to fold:
47946 // BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
47947 // TODO: Handle PACKUS.
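// This is only attempted when every source element is known to be all-sign-bits
// (see the ComputeNumSignBits checks below): PACKSS then acts as a plain
// truncation, so the bitwise op commutes with the pack.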
47948 static SDValue combineBitOpWithPACK(SDNode *N, SelectionDAG &DAG) {
47949 unsigned Opc = N->getOpcode();
47950 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
47951 "Unexpected bit opcode");
47953 SDValue N0 = N->getOperand(0);
47954 SDValue N1 = N->getOperand(1);
47955 EVT VT = N->getValueType(0);
47957 // Both operands must be single use.
47958 if (!N0.hasOneUse() || !N1.hasOneUse())
47959 return SDValue();
47961 // Search for matching packs.
47962 N0 = peekThroughOneUseBitcasts(N0);
47963 N1 = peekThroughOneUseBitcasts(N1);
47965 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
47966 return SDValue();
47968 MVT DstVT = N0.getSimpleValueType();
47969 if (DstVT != N1.getSimpleValueType())
47970 return SDValue();
47972 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
47973 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
47975 // Limit to allsignbits packing.
47976 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
47977 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
47978 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
47979 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
47980 return SDValue();
47982 SDLoc DL(N);
47983 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
47984 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
47985 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
47988 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
47989 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
47990 /// with a shift-right to eliminate loading the vector constant mask value.
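/// e.g. (and (sext (setcc ...)), 1) becomes (srl (sext (setcc ...)), EltBits-1):
/// the sign bit is moved down instead of loading a <1,1,...,1> mask constant.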
47991 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
47992 const X86Subtarget &Subtarget) {
47993 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
47994 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
47995 EVT VT = Op0.getValueType();
47996 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
47997 return SDValue();
47999 // Try to convert an "is positive" signbit masking operation into arithmetic
48000 // shift and "andn". This saves a materialization of a -1 vector constant.
48001 // The "is negative" variant should be handled more generally because it only
48002 // requires "and" rather than "andn":
48003 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
48005 // This is limited to the original type to avoid producing even more bitcasts.
48006 // If the bitcasts can't be eliminated, then it is unlikely that this fold
48007 // will be profitable.
48008 if (N->getValueType(0) == VT &&
48009 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
48010 SDValue X, Y;
48011 if (Op1.getOpcode() == X86ISD::PCMPGT &&
48012 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
48013 X = Op1.getOperand(0);
48014 Y = Op0;
48015 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
48016 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
48017 X = Op0.getOperand(0);
48018 Y = Op1;
48020 if (X && Y) {
48021 SDLoc DL(N);
48022 SDValue Sra =
48023 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
48024 VT.getScalarSizeInBits() - 1, DAG);
48025 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
48029 APInt SplatVal;
48030 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
48031 return SDValue();
48033 // Don't prevent creation of ANDN.
48034 if (isBitwiseNot(Op0))
48035 return SDValue();
48037 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
48038 return SDValue();
48040 unsigned EltBitWidth = VT.getScalarSizeInBits();
48041 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
48042 return SDValue();
48044 SDLoc DL(N);
48045 unsigned ShiftVal = SplatVal.countr_one();
48046 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
48047 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
48048 return DAG.getBitcast(N->getValueType(0), Shift);
48051 // Get the index node from the lowered DAG of a GEP IR instruction with one
48052 // indexing dimension.
48053 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
48054 if (Ld->isIndexed())
48055 return SDValue();
48057 SDValue Base = Ld->getBasePtr();
48059 if (Base.getOpcode() != ISD::ADD)
48060 return SDValue();
48062 SDValue ShiftedIndex = Base.getOperand(0);
48064 if (ShiftedIndex.getOpcode() != ISD::SHL)
48065 return SDValue();
48067 return ShiftedIndex.getOperand(0);
48071 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
48072 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
48073 switch (VT.getSizeInBits()) {
48074 default: return false;
48075 case 64: return Subtarget.is64Bit();
48076 case 32: return true;
48079 return false;
48082 // This function recognizes cases where the X86 BZHI instruction can replace
48083 // an 'and-load' sequence.
48084 // In case of loading integer value from an array of constants which is defined
48085 // as follows:
48087 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
48089 // then applying a bitwise AND between the loaded value and another input
48090 // is equivalent to performing BZHI (zero high bits) on that other input,
48091 // using the same index as the load.
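// i.e. x & array[idx] == x & ((1 << idx) - 1) == bzhi(x, idx).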
48092 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
48093 const X86Subtarget &Subtarget) {
48094 MVT VT = Node->getSimpleValueType(0);
48095 SDLoc dl(Node);
48097 // Check if subtarget has BZHI instruction for the node's type
48098 if (!hasBZHI(Subtarget, VT))
48099 return SDValue();
48101 // Try matching the pattern for both operands.
48102 for (unsigned i = 0; i < 2; i++) {
48103 SDValue N = Node->getOperand(i);
48104 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
48106 // Bail out if the operand is not a load instruction.
48107 if (!Ld)
48108 return SDValue();
48110 const Value *MemOp = Ld->getMemOperand()->getValue();
48112 if (!MemOp)
48113 return SDValue();
48115 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
48116 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
48117 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
48119 Constant *Init = GV->getInitializer();
48120 Type *Ty = Init->getType();
48121 if (!isa<ConstantDataArray>(Init) ||
48122 !Ty->getArrayElementType()->isIntegerTy() ||
48123 Ty->getArrayElementType()->getScalarSizeInBits() !=
48124 VT.getSizeInBits() ||
48125 Ty->getArrayNumElements() >
48126 Ty->getArrayElementType()->getScalarSizeInBits())
48127 continue;
48129 // Check if the array's constant elements are suitable to our case.
48130 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
48131 bool ConstantsMatch = true;
48132 for (uint64_t j = 0; j < ArrayElementCount; j++) {
48133 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
48134 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
48135 ConstantsMatch = false;
48136 break;
48139 if (!ConstantsMatch)
48140 continue;
48142 // Do the transformation (for a 32-bit type):
48143 // (and (load arr[idx]), inp)
48144 // -> (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
48145 // which will then be selected as a single BZHI instruction.
48146 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
48147 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
48149 // Get the Node which indexes into the array.
48150 SDValue Index = getIndexFromUnindexedLoad(Ld);
48151 if (!Index)
48152 return SDValue();
48153 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
48155 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
48156 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
48158 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
48159 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
48161 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
48166 return SDValue();
48169 // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef, ...))), C)
48170 // where C is a mask containing the same number of bits as the setcc and
48171 // where the setcc freely zeroes the upper bits of the k-register. We can
48172 // replace the undefs in the concat with 0s and remove the AND. This mainly
48173 // helps with v2i1/v4i1 setccs being cast to scalar.
48174 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
48175 const X86Subtarget &Subtarget) {
48176 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
48178 EVT VT = N->getValueType(0);
48180 // Make sure this is an AND with constant. We will check the value of the
48181 // constant later.
48182 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
48183 if (!C1)
48184 return SDValue();
48186 // This is implied by the ConstantSDNode.
48187 assert(!VT.isVector() && "Expected scalar VT!");
48189 SDValue Src = N->getOperand(0);
48190 if (!Src.hasOneUse())
48191 return SDValue();
48193 // (Optionally) peek through any_extend().
48194 if (Src.getOpcode() == ISD::ANY_EXTEND) {
48195 if (!Src.getOperand(0).hasOneUse())
48196 return SDValue();
48197 Src = Src.getOperand(0);
48200 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
48201 return SDValue();
48203 Src = Src.getOperand(0);
48204 EVT SrcVT = Src.getValueType();
48206 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48207 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
48208 !TLI.isTypeLegal(SrcVT))
48209 return SDValue();
48211 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
48212 return SDValue();
48214 // We only care about the first subvector of the concat; we expect the
48215 // other subvectors to be masked away by the AND if we make the change.
48216 SDValue SubVec = Src.getOperand(0);
48217 EVT SubVecVT = SubVec.getValueType();
48219 // The RHS of the AND should be a mask with as many bits as SubVec.
48220 if (!TLI.isTypeLegal(SubVecVT) ||
48221 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
48222 return SDValue();
48224 // The first subvector should be a setcc with a legal result type or an
48225 // AND containing at least one setcc with a legal result type.
48226 auto IsLegalSetCC = [&](SDValue V) {
48227 if (V.getOpcode() != ISD::SETCC)
48228 return false;
48229 EVT SetccVT = V.getOperand(0).getValueType();
48230 if (!TLI.isTypeLegal(SetccVT) ||
48231 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
48232 return false;
48233 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
48234 return false;
48235 return true;
48237 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
48238 (IsLegalSetCC(SubVec.getOperand(0)) ||
48239 IsLegalSetCC(SubVec.getOperand(1))))))
48240 return SDValue();
48242 // We passed all the checks. Rebuild the concat_vectors with zeroes
48243 // and cast it back to VT.
48244 SDLoc dl(N);
48245 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
48246 DAG.getConstant(0, dl, SubVecVT));
48247 Ops[0] = SubVec;
48248 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
48249 Ops);
48250 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
48251 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
48254 static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
48255 SDValue OpMustEq, SDValue Op, unsigned Depth) {
48256 // We don't want to go crazy with the recursion here. This isn't a super
48257 // important optimization.
48258 static constexpr unsigned kMaxDepth = 2;
48260 // Only do this re-ordering if op has one use.
48261 if (!Op.hasOneUse())
48262 return SDValue();
48264 SDLoc DL(Op);
48265 // If we hit another associative op, recurse further.
48266 if (Op.getOpcode() == Opc) {
48267 // Done recursing.
48268 if (Depth++ >= kMaxDepth)
48269 return SDValue();
48271 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
48272 if (SDValue R =
48273 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
48274 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
48275 Op.getOperand(1 - OpIdx));
48277 } else if (Op.getOpcode() == ISD::SUB) {
48278 if (Opc == ISD::AND) {
48279 // BLSI: (and x, (sub 0, x))
48280 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
48281 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
48283 // Opc must be ISD::AND or ISD::XOR
48284 // BLSR: (and x, (sub x, 1))
48285 // BLSMSK: (xor x, (sub x, 1))
48286 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
48287 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
48289 } else if (Op.getOpcode() == ISD::ADD) {
48290 // Opc must be ISD::AND or ISD::XOR
48291 // BLSR: (and x, (add x, -1))
48292 // BLSMSK: (xor x, (add x, -1))
48293 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
48294 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
48296 return SDValue();
48299 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
48300 const X86Subtarget &Subtarget) {
48301 EVT VT = N->getValueType(0);
48302 // Make sure this node is a candidate for BMI instructions.
48303 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
48304 (VT != MVT::i32 && VT != MVT::i64))
48305 return SDValue();
48307 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
48309 // Try and match LHS and RHS.
48310 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
48311 if (SDValue OpMatch =
48312 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
48313 N->getOperand(1 - OpIdx), 0))
48314 return OpMatch;
48315 return SDValue();
48318 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
48319 TargetLowering::DAGCombinerInfo &DCI,
48320 const X86Subtarget &Subtarget) {
48321 SDValue N0 = N->getOperand(0);
48322 SDValue N1 = N->getOperand(1);
48323 EVT VT = N->getValueType(0);
48324 SDLoc dl(N);
48325 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48327 // If this is SSE1 only convert to FAND to avoid scalarization.
48328 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
48329 return DAG.getBitcast(MVT::v4i32,
48330 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
48331 DAG.getBitcast(MVT::v4f32, N0),
48332 DAG.getBitcast(MVT::v4f32, N1)));
48335 // Use a 32-bit and+zext if upper bits known zero.
48336 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
48337 APInt HiMask = APInt::getHighBitsSet(64, 32);
48338 if (DAG.MaskedValueIsZero(N1, HiMask) ||
48339 DAG.MaskedValueIsZero(N0, HiMask)) {
48340 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
48341 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
48342 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
48343 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
48347 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
48348 // TODO: Support multiple SrcOps.
48349 if (VT == MVT::i1) {
48350 SmallVector<SDValue, 2> SrcOps;
48351 SmallVector<APInt, 2> SrcPartials;
48352 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
48353 SrcOps.size() == 1) {
48354 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
48355 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
48356 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
48357 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
48358 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
48359 if (Mask) {
48360 assert(SrcPartials[0].getBitWidth() == NumElts &&
48361 "Unexpected partial reduction mask");
48362 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
48363 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
48364 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
48369 // InstCombine converts:
48370 // `(-x << C0) & C1`
48371 // to
48372 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
48373 // This saves an IR instruction, but on x86 the neg/shift version is
48374 // preferable, so undo the transform.
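// e.g. with C0 = 5 and C1 = 0xE0, InstCombine produces (x * 0xE0) & 0xE0; this
// combine turns it back into ((0 - x) << 5) & 0xE0.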
48376 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
48377 // TODO: We don't actually need a splat for this, we just need the checks to
48378 // hold for each element.
48379 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
48380 /*AllowTruncation*/ false);
48381 ConstantSDNode *N01C =
48382 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
48383 /*AllowTruncation*/ false);
48384 if (N1C && N01C) {
48385 const APInt &MulC = N01C->getAPIntValue();
48386 const APInt &AndC = N1C->getAPIntValue();
48387 APInt MulCLowBit = MulC & (-MulC);
48388 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
48389 (MulCLowBit + MulC).isPowerOf2()) {
48390 SDValue Neg = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT),
48391 N0.getOperand(0));
48392 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
48393 assert(MulCLowBitLog != -1 &&
48394 "Isolated lowbit is somehow not a power of 2!");
48395 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
48396 DAG.getConstant(MulCLowBitLog, dl, VT));
48397 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
48402 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
48403 return V;
48405 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
48406 return R;
48408 if (SDValue R = combineBitOpWithShift(N, DAG))
48409 return R;
48411 if (SDValue R = combineBitOpWithPACK(N, DAG))
48412 return R;
48414 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
48415 return FPLogic;
48417 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
48418 return R;
48420 if (DCI.isBeforeLegalizeOps())
48421 return SDValue();
48423 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
48424 return R;
48426 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
48427 return R;
48429 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
48430 return ShiftRight;
48432 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
48433 return R;
48435 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
48436 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
48437 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
48438 if (VT.isVector() && getTargetConstantFromNode(N1)) {
48439 unsigned Opc0 = N0.getOpcode();
48440 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
48441 getTargetConstantFromNode(N0.getOperand(1)) &&
48442 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
48443 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
48444 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
48445 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
48449 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant,
48450 // to avoid a slow variable shift (moving the shift amount to ECX etc.).
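// BT places bit Y of X in the carry flag, so neither the variable shift nor the
// AND mask is needed; COND_B/COND_AE then test that (possibly inverted) bit.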
48451 if (isOneConstant(N1) && N0->hasOneUse()) {
48452 SDValue Src = N0;
48453 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
48454 Src.getOpcode() == ISD::TRUNCATE) &&
48455 Src.getOperand(0)->hasOneUse())
48456 Src = Src.getOperand(0);
48457 bool ContainsNOT = false;
48458 X86::CondCode X86CC = X86::COND_B;
48459 // Peek through AND(NOT(SRL(X,Y)),1).
48460 if (isBitwiseNot(Src)) {
48461 Src = Src.getOperand(0);
48462 X86CC = X86::COND_AE;
48463 ContainsNOT = true;
48465 if (Src.getOpcode() == ISD::SRL &&
48466 !isa<ConstantSDNode>(Src.getOperand(1))) {
48467 SDValue BitNo = Src.getOperand(1);
48468 Src = Src.getOperand(0);
48469 // Peek through AND(SRL(NOT(X),Y),1).
48470 if (isBitwiseNot(Src)) {
48471 Src = Src.getOperand(0);
48472 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
48473 ContainsNOT = true;
48475 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
48476 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
48477 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
48478 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
48482 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
48483 // Attempt to recursively combine a bitmask AND with shuffles.
48484 SDValue Op(N, 0);
48485 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48486 return Res;
48488 // If either operand is a constant mask, then only the elements that aren't
48489 // zero are actually demanded by the other operand.
48490 auto GetDemandedMasks = [&](SDValue Op) {
48491 APInt UndefElts;
48492 SmallVector<APInt> EltBits;
48493 int NumElts = VT.getVectorNumElements();
48494 int EltSizeInBits = VT.getScalarSizeInBits();
48495 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
48496 APInt DemandedElts = APInt::getAllOnes(NumElts);
48497 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
48498 EltBits)) {
48499 DemandedBits.clearAllBits();
48500 DemandedElts.clearAllBits();
48501 for (int I = 0; I != NumElts; ++I) {
48502 if (UndefElts[I]) {
48503 // We can't assume an undef src element gives an undef dst - the
48504 // other src might be zero.
48505 DemandedBits.setAllBits();
48506 DemandedElts.setBit(I);
48507 } else if (!EltBits[I].isZero()) {
48508 DemandedBits |= EltBits[I];
48509 DemandedElts.setBit(I);
48513 return std::make_pair(DemandedBits, DemandedElts);
48515 APInt Bits0, Elts0;
48516 APInt Bits1, Elts1;
48517 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
48518 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
48520 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
48521 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
48522 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
48523 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
48524 if (N->getOpcode() != ISD::DELETED_NODE)
48525 DCI.AddToWorklist(N);
48526 return SDValue(N, 0);
48529 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
48530 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
48531 if (NewN0 || NewN1)
48532 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
48533 NewN1 ? NewN1 : N1);
48536 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
48537 if ((VT.getScalarSizeInBits() % 8) == 0 &&
48538 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
48539 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
48540 SDValue BitMask = N1;
48541 SDValue SrcVec = N0.getOperand(0);
48542 EVT SrcVecVT = SrcVec.getValueType();
48544 // Check that the constant bitmask masks whole bytes.
48545 APInt UndefElts;
48546 SmallVector<APInt, 64> EltBits;
48547 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
48548 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
48549 llvm::all_of(EltBits, [](const APInt &M) {
48550 return M.isZero() || M.isAllOnes();
48551 })) {
48552 unsigned NumElts = SrcVecVT.getVectorNumElements();
48553 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
48554 unsigned Idx = N0.getConstantOperandVal(1);
48556 // Create a root shuffle mask from the byte mask and the extracted index.
48557 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
48558 for (unsigned i = 0; i != Scale; ++i) {
48559 if (UndefElts[i])
48560 continue;
48561 int VecIdx = Scale * Idx + i;
48562 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
48565 if (SDValue Shuffle = combineX86ShufflesRecursively(
48566 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
48567 X86::MaxShuffleCombineDepth,
48568 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
48569 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
48570 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
48571 N0.getOperand(1));
48575 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
48576 return R;
48578 return SDValue();
48581 // Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
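// Reusing C through ANDNP's built-in inversion means only one mask constant
// needs to be materialized, instead of both C and ~C.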
48582 static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
48583 const X86Subtarget &Subtarget) {
48584 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
48586 MVT VT = N->getSimpleValueType(0);
48587 unsigned EltSizeInBits = VT.getScalarSizeInBits();
48588 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
48589 return SDValue();
48591 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
48592 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
48593 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
48594 return SDValue();
48596 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
48597 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
48598 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
48599 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
48600 return SDValue();
48602 // Attempt to extract constant byte masks.
48603 APInt UndefElts0, UndefElts1;
48604 SmallVector<APInt, 32> EltBits0, EltBits1;
48605 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
48606 false, false))
48607 return SDValue();
48608 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
48609 false, false))
48610 return SDValue();
48612 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
48613 // TODO - add UNDEF elts support.
48614 if (UndefElts0[i] || UndefElts1[i])
48615 return SDValue();
48616 if (EltBits0[i] != ~EltBits1[i])
48617 return SDValue();
48620 SDLoc DL(N);
48622 if (useVPTERNLOG(Subtarget, VT)) {
48623 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
48624 // VPTERNLOG is only available for vXi32/vXi64 types.
48625 MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
48626 MVT OpVT =
48627 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
48628 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
48629 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
48630 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
48631 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
48632 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
48633 DAG, Subtarget);
48634 return DAG.getBitcast(VT, Res);
48637 SDValue X = N->getOperand(0);
48638 SDValue Y =
48639 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
48640 DAG.getBitcast(VT, N1.getOperand(0)));
48641 return DAG.getNode(ISD::OR, DL, VT, X, Y);
48644 // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
48645 static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
48646 if (N->getOpcode() != ISD::OR)
48647 return false;
48649 SDValue N0 = N->getOperand(0);
48650 SDValue N1 = N->getOperand(1);
48652 // Canonicalize AND to LHS.
48653 if (N1.getOpcode() == ISD::AND)
48654 std::swap(N0, N1);
48656 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
48657 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
48658 return false;
48660 Mask = N1.getOperand(0);
48661 X = N1.getOperand(1);
48663 // Check to see if the mask appeared in both the AND and ANDNP.
48664 if (N0.getOperand(0) == Mask)
48665 Y = N0.getOperand(1);
48666 else if (N0.getOperand(1) == Mask)
48667 Y = N0.getOperand(0);
48668 else
48669 return false;
48671 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
48672 // ANDNP combine allows other combines to happen that prevent matching.
48673 return true;
48676 // Try to fold:
48677 // (or (and (m, y), (pandn m, x)))
48678 // into:
48679 // (vselect m, x, y)
48680 // As a special case, try to fold:
48681 // (or (and (m, (sub 0, x)), (pandn m, x)))
48682 // into:
48683 // (sub (xor X, M), M)
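/// The special case is a conditional negate: where M is all-ones, (xor X, M)
/// is ~X and subtracting M (== -1) gives ~X + 1 == -X; where M is zero the
/// result is just X.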
48684 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
48685 const X86Subtarget &Subtarget) {
48686 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
48688 EVT VT = N->getValueType(0);
48689 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
48690 (VT.is256BitVector() && Subtarget.hasInt256())))
48691 return SDValue();
48693 SDValue X, Y, Mask;
48694 if (!matchLogicBlend(N, X, Y, Mask))
48695 return SDValue();
48697 // Validate that X, Y, and Mask are bitcasts, and see through them.
48698 Mask = peekThroughBitcasts(Mask);
48699 X = peekThroughBitcasts(X);
48700 Y = peekThroughBitcasts(Y);
48702 EVT MaskVT = Mask.getValueType();
48703 unsigned EltBits = MaskVT.getScalarSizeInBits();
48705 // TODO: Attempt to handle floating point cases as well?
48706 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
48707 return SDValue();
48709 SDLoc DL(N);
48711 // Attempt to combine to conditional negate: (sub (xor X, M), M)
48712 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
48713 DAG, Subtarget))
48714 return Res;
48716 // PBLENDVB is only available on SSE 4.1.
48717 if (!Subtarget.hasSSE41())
48718 return SDValue();
48720 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
48721 if (Subtarget.hasVLX())
48722 return SDValue();
48724 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
48726 X = DAG.getBitcast(BlendVT, X);
48727 Y = DAG.getBitcast(BlendVT, Y);
48728 Mask = DAG.getBitcast(BlendVT, Mask);
48729 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
48730 return DAG.getBitcast(VT, Mask);
48733 // Helper function for combineOrCmpEqZeroToCtlzSrl
48734 // Transforms:
48735 // seteq(cmp x, 0)
48736 // into:
48737 // srl(ctlz x), log2(bitsize(x))
48738 // Input pattern is checked by caller.
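// With LZCNT semantics ctlz(0) == bitsize(x), so shifting right by
// log2(bitsize(x)) yields 1 exactly when x == 0 and 0 otherwise.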
48739 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
48740 SDValue Cmp = Op.getOperand(1);
48741 EVT VT = Cmp.getOperand(0).getValueType();
48742 unsigned Log2b = Log2_32(VT.getSizeInBits());
48743 SDLoc dl(Op);
48744 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
48745 // The result of the shift is true or false, and on X86, the 32-bit
48746 // encoding of shr and lzcnt is more desirable.
48747 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
48748 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
48749 DAG.getConstant(Log2b, dl, MVT::i8));
48750 return Scc;
48753 // Try to transform:
48754 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
48755 // into:
48756 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
48757 // Will also attempt to match more generic cases, eg:
48758 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
48759 // Only applies if the target supports the FastLZCNT feature.
48760 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
48761 TargetLowering::DAGCombinerInfo &DCI,
48762 const X86Subtarget &Subtarget) {
48763 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
48764 return SDValue();
48766 auto isORCandidate = [](SDValue N) {
48767 return (N->getOpcode() == ISD::OR && N->hasOneUse());
48770 // Check that the zero extend is to 32 bits or more. The code generated by
48771 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
48772 // instructions to clear the upper bits.
48773 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
48774 !isORCandidate(N->getOperand(0)))
48775 return SDValue();
48777 // Check the node matches: setcc(eq, cmp 0)
48778 auto isSetCCCandidate = [](SDValue N) {
48779 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
48780 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
48781 N->getOperand(1).getOpcode() == X86ISD::CMP &&
48782 isNullConstant(N->getOperand(1).getOperand(1)) &&
48783 N->getOperand(1).getValueType().bitsGE(MVT::i32);
48786 SDNode *OR = N->getOperand(0).getNode();
48787 SDValue LHS = OR->getOperand(0);
48788 SDValue RHS = OR->getOperand(1);
48790 // Save nodes matching or(or, setcc(eq, cmp 0)).
48791 SmallVector<SDNode *, 2> ORNodes;
48792 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
48793 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
48794 ORNodes.push_back(OR);
48795 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
48796 LHS = OR->getOperand(0);
48797 RHS = OR->getOperand(1);
48800 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
48801 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
48802 !isORCandidate(SDValue(OR, 0)))
48803 return SDValue();
48805 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
48806 // to
48807 // or(srl(ctlz),srl(ctlz)).
48808 // The dag combiner can then fold it into:
48809 // srl(or(ctlz, ctlz)).
48810 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
48811 SDValue Ret, NewRHS;
48812 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
48813 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
48815 if (!Ret)
48816 return SDValue();
48818 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
48819 while (!ORNodes.empty()) {
48820 OR = ORNodes.pop_back_val();
48821 LHS = OR->getOperand(0);
48822 RHS = OR->getOperand(1);
48823 // Swap rhs with lhs to match or(setcc(eq, cmp 0), or).
48824 if (RHS->getOpcode() == ISD::OR)
48825 std::swap(LHS, RHS);
48826 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
48827 if (!NewRHS)
48828 return SDValue();
48829 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
48832 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
48835 static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
48836 SDValue And1_L, SDValue And1_R,
48837 const SDLoc &DL, SelectionDAG &DAG) {
48838 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
48839 return SDValue();
48840 SDValue NotOp = And0_L->getOperand(0);
48841 if (NotOp == And1_R)
48842 std::swap(And1_R, And1_L);
48843 if (NotOp != And1_L)
48844 return SDValue();
48846 // (~(NotOp) & And0_R) | (NotOp & And1_R)
48847 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
48848 EVT VT = And1_L->getValueType(0);
48849 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
48850 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
48851 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
48852 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
48853 return Xor1;
48856 /// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
48857 /// equivalent `((x ^ y) & m) ^ y` pattern.
48858 /// This is typically a better representation for targets without a fused
48859 /// "and-not" operation. This function is intended to be called from a
48860 /// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
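/// Worked example (illustrative, i8 values): with m = 0x0F, x = 0xAA, y = 0x55:
///   (m & x) | (~m & y)  =  0x0A | 0x50           =  0x5A
///   ((x ^ y) & m) ^ y   =  (0xFF & 0x0F) ^ 0x55  =  0x5A
/// The rewritten form needs no NOT, which is why it is preferable when a fused
/// and-not instruction (e.g. ANDN) is unavailable.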
48861 static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
48862 // Note that masked-merge variants using XOR or ADD expressions are
48863 // normalized to OR by InstCombine so we only check for OR.
48864 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
48865 SDValue N0 = Node->getOperand(0);
48866 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
48867 return SDValue();
48868 SDValue N1 = Node->getOperand(1);
48869 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
48870 return SDValue();
48872 SDLoc DL(Node);
48873 SDValue N00 = N0->getOperand(0);
48874 SDValue N01 = N0->getOperand(1);
48875 SDValue N10 = N1->getOperand(0);
48876 SDValue N11 = N1->getOperand(1);
48877 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
48878 return Result;
48879 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
48880 return Result;
48881 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
48882 return Result;
48883 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
48884 return Result;
48885 return SDValue();
48888 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
48889 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
48890 /// with CMP+{ADC, SBB}.
48891 /// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
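/// For instance (illustrative), "x + (a < b)" with unsigned a and b can lower
/// to:
///   cmp a, b      ; sets CF when a < b
///   adc x, 0      ; folds the carry into x
/// instead of materializing the setcc result in a register first.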
48892 static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
48893 SDValue X, SDValue Y,
48894 SelectionDAG &DAG,
48895 bool ZeroSecondOpOnly = false) {
48896 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
48897 return SDValue();
48899 // Look through a one-use zext.
48900 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
48901 Y = Y.getOperand(0);
48903 X86::CondCode CC;
48904 SDValue EFLAGS;
48905 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
48906 CC = (X86::CondCode)Y.getConstantOperandVal(0);
48907 EFLAGS = Y.getOperand(1);
48908 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
48909 Y.hasOneUse()) {
48910 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
48913 if (!EFLAGS)
48914 return SDValue();
48916 // If X is -1 or 0, then we have an opportunity to avoid constants required in
48917 // the general case below.
48918 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
48919 if (ConstantX && !ZeroSecondOpOnly) {
48920 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
48921 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
48922 // This is a complicated way to get -1 or 0 from the carry flag:
48923 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
48924 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
48925 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
48926 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
48927 EFLAGS);
48930 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
48931 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
48932 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
48933 EFLAGS.getValueType().isInteger() &&
48934 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
48935 // Swap the operands of a SUB, and we have the same pattern as above.
48936 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
48937 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
48938 SDValue NewSub = DAG.getNode(
48939 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
48940 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
48941 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
48942 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
48943 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
48944 NewEFLAGS);
48949 if (CC == X86::COND_B) {
48950 // X + SETB Z --> adc X, 0
48951 // X - SETB Z --> sbb X, 0
48952 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
48953 DAG.getVTList(VT, MVT::i32), X,
48954 DAG.getConstant(0, DL, VT), EFLAGS);
48957 if (ZeroSecondOpOnly)
48958 return SDValue();
48960 if (CC == X86::COND_A) {
48961 // Try to convert COND_A into COND_B in an attempt to facilitate
48962 // materializing "setb reg".
48964 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
48965 // cannot take an immediate as its first operand.
48967 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
48968 EFLAGS.getValueType().isInteger() &&
48969 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
48970 SDValue NewSub =
48971 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
48972 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
48973 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
48974 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
48975 DAG.getVTList(VT, MVT::i32), X,
48976 DAG.getConstant(0, DL, VT), NewEFLAGS);
48980 if (CC == X86::COND_AE) {
48981 // X + SETAE --> sbb X, -1
48982 // X - SETAE --> adc X, -1
48983 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
48984 DAG.getVTList(VT, MVT::i32), X,
48985 DAG.getConstant(-1, DL, VT), EFLAGS);
48988 if (CC == X86::COND_BE) {
48989 // X + SETBE --> sbb X, -1
48990 // X - SETBE --> adc X, -1
48991 // Try to convert COND_BE into COND_AE in an attempt to facilitate
48992 // materializing "setae reg".
48994 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
48995 // cannot take an immediate as its first operand.
48997 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
48998 EFLAGS.getValueType().isInteger() &&
48999 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49000 SDValue NewSub =
49001 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49002 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49003 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49004 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49005 DAG.getVTList(VT, MVT::i32), X,
49006 DAG.getConstant(-1, DL, VT), NewEFLAGS);
49010 if (CC != X86::COND_E && CC != X86::COND_NE)
49011 return SDValue();
49013 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
49014 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
49015 !EFLAGS.getOperand(0).getValueType().isInteger())
49016 return SDValue();
49018 SDValue Z = EFLAGS.getOperand(0);
49019 EVT ZVT = Z.getValueType();
49021 // If X is -1 or 0, then we have an opportunity to avoid constants required in
49022 // the general case below.
49023 if (ConstantX) {
49024 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
49025 // fake operands:
49026 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
49027 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
49028 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
49029 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
49030 SDValue Zero = DAG.getConstant(0, DL, ZVT);
49031 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49032 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
49033 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49034 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49035 SDValue(Neg.getNode(), 1));
49038 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
49039 // with fake operands:
49040 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
49041 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
49042 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
49043 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
49044 SDValue One = DAG.getConstant(1, DL, ZVT);
49045 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49046 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49047 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49048 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49049 Cmp1.getValue(1));
49053 // (cmp Z, 1) sets the carry flag if Z is 0.
49054 SDValue One = DAG.getConstant(1, DL, ZVT);
49055 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49056 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49058 // Add the flags type for ADC/SBB nodes.
49059 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49061 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
49062 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
49063 if (CC == X86::COND_NE)
49064 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
49065 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
49067 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
49068 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
49069 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
49070 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
49073 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
49074 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49075 /// with CMP+{ADC, SBB}.
49076 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
49077 bool IsSub = N->getOpcode() == ISD::SUB;
49078 SDValue X = N->getOperand(0);
49079 SDValue Y = N->getOperand(1);
49080 EVT VT = N->getValueType(0);
49081 SDLoc DL(N);
49083 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
49084 return ADCOrSBB;
49086 // Commute and try again (negate the result for subtracts).
49087 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
49088 if (IsSub)
49089 ADCOrSBB =
49090 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
49091 return ADCOrSBB;
49094 return SDValue();
49097 static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
49098 SelectionDAG &DAG) {
49099 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
49100 "Unexpected opcode");
49102 // Delegate to combineAddOrSubToADCOrSBB if we have:
49104 // (xor/or (zero_extend (setcc)) imm)
49106 // where imm is odd if and only if we have xor, in which case the XOR/OR are
49107 // equivalent to a SUB/ADD, respectively.
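// Illustrative reasoning: let b = zero_extend(setcc), so b is 0 or 1. For an
// odd imm, (b xor imm) == imm - b, and for an even imm, (b or imm) == imm + b
// (bit 0 of imm is clear), so the node is really a SUB/ADD of a setcc and can
// reuse the ADC/SBB lowering above.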
49108 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
49109 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
49110 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
49111 bool IsSub = N->getOpcode() == ISD::XOR;
49112 bool N1COdd = N1C->getZExtValue() & 1;
49113 if (IsSub ? N1COdd : !N1COdd) {
49114 SDLoc DL(N);
49115 EVT VT = N->getValueType(0);
49116 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
49117 return R;
49122 return SDValue();
49125 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
49126 TargetLowering::DAGCombinerInfo &DCI,
49127 const X86Subtarget &Subtarget) {
49128 SDValue N0 = N->getOperand(0);
49129 SDValue N1 = N->getOperand(1);
49130 EVT VT = N->getValueType(0);
49131 SDLoc dl(N);
49132 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49134 // If this is SSE1 only convert to FOR to avoid scalarization.
49135 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49136 return DAG.getBitcast(MVT::v4i32,
49137 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
49138 DAG.getBitcast(MVT::v4f32, N0),
49139 DAG.getBitcast(MVT::v4f32, N1)));
49142 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
49143 // TODO: Support multiple SrcOps.
49144 if (VT == MVT::i1) {
49145 SmallVector<SDValue, 2> SrcOps;
49146 SmallVector<APInt, 2> SrcPartials;
49147 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
49148 SrcOps.size() == 1) {
49149 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49150 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49151 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49152 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49153 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49154 if (Mask) {
49155 assert(SrcPartials[0].getBitWidth() == NumElts &&
49156 "Unexpected partial reduction mask");
49157 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
49158 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49159 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49160 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
49165 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49166 return R;
49168 if (SDValue R = combineBitOpWithShift(N, DAG))
49169 return R;
49171 if (SDValue R = combineBitOpWithPACK(N, DAG))
49172 return R;
49174 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49175 return FPLogic;
49177 if (DCI.isBeforeLegalizeOps())
49178 return SDValue();
49180 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49181 return R;
49183 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
49184 return R;
49186 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
49187 return R;
49189 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
49190 if ((VT == MVT::i32 || VT == MVT::i64) &&
49191 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
49192 isNullConstant(N0.getOperand(0))) {
49193 SDValue Cond = N0.getOperand(1);
49194 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
49195 Cond = Cond.getOperand(0);
49197 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
49198 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
49199 uint64_t Val = CN->getZExtValue();
49200 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
49201 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
49202 CCode = X86::GetOppositeBranchCondition(CCode);
49203 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
49205 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
49206 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
49207 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
49208 return R;
49214 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
49215 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
49216 // iff the upper elements of the non-shifted arg are zero.
49217 // KUNPCK requires 16+ bool vector elements.
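// For example (illustrative): with v32i1 operands, OR(X, KSHIFTL(Y, 16)) where
// the upper 16 lanes of X are known zero is just the concatenation of the low
// halves of X and Y, i.e. roughly a single KUNPCKWD mask concatenation.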
49218 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
49219 unsigned NumElts = VT.getVectorNumElements();
49220 unsigned HalfElts = NumElts / 2;
49221 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
49222 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
49223 N1.getConstantOperandAPInt(1) == HalfElts &&
49224 DAG.MaskedVectorIsZero(N0, UpperElts)) {
49225 return DAG.getNode(
49226 ISD::CONCAT_VECTORS, dl, VT,
49227 extractSubVector(N0, 0, DAG, dl, HalfElts),
49228 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
49230 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
49231 N0.getConstantOperandAPInt(1) == HalfElts &&
49232 DAG.MaskedVectorIsZero(N1, UpperElts)) {
49233 return DAG.getNode(
49234 ISD::CONCAT_VECTORS, dl, VT,
49235 extractSubVector(N1, 0, DAG, dl, HalfElts),
49236 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
49240 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
49241 // Attempt to recursively combine an OR of shuffles.
49242 SDValue Op(N, 0);
49243 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49244 return Res;
49246 // If either operand is a constant mask, then only the elements that aren't
49247 // allones are actually demanded by the other operand.
49248 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
49249 APInt UndefElts;
49250 SmallVector<APInt> EltBits;
49251 int NumElts = VT.getVectorNumElements();
49252 int EltSizeInBits = VT.getScalarSizeInBits();
49253 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
49254 return false;
49256 APInt DemandedElts = APInt::getZero(NumElts);
49257 for (int I = 0; I != NumElts; ++I)
49258 if (!EltBits[I].isAllOnes())
49259 DemandedElts.setBit(I);
49261 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
49263 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
49264 if (N->getOpcode() != ISD::DELETED_NODE)
49265 DCI.AddToWorklist(N);
49266 return SDValue(N, 0);
49270 // We should fold "masked merge" patterns when `andn` is not available.
49271 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
49272 if (SDValue R = foldMaskedMerge(N, DAG))
49273 return R;
49275 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
49276 return R;
49278 return SDValue();
49281 /// Try to turn tests against the signbit in the form of:
49282 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
49283 /// into:
49284 /// SETGT(X, -1)
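/// Rationale (illustrative, i32 source): (X >> 31) is the sign bit, so
/// ((X >> 31) ^ 1) is 1 exactly when X is non-negative, i.e. when X > -1,
/// which is what SETGT(X, -1) computes directly.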
49285 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
49286 // This is only worth doing if the output type is i8 or i1.
49287 EVT ResultType = N->getValueType(0);
49288 if (ResultType != MVT::i8 && ResultType != MVT::i1)
49289 return SDValue();
49291 SDValue N0 = N->getOperand(0);
49292 SDValue N1 = N->getOperand(1);
49294 // We should be performing an xor against a truncated shift.
49295 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
49296 return SDValue();
49298 // Make sure we are performing an xor against one.
49299 if (!isOneConstant(N1))
49300 return SDValue();
49302 // SetCC on x86 zero extends so only act on this if it's a logical shift.
49303 SDValue Shift = N0.getOperand(0);
49304 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
49305 return SDValue();
49307 // Make sure we are truncating from one of i16, i32 or i64.
49308 EVT ShiftTy = Shift.getValueType();
49309 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
49310 return SDValue();
49312 // Make sure the shift amount extracts the sign bit.
49313 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
49314 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
49315 return SDValue();
49317 // Create a greater-than comparison against -1.
49318 // N.B. Using SETGE against 0 works, but we want a canonical-looking
49319 // comparison; using SETGT matches up with what TranslateX86CC handles.
49320 SDLoc DL(N);
49321 SDValue ShiftOp = Shift.getOperand(0);
49322 EVT ShiftOpTy = ShiftOp.getValueType();
49323 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49324 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
49325 *DAG.getContext(), ResultType);
49326 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
49327 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
49328 if (SetCCResultType != ResultType)
49329 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
49330 return Cond;
49333 /// Turn vector tests of the signbit in the form of:
49334 /// xor (sra X, elt_size(X)-1), -1
49335 /// into:
49336 /// pcmpgt X, -1
49338 /// This should be called before type legalization because the pattern may not
49339 /// persist after that.
49340 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
49341 const X86Subtarget &Subtarget) {
49342 EVT VT = N->getValueType(0);
49343 if (!VT.isSimple())
49344 return SDValue();
49346 switch (VT.getSimpleVT().SimpleTy) {
49347 default: return SDValue();
49348 case MVT::v16i8:
49349 case MVT::v8i16:
49350 case MVT::v4i32:
49351 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
49352 case MVT::v32i8:
49353 case MVT::v16i16:
49354 case MVT::v8i32:
49355 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
49358 // There must be a shift right algebraic before the xor, and the xor must be a
49359 // 'not' operation.
49360 SDValue Shift = N->getOperand(0);
49361 SDValue Ones = N->getOperand(1);
49362 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
49363 !ISD::isBuildVectorAllOnes(Ones.getNode()))
49364 return SDValue();
49366 // The shift should be smearing the sign bit across each vector element.
49367 auto *ShiftAmt =
49368 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
49369 if (!ShiftAmt ||
49370 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
49371 return SDValue();
49373 // Create a greater-than comparison against -1. We don't use the more obvious
49374 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
49375 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
49378 /// Detect patterns of truncation with unsigned saturation:
49380 /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
49381 /// Return the source value x to be truncated or SDValue() if the pattern was
49382 /// not matched.
49384 /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
49385 /// where C1 >= 0 and C2 is unsigned max of destination type.
49387 /// (truncate (smax (smin (x, C2), C1)) to dest_type)
49388 /// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
49390 /// These two patterns are equivalent to:
49391 /// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
49392 /// So return the smax(x, C1) value to be truncated or SDValue() if the
49393 /// pattern was not matched.
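/// Example (illustrative, i32 -> i16): the clamp constant is 0xFFFF, so both
///   trunc(umin(x, 65535))
///   trunc(smin(smax(x, 0), 65535))
/// are recognized; the second form returns smax(x, 0) as the value to truncate
/// with unsigned saturation.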
49394 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
49395 const SDLoc &DL) {
49396 EVT InVT = In.getValueType();
49398 // Saturation with truncation. We truncate from InVT to VT.
49399 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
49400 "Unexpected types for truncate operation");
49402 // Match min/max and return limit value as a parameter.
49403 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
49404 if (V.getOpcode() == Opcode &&
49405 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
49406 return V.getOperand(0);
49407 return SDValue();
49410 APInt C1, C2;
49411 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
49412 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
49413 // the element size of the destination type.
49414 if (C2.isMask(VT.getScalarSizeInBits()))
49415 return UMin;
49417 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
49418 if (MatchMinMax(SMin, ISD::SMAX, C1))
49419 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
49420 return SMin;
49422 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
49423 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
49424 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
49425 C2.uge(C1)) {
49426 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
49429 return SDValue();
49432 /// Detect patterns of truncation with signed saturation:
49433 /// (truncate (smin ((smax (x, signed_min_of_dest_type)),
49434 /// signed_max_of_dest_type)) to dest_type)
49435 /// or:
49436 /// (truncate (smax ((smin (x, signed_max_of_dest_type)),
49437 /// signed_min_of_dest_type)) to dest_type).
49438 /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
49439 /// Return the source value to be truncated or SDValue() if the pattern was not
49440 /// matched.
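/// Example (illustrative, i32 -> i8): the signed-saturation form is
///   trunc(smin(smax(x, -128), 127))   (or the smax(smin(...)) variant),
/// while with MatchPackUS the clamp constants become 0 and 255, i.e. the range
/// that PACKUS-style instructions saturate to.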
49441 static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
49442 unsigned NumDstBits = VT.getScalarSizeInBits();
49443 unsigned NumSrcBits = In.getScalarValueSizeInBits();
49444 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
49446 auto MatchMinMax = [](SDValue V, unsigned Opcode,
49447 const APInt &Limit) -> SDValue {
49448 APInt C;
49449 if (V.getOpcode() == Opcode &&
49450 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
49451 return V.getOperand(0);
49452 return SDValue();
49455 APInt SignedMax, SignedMin;
49456 if (MatchPackUS) {
49457 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
49458 SignedMin = APInt(NumSrcBits, 0);
49459 } else {
49460 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
49461 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
49464 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
49465 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
49466 return SMax;
49468 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
49469 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
49470 return SMin;
49472 return SDValue();
49475 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
49476 SelectionDAG &DAG,
49477 const X86Subtarget &Subtarget) {
49478 if (!Subtarget.hasSSE2() || !VT.isVector())
49479 return SDValue();
49481 EVT SVT = VT.getVectorElementType();
49482 EVT InVT = In.getValueType();
49483 EVT InSVT = InVT.getVectorElementType();
49485 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
49486 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
49487 // and concatenate at the same time. Then we can use a final vpmovuswb to
49488 // clip to 0-255.
49489 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
49490 InVT == MVT::v16i32 && VT == MVT::v16i8) {
49491 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
49492 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
49493 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
49494 DL, DAG, Subtarget);
49495 assert(Mid && "Failed to pack!");
49496 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
49500 // vXi32 truncate instructions are available with AVX512F.
49501 // vXi16 truncate instructions are only available with AVX512BW.
49502 // For 256-bit or smaller vectors, we require VLX.
49503 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
49504 // If the result type is 256-bits or larger and we have disabled 512-bit
49505 // registers, we should go ahead and use the pack instructions if possible.
49506 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
49507 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
49508 (InVT.getSizeInBits() > 128) &&
49509 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
49510 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
49512 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
49513 isPowerOf2_32(VT.getVectorNumElements()) &&
49514 (SVT == MVT::i8 || SVT == MVT::i16) &&
49515 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
49516 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
49517 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
49518 if (SVT == MVT::i8 && InSVT == MVT::i32) {
49519 EVT MidVT = VT.changeVectorElementType(MVT::i16);
49520 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
49521 DAG, Subtarget);
49522 assert(Mid && "Failed to pack!");
49523 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
49524 Subtarget);
49525 assert(V && "Failed to pack!");
49526 return V;
49527 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
49528 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
49529 Subtarget);
49531 if (SDValue SSatVal = detectSSatPattern(In, VT))
49532 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
49533 Subtarget);
49536 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49537 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
49538 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
49539 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
49540 unsigned TruncOpc = 0;
49541 SDValue SatVal;
49542 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
49543 SatVal = SSatVal;
49544 TruncOpc = X86ISD::VTRUNCS;
49545 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
49546 SatVal = USatVal;
49547 TruncOpc = X86ISD::VTRUNCUS;
49549 if (SatVal) {
49550 unsigned ResElts = VT.getVectorNumElements();
49551 // If the input type is less than 512 bits and we don't have VLX, we need
49552 // to widen to 512 bits.
49553 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
49554 unsigned NumConcats = 512 / InVT.getSizeInBits();
49555 ResElts *= NumConcats;
49556 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
49557 ConcatOps[0] = SatVal;
49558 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
49559 NumConcats * InVT.getVectorNumElements());
49560 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
49562 // Widen the result if it's narrower than 128 bits.
49563 if (ResElts * SVT.getSizeInBits() < 128)
49564 ResElts = 128 / SVT.getSizeInBits();
49565 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
49566 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
49567 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
49568 DAG.getIntPtrConstant(0, DL));
49572 return SDValue();
49575 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
49576 /// which is c = (a + b + 1) / 2, and replaces this operation with the
49577 /// efficient ISD::AVGCEILU (AVG) instruction.
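/// Example (illustrative, i8 lanes): a = 5, b = 8 gives (5 + 8 + 1) / 2 = 7,
/// i.e. the average rounded up, which is what PAVGB/PAVGW compute per element
/// without needing the wider intermediate type.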
49578 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
49579 const X86Subtarget &Subtarget,
49580 const SDLoc &DL) {
49581 if (!VT.isVector())
49582 return SDValue();
49583 EVT InVT = In.getValueType();
49584 unsigned NumElems = VT.getVectorNumElements();
49586 EVT ScalarVT = VT.getVectorElementType();
49587 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
49588 return SDValue();
49590 // InScalarVT is the intermediate type in the AVG pattern and it should be
49591 // wider than the original input type (i8/i16).
49592 EVT InScalarVT = InVT.getVectorElementType();
49593 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
49594 return SDValue();
49596 if (!Subtarget.hasSSE2())
49597 return SDValue();
49599 // Detect the following pattern:
49601 // %1 = zext <N x i8> %a to <N x i32>
49602 // %2 = zext <N x i8> %b to <N x i32>
49603 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
49604 // %4 = add nuw nsw <N x i32> %3, %2
49605 // %5 = lshr <N x i32> %4, <i32 1 x N>
49606 // %6 = trunc <N x i32> %5 to <N x i8>
49608 // In AVX512, the last instruction can also be a trunc store.
49609 if (In.getOpcode() != ISD::SRL)
49610 return SDValue();
49612 // A lambda checking whether the given SDValue is a constant vector with each
49613 // element in the range [Min, Max].
49614 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
49615 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
49616 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
49620 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
49621 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
49622 return MaxActiveBits <= ScalarVT.getSizeInBits();
49625 // Check if each element of the vector is right-shifted by one.
49626 SDValue LHS = In.getOperand(0);
49627 SDValue RHS = In.getOperand(1);
49628 if (!IsConstVectorInRange(RHS, 1, 1))
49629 return SDValue();
49630 if (LHS.getOpcode() != ISD::ADD)
49631 return SDValue();
49633 // Detect a pattern of a + b + 1 where the order doesn't matter.
49634 SDValue Operands[3];
49635 Operands[0] = LHS.getOperand(0);
49636 Operands[1] = LHS.getOperand(1);
49638 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49639 ArrayRef<SDValue> Ops) {
49640 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
49643 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
49644 for (SDValue &Op : Ops)
49645 if (Op.getValueType() != VT)
49646 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
49647 // Pad to a power-of-2 vector, split+apply and extract the original vector.
49648 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
49649 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
49650 if (NumElemsPow2 != NumElems) {
49651 for (SDValue &Op : Ops) {
49652 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
49653 for (unsigned i = 0; i != NumElems; ++i) {
49654 SDValue Idx = DAG.getIntPtrConstant(i, DL);
49655 EltsOfOp[i] =
49656 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
49658 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
49661 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
49662 if (NumElemsPow2 == NumElems)
49663 return Res;
49664 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
49665 DAG.getIntPtrConstant(0, DL));
49668 // Take care of the case when one of the operands is a constant vector whose
49669 // elements are in the range [1, 256] ([1, 65536] for i16).
49670 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
49671 IsZExtLike(Operands[0])) {
49672 // The pattern is detected. Subtract one from the constant vector, then
49673 // demote it and emit the ISD::AVGCEILU node.
49674 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
49675 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
49676 return AVGSplitter({Operands[0], Operands[1]});
49679 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
49680 // Match the or case only if its 'add-like' - can be replaced by an add.
49681 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
49682 if (ISD::ADD == V.getOpcode()) {
49683 Op0 = V.getOperand(0);
49684 Op1 = V.getOperand(1);
49685 return true;
49687 if (ISD::ZERO_EXTEND != V.getOpcode())
49688 return false;
49689 V = V.getOperand(0);
49690 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
49691 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
49692 return false;
49693 Op0 = V.getOperand(0);
49694 Op1 = V.getOperand(1);
49695 return true;
49698 SDValue Op0, Op1;
49699 if (FindAddLike(Operands[0], Op0, Op1))
49700 std::swap(Operands[0], Operands[1]);
49701 else if (!FindAddLike(Operands[1], Op0, Op1))
49702 return SDValue();
49703 Operands[2] = Op0;
49704 Operands[1] = Op1;
49706 // Now we have three operands of two additions. Check that one of them is a
49707 // constant vector with ones, and the other two can be promoted from i8/i16.
49708 for (SDValue &Op : Operands) {
49709 if (!IsConstVectorInRange(Op, 1, 1))
49710 continue;
49711 std::swap(Op, Operands[2]);
49713 // Check if Operands[0] and Operands[1] are results of type promotion.
49714 for (int j = 0; j < 2; ++j)
49715 if (Operands[j].getValueType() != VT)
49716 if (!IsZExtLike(Operands[j]))
49717 return SDValue();
49719 // The pattern is detected; emit the ISD::AVGCEILU node(s).
49720 return AVGSplitter({Operands[0], Operands[1]});
49723 return SDValue();
49726 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
49727 TargetLowering::DAGCombinerInfo &DCI,
49728 const X86Subtarget &Subtarget) {
49729 LoadSDNode *Ld = cast<LoadSDNode>(N);
49730 EVT RegVT = Ld->getValueType(0);
49731 EVT MemVT = Ld->getMemoryVT();
49732 SDLoc dl(Ld);
49733 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49735 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
49736 // into two 16-byte operations. Also split non-temporal aligned loads on
49737 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
49738 ISD::LoadExtType Ext = Ld->getExtensionType();
49739 unsigned Fast;
49740 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
49741 Ext == ISD::NON_EXTLOAD &&
49742 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
49743 Ld->getAlign() >= Align(16)) ||
49744 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
49745 *Ld->getMemOperand(), &Fast) &&
49746 !Fast))) {
49747 unsigned NumElems = RegVT.getVectorNumElements();
49748 if (NumElems < 2)
49749 return SDValue();
49751 unsigned HalfOffset = 16;
49752 SDValue Ptr1 = Ld->getBasePtr();
49753 SDValue Ptr2 =
49754 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
49755 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
49756 NumElems / 2);
49757 SDValue Load1 =
49758 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
49759 Ld->getOriginalAlign(),
49760 Ld->getMemOperand()->getFlags());
49761 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
49762 Ld->getPointerInfo().getWithOffset(HalfOffset),
49763 Ld->getOriginalAlign(),
49764 Ld->getMemOperand()->getFlags());
49765 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
49766 Load1.getValue(1), Load2.getValue(1));
49768 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
49769 return DCI.CombineTo(N, NewVec, TF, true);
49772 // Bool vector load - attempt to cast to an integer, as we have good
49773 // (vXiY *ext(vXi1 bitcast(iX))) handling.
49774 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
49775 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
49776 unsigned NumElts = RegVT.getVectorNumElements();
49777 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49778 if (TLI.isTypeLegal(IntVT)) {
49779 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
49780 Ld->getPointerInfo(),
49781 Ld->getOriginalAlign(),
49782 Ld->getMemOperand()->getFlags());
49783 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
49784 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
49788 // If we also broadcast this as a subvector to a wider type, then just extract
49789 // the lowest subvector.
49790 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
49791 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
49792 SDValue Ptr = Ld->getBasePtr();
49793 SDValue Chain = Ld->getChain();
49794 for (SDNode *User : Ptr->uses()) {
49795 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
49796 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
49797 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
49798 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
49799 MemVT.getSizeInBits() &&
49800 !User->hasAnyUseOfValue(1) &&
49801 User->getValueSizeInBits(0).getFixedValue() >
49802 RegVT.getFixedSizeInBits()) {
49803 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
49804 RegVT.getSizeInBits());
49805 Extract = DAG.getBitcast(RegVT, Extract);
49806 return DCI.CombineTo(N, Extract, SDValue(User, 1));
49811 // Cast ptr32 and ptr64 pointers to the default address space before a load.
49812 unsigned AddrSpace = Ld->getAddressSpace();
49813 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
49814 AddrSpace == X86AS::PTR32_UPTR) {
49815 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
49816 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
49817 SDValue Cast =
49818 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
49819 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
49820 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
49821 Ld->getMemOperand()->getFlags());
49825 return SDValue();
49828 /// If V is a build vector of boolean constants and exactly one of those
49829 /// constants is true, return the operand index of that true element.
49830 /// Otherwise, return -1.
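/// For example (illustrative), the mask <i1 0, i1 0, i1 1, i1 0> yields 2,
/// while <i1 1, i1 1, i1 0, i1 0> and an all-zeros mask both yield -1.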
49831 static int getOneTrueElt(SDValue V) {
49832 // This needs to be a build vector of booleans.
49833 // TODO: Checking for the i1 type matches the IR definition for the mask,
49834 // but the mask check could be loosened to i8 or other types. That might
49835 // also require checking more than 'allOnesValue'; eg, the x86 HW
49836 // instructions only require that the MSB is set for each mask element.
49837 // The ISD::MSTORE comments/definition do not specify how the mask operand
49838 // is formatted.
49839 auto *BV = dyn_cast<BuildVectorSDNode>(V);
49840 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
49841 return -1;
49843 int TrueIndex = -1;
49844 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
49845 for (unsigned i = 0; i < NumElts; ++i) {
49846 const SDValue &Op = BV->getOperand(i);
49847 if (Op.isUndef())
49848 continue;
49849 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
49850 if (!ConstNode)
49851 return -1;
49852 if (ConstNode->getAPIntValue().countr_one() >= 1) {
49853 // If we already found a one, this is too many.
49854 if (TrueIndex >= 0)
49855 return -1;
49856 TrueIndex = i;
49859 return TrueIndex;
49862 /// Given a masked memory load/store operation, return true if it has one mask
49863 /// bit set. If it has one mask bit set, then also return the memory address of
49864 /// the scalar element to load/store, the vector index to insert/extract that
49865 /// scalar element, and the alignment for the scalar memory access.
49866 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
49867 SelectionDAG &DAG, SDValue &Addr,
49868 SDValue &Index, Align &Alignment,
49869 unsigned &Offset) {
49870 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
49871 if (TrueMaskElt < 0)
49872 return false;
49874 // Get the address of the one scalar element that is specified by the mask
49875 // using the appropriate offset from the base pointer.
49876 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
49877 Offset = 0;
49878 Addr = MaskedOp->getBasePtr();
49879 if (TrueMaskElt != 0) {
49880 Offset = TrueMaskElt * EltVT.getStoreSize();
49881 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
49882 SDLoc(MaskedOp));
49885 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
49886 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
49887 EltVT.getStoreSize());
49888 return true;
49891 /// If exactly one element of the mask is set for a non-extending masked load,
49892 /// it is a scalar load and vector insert.
49893 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
49894 /// mask have already been optimized in IR, so we don't bother with those here.
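/// For example (illustrative), a masked load of <4 x i32> with mask
/// <0, 0, 1, 0> becomes a scalar i32 load from base + 8 followed by an
/// insert_vector_elt into the pass-through value at index 2.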
49895 static SDValue
49896 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
49897 TargetLowering::DAGCombinerInfo &DCI,
49898 const X86Subtarget &Subtarget) {
49899 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
49900 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
49901 // However, some target hooks may need to be added to know when the transform
49902 // is profitable. Endianness would also have to be considered.
49904 SDValue Addr, VecIndex;
49905 Align Alignment;
49906 unsigned Offset;
49907 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
49908 return SDValue();
49910 // Load the one scalar element that is specified by the mask using the
49911 // appropriate offset from the base pointer.
49912 SDLoc DL(ML);
49913 EVT VT = ML->getValueType(0);
49914 EVT EltVT = VT.getVectorElementType();
49916 EVT CastVT = VT;
49917 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
49918 EltVT = MVT::f64;
49919 CastVT = VT.changeVectorElementType(EltVT);
49922 SDValue Load =
49923 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
49924 ML->getPointerInfo().getWithOffset(Offset),
49925 Alignment, ML->getMemOperand()->getFlags());
49927 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
49929 // Insert the loaded element into the appropriate place in the vector.
49930 SDValue Insert =
49931 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
49932 Insert = DAG.getBitcast(VT, Insert);
49933 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
49936 static SDValue
49937 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
49938 TargetLowering::DAGCombinerInfo &DCI) {
49939 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
49940 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
49941 return SDValue();
49943 SDLoc DL(ML);
49944 EVT VT = ML->getValueType(0);
49946 // If we are loading the first and last elements of a vector, it is safe and
49947 // always faster to load the whole vector. Replace the masked load with a
49948 // vector load and select.
49949 unsigned NumElts = VT.getVectorNumElements();
49950 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
49951 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
49952 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
49953 if (LoadFirstElt && LoadLastElt) {
49954 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
49955 ML->getMemOperand());
49956 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
49957 ML->getPassThru());
49958 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
49961 // Convert a masked load with a constant mask into a masked load and a select.
49962 // This allows the select operation to use a faster kind of select instruction
49963 // (for example, vblendvps -> vblendps).
49965 // Don't try this if the pass-through operand is already undefined. That would
49966 // cause an infinite loop because that's what we're about to create.
49967 if (ML->getPassThru().isUndef())
49968 return SDValue();
49970 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
49971 return SDValue();
49973 // The new masked load has an undef pass-through operand. The select uses the
49974 // original pass-through operand.
49975 SDValue NewML = DAG.getMaskedLoad(
49976 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
49977 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
49978 ML->getAddressingMode(), ML->getExtensionType());
49979 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
49980 ML->getPassThru());
49982 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
49985 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
49986 TargetLowering::DAGCombinerInfo &DCI,
49987 const X86Subtarget &Subtarget) {
49988 auto *Mld = cast<MaskedLoadSDNode>(N);
49990 // TODO: Expanding load with constant mask may be optimized as well.
49991 if (Mld->isExpandingLoad())
49992 return SDValue();
49994 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
49995 if (SDValue ScalarLoad =
49996 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
49997 return ScalarLoad;
49999 // TODO: Do some AVX512 subsets benefit from this transform?
50000 if (!Subtarget.hasAVX512())
50001 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
50002 return Blend;
50005 // If the mask value has been legalized to a non-boolean vector, try to
50006 // simplify ops leading up to it. We only demand the MSB of each lane.
50007 SDValue Mask = Mld->getMask();
50008 if (Mask.getScalarValueSizeInBits() != 1) {
50009 EVT VT = Mld->getValueType(0);
50010 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50011 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
50012 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
50013 if (N->getOpcode() != ISD::DELETED_NODE)
50014 DCI.AddToWorklist(N);
50015 return SDValue(N, 0);
50017 if (SDValue NewMask =
50018 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
50019 return DAG.getMaskedLoad(
50020 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
50021 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
50022 Mld->getAddressingMode(), Mld->getExtensionType());
50025 return SDValue();
50028 /// If exactly one element of the mask is set for a non-truncating masked store,
50029 /// it is a vector extract and scalar store.
50030 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
50031 /// mask have already been optimized in IR, so we don't bother with those here.
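/// For example (illustrative), a masked store of <4 x float> with mask
/// <0, 1, 0, 0> becomes an extract_vector_elt of lane 1 followed by a plain
/// scalar store to base + 4.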
50032 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
50033 SelectionDAG &DAG,
50034 const X86Subtarget &Subtarget) {
50035 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
50036 // However, some target hooks may need to be added to know when the transform
50037 // is profitable. Endianness would also have to be considered.
50039 SDValue Addr, VecIndex;
50040 Align Alignment;
50041 unsigned Offset;
50042 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
50043 return SDValue();
50045 // Extract the one scalar element that is actually being stored.
50046 SDLoc DL(MS);
50047 SDValue Value = MS->getValue();
50048 EVT VT = Value.getValueType();
50049 EVT EltVT = VT.getVectorElementType();
50050 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
50051 EltVT = MVT::f64;
50052 EVT CastVT = VT.changeVectorElementType(EltVT);
50053 Value = DAG.getBitcast(CastVT, Value);
50055 SDValue Extract =
50056 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
50058 // Store that element at the appropriate offset from the base pointer.
50059 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
50060 MS->getPointerInfo().getWithOffset(Offset),
50061 Alignment, MS->getMemOperand()->getFlags());
50064 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
50065 TargetLowering::DAGCombinerInfo &DCI,
50066 const X86Subtarget &Subtarget) {
50067 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
50068 if (Mst->isCompressingStore())
50069 return SDValue();
50071 EVT VT = Mst->getValue().getValueType();
50072 SDLoc dl(Mst);
50073 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50075 if (Mst->isTruncatingStore())
50076 return SDValue();
50078 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
50079 return ScalarStore;
50081 // If the mask value has been legalized to a non-boolean vector, try to
50082 // simplify ops leading up to it. We only demand the MSB of each lane.
50083 SDValue Mask = Mst->getMask();
50084 if (Mask.getScalarValueSizeInBits() != 1) {
50085 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
50086 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
50087 if (N->getOpcode() != ISD::DELETED_NODE)
50088 DCI.AddToWorklist(N);
50089 return SDValue(N, 0);
50091 if (SDValue NewMask =
50092 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
50093 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
50094 Mst->getBasePtr(), Mst->getOffset(), NewMask,
50095 Mst->getMemoryVT(), Mst->getMemOperand(),
50096 Mst->getAddressingMode());
50099 SDValue Value = Mst->getValue();
50100 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
50101 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
50102 Mst->getMemoryVT())) {
50103 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
50104 Mst->getBasePtr(), Mst->getOffset(), Mask,
50105 Mst->getMemoryVT(), Mst->getMemOperand(),
50106 Mst->getAddressingMode(), true);
50109 return SDValue();
50112 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
50113 TargetLowering::DAGCombinerInfo &DCI,
50114 const X86Subtarget &Subtarget) {
50115 StoreSDNode *St = cast<StoreSDNode>(N);
50116 EVT StVT = St->getMemoryVT();
50117 SDLoc dl(St);
50118 SDValue StoredVal = St->getValue();
50119 EVT VT = StoredVal.getValueType();
50120 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50122 // Convert a store of vXi1 into a store of iX and a bitcast.
50123 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
50124 VT.getVectorElementType() == MVT::i1) {
50126 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
50127 StoredVal = DAG.getBitcast(NewVT, StoredVal);
50129 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50130 St->getPointerInfo(), St->getOriginalAlign(),
50131 St->getMemOperand()->getFlags());
50134 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
50135 // This will avoid a copy to k-register.
50136 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
50137 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
50138 StoredVal.getOperand(0).getValueType() == MVT::i8) {
50139 SDValue Val = StoredVal.getOperand(0);
50140 // We must store zeros to the unused bits.
50141 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
50142 return DAG.getStore(St->getChain(), dl, Val,
50143 St->getBasePtr(), St->getPointerInfo(),
50144 St->getOriginalAlign(),
50145 St->getMemOperand()->getFlags());
50148 // Widen v2i1/v4i1 stores to v8i1.
50149 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
50150 Subtarget.hasAVX512()) {
50151 unsigned NumConcats = 8 / VT.getVectorNumElements();
50152 // We must store zeros to the unused bits.
50153 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
50154 Ops[0] = StoredVal;
50155 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
50156 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50157 St->getPointerInfo(), St->getOriginalAlign(),
50158 St->getMemOperand()->getFlags());
50161 // Turn vXi1 stores of constants into a scalar store.
50162 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
50163 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
50164 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
50165 // If it's a v64i1 store without 64-bit support, we need two stores.
50166 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
50167 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
50168 StoredVal->ops().slice(0, 32));
50169 Lo = combinevXi1ConstantToInteger(Lo, DAG);
50170 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
50171 StoredVal->ops().slice(32, 32));
50172 Hi = combinevXi1ConstantToInteger(Hi, DAG);
50174 SDValue Ptr0 = St->getBasePtr();
50175 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
50177 SDValue Ch0 =
50178 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
50179 St->getOriginalAlign(),
50180 St->getMemOperand()->getFlags());
50181 SDValue Ch1 =
50182 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
50183 St->getPointerInfo().getWithOffset(4),
50184 St->getOriginalAlign(),
50185 St->getMemOperand()->getFlags());
50186 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
50189 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
50190 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50191 St->getPointerInfo(), St->getOriginalAlign(),
50192 St->getMemOperand()->getFlags());
50195 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
50196 // Sandy Bridge, perform two 16-byte stores.
50197 unsigned Fast;
50198 if (VT.is256BitVector() && StVT == VT &&
50199 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
50200 *St->getMemOperand(), &Fast) &&
50201 !Fast) {
50202 unsigned NumElems = VT.getVectorNumElements();
50203 if (NumElems < 2)
50204 return SDValue();
50206 return splitVectorStore(St, DAG);
50209 // Split under-aligned vector non-temporal stores.
50210 if (St->isNonTemporal() && StVT == VT &&
50211 St->getAlign().value() < VT.getStoreSize()) {
50212 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
50213 // vectors or the legalizer can scalarize it to use MOVNTI.
50214 if (VT.is256BitVector() || VT.is512BitVector()) {
50215 unsigned NumElems = VT.getVectorNumElements();
50216 if (NumElems < 2)
50217 return SDValue();
50218 return splitVectorStore(St, DAG);
50221 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
50222 // to use MOVNTI.
50223 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
50224 MVT NTVT = Subtarget.hasSSE4A()
50225 ? MVT::v2f64
50226 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
50227 return scalarizeVectorStore(St, NTVT, DAG);
50231 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
50232 // supported but AVX512F is, by extending to v16i32 and truncating.
50233 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
50234 St->getValue().getOpcode() == ISD::TRUNCATE &&
50235 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
50236 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
50237 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
50238 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
50239 St->getValue().getOperand(0));
50240 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
50241 MVT::v16i8, St->getMemOperand());
50244 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
50245 if (!St->isTruncatingStore() &&
50246 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
50247 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
50248 StoredVal.hasOneUse() &&
50249 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
50250 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
50251 return EmitTruncSStore(IsSigned, St->getChain(),
50252 dl, StoredVal.getOperand(0), St->getBasePtr(),
50253 VT, St->getMemOperand(), DAG);
50256 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
50257 if (!St->isTruncatingStore()) {
50258 auto IsExtractedElement = [](SDValue V) {
50259 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
50260 V = V.getOperand(0);
50261 unsigned Opc = V.getOpcode();
50262 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
50263 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
50264 V.getOperand(0).hasOneUse())
50265 return V.getOperand(0);
50266 return SDValue();
50268 if (SDValue Extract = IsExtractedElement(StoredVal)) {
50269 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
50270 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
50271 SDValue Src = Trunc.getOperand(0);
50272 MVT DstVT = Trunc.getSimpleValueType();
50273 MVT SrcVT = Src.getSimpleValueType();
50274 unsigned NumSrcElts = SrcVT.getVectorNumElements();
50275 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
50276 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
50277 if (NumTruncBits == VT.getSizeInBits() &&
50278 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
50279 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
50280 TruncVT, St->getMemOperand());
50286 // Optimize trunc store (of multiple scalars) to shuffle and store.
50287 // First, pack all of the elements in one place. Next, store to memory
50288 // in fewer chunks.
50289 if (St->isTruncatingStore() && VT.isVector()) {
50290 // Check if we can detect an AVG pattern from the truncation. If yes,
50291 // replace the trunc store by a normal store with the result of X86ISD::AVG
50292 // instruction.
50293 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
50294 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
50295 Subtarget, dl))
50296 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
50297 St->getPointerInfo(), St->getOriginalAlign(),
50298 St->getMemOperand()->getFlags());
50300 if (TLI.isTruncStoreLegal(VT, StVT)) {
50301 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
50302 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
50303 dl, Val, St->getBasePtr(),
50304 St->getMemoryVT(), St->getMemOperand(), DAG);
50305 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
50306 DAG, dl))
50307 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
50308 dl, Val, St->getBasePtr(),
50309 St->getMemoryVT(), St->getMemOperand(), DAG);
50312 return SDValue();
50315 // Cast ptr32 and ptr64 pointers to the default address space before a store.
50316 unsigned AddrSpace = St->getAddressSpace();
50317 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
50318 AddrSpace == X86AS::PTR32_UPTR) {
50319 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
50320 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
50321 SDValue Cast =
50322 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
50323 return DAG.getTruncStore(
50324 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
50325 St->getOriginalAlign(), St->getMemOperand()->getFlags(),
50326 St->getAAInfo());
50330 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
50331 // the FP state in cases where an emms may be missing.
50332 // A preferable solution to the general problem is to figure out the right
50333 // places to insert EMMS. This qualifies as a quick hack.
50335 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
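// e.g. on a 32-bit target with SSE2, an i64 load feeding an i64 store becomes
// a single f64 (movq) load/store pair instead of two 32-bit GPR operations.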
50336 if (VT.getSizeInBits() != 64)
50337 return SDValue();
50339 const Function &F = DAG.getMachineFunction().getFunction();
50340 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
50341 bool F64IsLegal =
50342 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
50343 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
50344 isa<LoadSDNode>(St->getValue()) &&
50345 cast<LoadSDNode>(St->getValue())->isSimple() &&
50346 St->getChain().hasOneUse() && St->isSimple()) {
50347 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
50349 if (!ISD::isNormalLoad(Ld))
50350 return SDValue();
50352 // Avoid the transformation if there are multiple uses of the loaded value.
50353 if (!Ld->hasNUsesOfValue(1, 0))
50354 return SDValue();
50356 SDLoc LdDL(Ld);
50357 SDLoc StDL(N);
50358 // Lower to a single movq load/store pair.
50359 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
50360 Ld->getBasePtr(), Ld->getMemOperand());
50362 // Make sure new load is placed in same chain order.
50363 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
50364 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
50365 St->getMemOperand());
50368 // This is similar to the above case, but here we handle a scalar 64-bit
50369 // integer store that is extracted from a vector on a 32-bit target.
50370 // If we have SSE2, then we can treat it like a floating-point double
50371 // to get past legalization. The execution dependencies fixup pass will
50372 // choose the optimal machine instruction for the store if this really is
50373 // an integer or v2f32 rather than an f64.
50374 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
50375 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
50376 SDValue OldExtract = St->getOperand(1);
50377 SDValue ExtOp0 = OldExtract.getOperand(0);
50378 unsigned VecSize = ExtOp0.getValueSizeInBits();
50379 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
50380 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
50381 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
50382 BitCast, OldExtract.getOperand(1));
50383 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
50384 St->getPointerInfo(), St->getOriginalAlign(),
50385 St->getMemOperand()->getFlags());
50388 return SDValue();
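/// Simplify VEXTRACT_STORE nodes by only demanding the vector elements that
/// are actually written to memory.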
50391 static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
50392 TargetLowering::DAGCombinerInfo &DCI,
50393 const X86Subtarget &Subtarget) {
50394 auto *St = cast<MemIntrinsicSDNode>(N);
50396 SDValue StoredVal = N->getOperand(1);
50397 MVT VT = StoredVal.getSimpleValueType();
50398 EVT MemVT = St->getMemoryVT();
50400 // Figure out which elements we demand.
50401 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
50402 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
50404 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50405 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
50406 if (N->getOpcode() != ISD::DELETED_NODE)
50407 DCI.AddToWorklist(N);
50408 return SDValue(N, 0);
50411 return SDValue();
50414 /// Return 'true' if this vector operation is "horizontal"
50415 /// and return the operands for the horizontal operation in LHS and RHS. A
50416 /// horizontal operation performs the binary operation on successive elements
50417 /// of its first operand, then on successive elements of its second operand,
50418 /// returning the resulting values in a vector. For example, if
50419 /// A = < float a0, float a1, float a2, float a3 >
50420 /// and
50421 /// B = < float b0, float b1, float b2, float b3 >
50422 /// then the result of doing a horizontal operation on A and B is
50423 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
50424 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
50425 /// A horizontal-op B, for some already available A and B, and if so then LHS is
50426 /// set to A, RHS to B, and the routine returns 'true'.
50427 static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
50428 SelectionDAG &DAG, const X86Subtarget &Subtarget,
50429 bool IsCommutative,
50430 SmallVectorImpl<int> &PostShuffleMask) {
50431 // If either operand is undef, bail out. The binop should be simplified.
50432 if (LHS.isUndef() || RHS.isUndef())
50433 return false;
50435 // Look for the following pattern:
50436 // A = < float a0, float a1, float a2, float a3 >
50437 // B = < float b0, float b1, float b2, float b3 >
50438 // and
50439 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
50440 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
50441 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
50442 // which is A horizontal-op B.
50444 MVT VT = LHS.getSimpleValueType();
50445 assert((VT.is128BitVector() || VT.is256BitVector()) &&
50446 "Unsupported vector type for horizontal add/sub");
50447 unsigned NumElts = VT.getVectorNumElements();
50449 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
50450 SmallVectorImpl<int> &ShuffleMask) {
50451 bool UseSubVector = false;
50452 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50453 Op.getOperand(0).getValueType().is256BitVector() &&
50454 llvm::isNullConstant(Op.getOperand(1))) {
50455 Op = Op.getOperand(0);
50456 UseSubVector = true;
50458 SmallVector<SDValue, 2> SrcOps;
50459 SmallVector<int, 16> SrcMask, ScaledMask;
50460 SDValue BC = peekThroughBitcasts(Op);
50461 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
50462 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
50463 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
50464 })) {
50465 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
50466 if (!UseSubVector && SrcOps.size() <= 2 &&
50467 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
50468 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
50469 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
50470 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
50472 if (UseSubVector && SrcOps.size() == 1 &&
50473 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
50474 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
50475 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
50476 ShuffleMask.assign(Mask.begin(), Mask.end());
50481 // View LHS in the form
50482 // LHS = VECTOR_SHUFFLE A, B, LMask
50483 // If LHS is not a shuffle, then pretend it is the identity shuffle:
50484 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
50485 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
50486 SDValue A, B;
50487 SmallVector<int, 16> LMask;
50488 GetShuffle(LHS, A, B, LMask);
50490 // Likewise, view RHS in the form
50491 // RHS = VECTOR_SHUFFLE C, D, RMask
50492 SDValue C, D;
50493 SmallVector<int, 16> RMask;
50494 GetShuffle(RHS, C, D, RMask);
50496 // At least one of the operands should be a vector shuffle.
50497 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
50498 if (NumShuffles == 0)
50499 return false;
50501 if (LMask.empty()) {
50502 A = LHS;
50503 for (unsigned i = 0; i != NumElts; ++i)
50504 LMask.push_back(i);
50507 if (RMask.empty()) {
50508 C = RHS;
50509 for (unsigned i = 0; i != NumElts; ++i)
50510 RMask.push_back(i);
50513 // If we have a unary mask, ensure the other op is set to null.
50514 if (isUndefOrInRange(LMask, 0, NumElts))
50515 B = SDValue();
50516 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
50517 A = SDValue();
50519 if (isUndefOrInRange(RMask, 0, NumElts))
50520 D = SDValue();
50521 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
50522 C = SDValue();
50524 // If A and B occur in reverse order in RHS, then canonicalize by commuting
50525 // RHS operands and shuffle mask.
50526 if (A != C) {
50527 std::swap(C, D);
50528 ShuffleVectorSDNode::commuteMask(RMask);
50530 // Check that the shuffles are both shuffling the same vectors.
50531 if (!(A == C && B == D))
50532 return false;
50534 PostShuffleMask.clear();
50535 PostShuffleMask.append(NumElts, SM_SentinelUndef);
50537 // LHS and RHS are now:
50538 // LHS = shuffle A, B, LMask
50539 // RHS = shuffle A, B, RMask
50540 // Check that the masks correspond to performing a horizontal operation.
50541 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
50542 // so we just repeat the inner loop if this is a 256-bit op.
50543 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
50544 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
50545 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
50546 assert((NumEltsPer128BitChunk % 2 == 0) &&
50547 "Vector type should have an even number of elements in each lane");
50548 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
50549 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
50550 // Ignore undefined components.
50551 int LIdx = LMask[i + j], RIdx = RMask[i + j];
50552 if (LIdx < 0 || RIdx < 0 ||
50553 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
50554 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
50555 continue;
50557 // Check that successive odd/even elements are being operated on. If not,
50558 // this is not a horizontal operation.
50559 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
50560 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
50561 return false;
50563 // Compute the post-shuffle mask index based on where the element
50564 // is stored in the HOP result, and where it needs to be moved to.
50565 int Base = LIdx & ~1u;
50566 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
50567 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
50569 // The low half of the 128-bit result must choose from A.
50570 // The high half of the 128-bit result must choose from B,
50571 // unless B is undef. In that case, we are always choosing from A.
50572 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
50573 Index += NumEltsPer64BitChunk;
50574 PostShuffleMask[i + j] = Index;
50578 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
50579 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
50581 bool IsIdentityPostShuffle =
50582 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
50583 if (IsIdentityPostShuffle)
50584 PostShuffleMask.clear();
50586 // Avoid 128-bit multi-lane shuffles if pre-AVX2 and FP (integer will split).
50587 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
50588 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
50589 return false;
50591 // If the source nodes are already used in HorizOps then always accept this.
50592 // Shuffle folding should merge these back together.
50593 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
50594 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
50596 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
50597 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
50599 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
50601 // Assume a SingleSource HOP if we only shuffle one input and don't need to
50602 // shuffle the result.
50603 if (!ForceHorizOp &&
50604 !shouldUseHorizontalOp(NewLHS == NewRHS &&
50605 (NumShuffles < 2 || !IsIdentityPostShuffle),
50606 DAG, Subtarget))
50607 return false;
50609 LHS = DAG.getBitcast(VT, NewLHS);
50610 RHS = DAG.getBitcast(VT, NewRHS);
50611 return true;
50614 // Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
50615 static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
50616 const X86Subtarget &Subtarget) {
50617 EVT VT = N->getValueType(0);
50618 unsigned Opcode = N->getOpcode();
50619 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
50620 SmallVector<int, 8> PostShuffleMask;
50622 switch (Opcode) {
50623 case ISD::FADD:
50624 case ISD::FSUB:
50625 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
50626 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
50627 SDValue LHS = N->getOperand(0);
50628 SDValue RHS = N->getOperand(1);
50629 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
50630 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
50631 PostShuffleMask)) {
50632 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
50633 if (!PostShuffleMask.empty())
50634 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
50635 DAG.getUNDEF(VT), PostShuffleMask);
50636 return HorizBinOp;
50639 break;
50640 case ISD::ADD:
50641 case ISD::SUB:
50642 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
50643 VT == MVT::v16i16 || VT == MVT::v8i32)) {
50644 SDValue LHS = N->getOperand(0);
50645 SDValue RHS = N->getOperand(1);
50646 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
50647 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
50648 PostShuffleMask)) {
50649 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
50650 ArrayRef<SDValue> Ops) {
50651 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
50653 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
50654 {LHS, RHS}, HOpBuilder);
50655 if (!PostShuffleMask.empty())
50656 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
50657 DAG.getUNDEF(VT), PostShuffleMask);
50658 return HorizBinOp;
50661 break;
50664 return SDValue();
50667 // Try to combine the following nodes
50668 // t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
50669 // <i32 -2147483648[float -0.000000e+00]> 0
50670 // t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
50671 // <(load 4 from constant-pool)> t0, t29
50672 // [t30: v16i32 = bitcast t27]
50673 // t6: v16i32 = xor t7, t27[t30]
50674 // t11: v16f32 = bitcast t6
50675 // t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
50676 // into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
50677 // t22: v16f32 = bitcast t7
50678 // t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
50679 // t24: v32f16 = bitcast t23
50680 static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
50681 const X86Subtarget &Subtarget) {
50682 EVT VT = N->getValueType(0);
50683 SDValue LHS = N->getOperand(0);
50684 SDValue RHS = N->getOperand(1);
50685 int CombineOpcode =
50686 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
50687 auto isConjugationConstant = [](const Constant *c) {
50688 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
50689 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
50690 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
50691 switch (CI->getBitWidth()) {
50692 case 16:
50693 return false;
50694 case 32:
50695 return CI->getValue() == ConjugationInt32;
50696 case 64:
50697 return CI->getValue() == ConjugationInt64;
50698 default:
50699 llvm_unreachable("Unexpected bit width");
50702 if (const auto *CF = dyn_cast<ConstantFP>(c))
50703 return CF->getType()->isFloatTy() && CF->isNegativeZeroValue();
50704 return false;
50706 auto combineConjugation = [&](SDValue &r) {
50707 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
50708 SDValue XOR = LHS.getOperand(0);
50709 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
50710 SDValue XORRHS = XOR.getOperand(1);
50711 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
50712 XORRHS = XORRHS.getOperand(0);
50713 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
50714 XORRHS.getOperand(1).getNumOperands()) {
50715 ConstantPoolSDNode *CP =
50716 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
50717 if (CP && isConjugationConstant(CP->getConstVal())) {
50718 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
50719 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
50720 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
50721 r = DAG.getBitcast(VT, FCMulC);
50722 return true;
50727 return false;
50729 SDValue Res;
50730 if (combineConjugation(Res))
50731 return Res;
50732 std::swap(LHS, RHS);
50733 if (combineConjugation(Res))
50734 return Res;
50735 return Res;
50738 // Try to combine the following nodes:
50739 // FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
50740 static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
50741 const X86Subtarget &Subtarget) {
50742 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
50743 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
50744 Flags.hasAllowContract();
50747 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
50748 return DAG.getTarget().Options.NoSignedZerosFPMath ||
50749 Flags.hasNoSignedZeros();
50751 auto IsVectorAllNegativeZero = [](const SDNode *N) {
50752 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
50753 return false;
50754 assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
50755 "Unexpected vector type!");
50756 if (ConstantPoolSDNode *CP =
50757 dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
50758 APInt AI = APInt(32, 0x80008000, true);
50759 if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
50760 return CI->getValue() == AI;
50761 if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
50762 return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
50764 return false;
50767 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
50768 !AllowContract(N->getFlags()))
50769 return SDValue();
50771 EVT VT = N->getValueType(0);
50772 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
50773 return SDValue();
50775 SDValue LHS = N->getOperand(0);
50776 SDValue RHS = N->getOperand(1);
50777 bool IsConj;
50778 SDValue FAddOp1, MulOp0, MulOp1;
50779 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
50780 &IsVectorAllNegativeZero,
50781 &HasNoSignedZero](SDValue N) -> bool {
50782 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
50783 return false;
50784 SDValue Op0 = N.getOperand(0);
50785 unsigned Opcode = Op0.getOpcode();
50786 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
50787 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
50788 MulOp0 = Op0.getOperand(0);
50789 MulOp1 = Op0.getOperand(1);
50790 IsConj = Opcode == X86ISD::VFCMULC;
50791 return true;
50793 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
50794 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
50795 HasNoSignedZero(Op0->getFlags())) ||
50796 IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
50797 MulOp0 = Op0.getOperand(0);
50798 MulOp1 = Op0.getOperand(1);
50799 IsConj = Opcode == X86ISD::VFCMADDC;
50800 return true;
50803 return false;
50806 if (GetCFmulFrom(LHS))
50807 FAddOp1 = RHS;
50808 else if (GetCFmulFrom(RHS))
50809 FAddOp1 = LHS;
50810 else
50811 return SDValue();
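// Rebuild as a complex FMA: bitcast the addend to the paired-f32 complex type
// and emit VFMADDC/VFCMADDC with the other FADD operand as the accumulator.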
50813 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
50814 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
50815 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
50816 // FIXME: How do we handle when fast math flags of FADD are different from
50817 // CFMUL's?
50818 SDValue CFmul =
50819 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
50820 return DAG.getBitcast(VT, CFmul);
50823 /// Do target-specific dag combines on floating-point adds/subs.
50824 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
50825 const X86Subtarget &Subtarget) {
50826 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
50827 return HOp;
50829 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
50830 return COp;
50832 return SDValue();
50835 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
50836 /// the codegen.
50837 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
50838 /// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
50839 /// anything that is guaranteed to be transformed by DAGCombiner.
50840 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
50841 const X86Subtarget &Subtarget,
50842 const SDLoc &DL) {
50843 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
50844 SDValue Src = N->getOperand(0);
50845 unsigned SrcOpcode = Src.getOpcode();
50846 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50848 EVT VT = N->getValueType(0);
50849 EVT SrcVT = Src.getValueType();
50851 auto IsFreeTruncation = [VT](SDValue Op) {
50852 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
50854 // See if this has been extended from a smaller/equal size to
50855 // the truncation size, allowing a truncation to combine with the extend.
50856 unsigned Opcode = Op.getOpcode();
50857 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
50858 Opcode == ISD::ZERO_EXTEND) &&
50859 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
50860 return true;
50862 // See if this is a single use constant which can be constant folded.
50863 // NOTE: We don't peek through bitcasts here because there is currently
50864 // no support for constant folding truncate+bitcast+vector_of_constants. So
50865 // we'll just end up with a truncate on both operands which will
50866 // get turned back into (truncate (binop)) causing an infinite loop.
50867 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
50870 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
50871 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
50872 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
50873 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
50876 // Don't combine if the operation has other uses.
50877 if (!Src.hasOneUse())
50878 return SDValue();
50880 // Only support vector truncation for now.
50881 // TODO: i64 scalar math would benefit as well.
50882 if (!VT.isVector())
50883 return SDValue();
50885 // In most cases it's only worth pre-truncating if we're only facing the cost
50886 // of one truncation.
50887 // i.e. if one of the inputs will constant fold or the input is repeated.
50888 switch (SrcOpcode) {
50889 case ISD::MUL:
50890 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
50891 // better to truncate if we have the chance.
50892 if (SrcVT.getScalarType() == MVT::i64 &&
50893 TLI.isOperationLegal(SrcOpcode, VT) &&
50894 !TLI.isOperationLegal(SrcOpcode, SrcVT))
50895 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
50896 [[fallthrough]];
50897 case ISD::AND:
50898 case ISD::XOR:
50899 case ISD::OR:
50900 case ISD::ADD:
50901 case ISD::SUB: {
50902 SDValue Op0 = Src.getOperand(0);
50903 SDValue Op1 = Src.getOperand(1);
50904 if (TLI.isOperationLegal(SrcOpcode, VT) &&
50905 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
50906 return TruncateArithmetic(Op0, Op1);
50907 break;
50911 return SDValue();
50914 // Try to form a MULHU or MULHS node by looking for
50915 // (trunc (srl (mul ext, ext), 16))
50916 // TODO: This is X86 specific because we want to be able to handle wide types
50917 // before type legalization. But we can only do it if the vector will be
50918 // legalized via widening/splitting. Type legalization can't handle promotion
50919 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
50920 // combiner.
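// e.g. (v8i16 (trunc (srl (mul (zext v8i16:x to v8i32),
//                               (zext v8i16:y to v8i32)), 16)))
//        -> (v8i16 (mulhu x, y))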
50921 static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
50922 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
50923 // First instruction should be a right shift of a multiply.
50924 if (Src.getOpcode() != ISD::SRL ||
50925 Src.getOperand(0).getOpcode() != ISD::MUL)
50926 return SDValue();
50928 if (!Subtarget.hasSSE2())
50929 return SDValue();
50931 // Only handle vXi16 types that are at least 128 bits unless they will be
50932 // widened.
50933 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
50934 return SDValue();
50936 // Input type should be at least vXi32.
50937 EVT InVT = Src.getValueType();
50938 if (InVT.getVectorElementType().getSizeInBits() < 32)
50939 return SDValue();
50941 // Need a shift by 16.
50942 APInt ShiftAmt;
50943 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
50944 ShiftAmt != 16)
50945 return SDValue();
50947 SDValue LHS = Src.getOperand(0).getOperand(0);
50948 SDValue RHS = Src.getOperand(0).getOperand(1);
50950 // Count leading sign/zero bits on both inputs - if there are enough then
50951 // truncation back to vXi16 will be cheap - either as a pack/shuffle
50952 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
50953 // truncations may actually be free by peeking through to the ext source.
50954 auto IsSext = [&DAG](SDValue V) {
50955 return DAG.ComputeMaxSignificantBits(V) <= 16;
50957 auto IsZext = [&DAG](SDValue V) {
50958 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
50961 bool IsSigned = IsSext(LHS) && IsSext(RHS);
50962 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
50963 if (!IsSigned && !IsUnsigned)
50964 return SDValue();
50966 // Check if both inputs are extensions, which will be removed by truncation.
50967 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
50968 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
50969 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
50970 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
50971 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
50972 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
50974 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
50975 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
50976 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
50977 // will have to split anyway.
50978 unsigned InSizeInBits = InVT.getSizeInBits();
50979 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
50980 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
50981 (InSizeInBits % 16) == 0) {
50982 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
50983 InVT.getSizeInBits() / 16);
50984 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
50985 DAG.getBitcast(BCVT, RHS));
50986 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
50989 // Truncate back to source type.
50990 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
50991 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
50993 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
50994 return DAG.getNode(Opc, DL, VT, LHS, RHS);
50997 // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
50998 // from one vector with signed bytes from another vector, adds together
50999 // adjacent pairs of 16-bit products, and saturates the result before
51000 // truncating to 16 bits.
51002 // Which looks something like this:
51003 // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
51004 // (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
51005 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
51006 const X86Subtarget &Subtarget,
51007 const SDLoc &DL) {
51008 if (!VT.isVector() || !Subtarget.hasSSSE3())
51009 return SDValue();
51011 unsigned NumElems = VT.getVectorNumElements();
51012 EVT ScalarVT = VT.getVectorElementType();
51013 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
51014 return SDValue();
51016 SDValue SSatVal = detectSSatPattern(In, VT);
51017 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
51018 return SDValue();
51020 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
51021 // of multiplies from even/odd elements.
51022 SDValue N0 = SSatVal.getOperand(0);
51023 SDValue N1 = SSatVal.getOperand(1);
51025 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
51026 return SDValue();
51028 SDValue N00 = N0.getOperand(0);
51029 SDValue N01 = N0.getOperand(1);
51030 SDValue N10 = N1.getOperand(0);
51031 SDValue N11 = N1.getOperand(1);
51033 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
51034 // Canonicalize zero_extend to LHS.
51035 if (N01.getOpcode() == ISD::ZERO_EXTEND)
51036 std::swap(N00, N01);
51037 if (N11.getOpcode() == ISD::ZERO_EXTEND)
51038 std::swap(N10, N11);
51040 // Ensure we have a zero_extend and a sign_extend.
51041 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
51042 N01.getOpcode() != ISD::SIGN_EXTEND ||
51043 N10.getOpcode() != ISD::ZERO_EXTEND ||
51044 N11.getOpcode() != ISD::SIGN_EXTEND)
51045 return SDValue();
51047 // Peek through the extends.
51048 N00 = N00.getOperand(0);
51049 N01 = N01.getOperand(0);
51050 N10 = N10.getOperand(0);
51051 N11 = N11.getOperand(0);
51053 // Ensure the extend is from vXi8.
51054 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
51055 N01.getValueType().getVectorElementType() != MVT::i8 ||
51056 N10.getValueType().getVectorElementType() != MVT::i8 ||
51057 N11.getValueType().getVectorElementType() != MVT::i8)
51058 return SDValue();
51060 // All inputs should be build_vectors.
51061 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
51062 N01.getOpcode() != ISD::BUILD_VECTOR ||
51063 N10.getOpcode() != ISD::BUILD_VECTOR ||
51064 N11.getOpcode() != ISD::BUILD_VECTOR)
51065 return SDValue();
51067 // N00/N10 are zero extended. N01/N11 are sign extended.
51069 // For each element, we need to ensure we have an odd element from one vector
51070 // multiplied by the odd element of another vector and the even element from
51071 // one of the same vectors being multiplied by the even element from the
51072 // other vector. So we need to make sure for each element i, this operator
51073 // is being performed:
51074 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
51075 SDValue ZExtIn, SExtIn;
51076 for (unsigned i = 0; i != NumElems; ++i) {
51077 SDValue N00Elt = N00.getOperand(i);
51078 SDValue N01Elt = N01.getOperand(i);
51079 SDValue N10Elt = N10.getOperand(i);
51080 SDValue N11Elt = N11.getOperand(i);
51081 // TODO: Be more tolerant to undefs.
51082 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51083 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51084 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51085 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
51086 return SDValue();
51087 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
51088 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
51089 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
51090 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
51091 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
51092 return SDValue();
51093 unsigned IdxN00 = ConstN00Elt->getZExtValue();
51094 unsigned IdxN01 = ConstN01Elt->getZExtValue();
51095 unsigned IdxN10 = ConstN10Elt->getZExtValue();
51096 unsigned IdxN11 = ConstN11Elt->getZExtValue();
51097 // Add is commutative so indices can be reordered.
51098 if (IdxN00 > IdxN10) {
51099 std::swap(IdxN00, IdxN10);
51100 std::swap(IdxN01, IdxN11);
51102 // N0 indices must be the even element. N1 indices must be the next odd element.
51103 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
51104 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
51105 return SDValue();
51106 SDValue N00In = N00Elt.getOperand(0);
51107 SDValue N01In = N01Elt.getOperand(0);
51108 SDValue N10In = N10Elt.getOperand(0);
51109 SDValue N11In = N11Elt.getOperand(0);
51110 // The first time we find an input, capture it.
51111 if (!ZExtIn) {
51112 ZExtIn = N00In;
51113 SExtIn = N01In;
51115 if (ZExtIn != N00In || SExtIn != N01In ||
51116 ZExtIn != N10In || SExtIn != N11In)
51117 return SDValue();
51120 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
51121 ArrayRef<SDValue> Ops) {
51122 // Shrink by adding truncate nodes and let DAGCombine fold with the
51123 // sources.
51124 EVT InVT = Ops[0].getValueType();
51125 assert(InVT.getScalarType() == MVT::i8 &&
51126 "Unexpected scalar element type");
51127 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
51128 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51129 InVT.getVectorNumElements() / 2);
51130 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
51132 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
51133 PMADDBuilder);
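/// Do target-specific DAG combines on ISD::TRUNCATE nodes.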
51136 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
51137 const X86Subtarget &Subtarget) {
51138 EVT VT = N->getValueType(0);
51139 SDValue Src = N->getOperand(0);
51140 SDLoc DL(N);
51142 // Attempt to pre-truncate inputs to arithmetic ops instead.
51143 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
51144 return V;
51146 // Try to detect AVG pattern first.
51147 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
51148 return Avg;
51150 // Try to detect PMADDUBSW.
51151 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
51152 return PMAdd;
51154 // Try to combine truncation with signed/unsigned saturation.
51155 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
51156 return Val;
51158 // Try to combine PMULHUW/PMULHW for vXi16.
51159 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
51160 return V;
51162 // The bitcast source is a direct mmx result.
51163 // Detect a truncation to i32 of a bitcast from x86mmx.
51164 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
51165 SDValue BCSrc = Src.getOperand(0);
51166 if (BCSrc.getValueType() == MVT::x86mmx)
51167 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
51170 return SDValue();
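/// Do target-specific combines on X86ISD::VTRUNC nodes: fold saturation
/// patterns into VTRUNCS/VTRUNCUS and simplify the demanded bits of the input.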
51173 static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
51174 TargetLowering::DAGCombinerInfo &DCI) {
51175 EVT VT = N->getValueType(0);
51176 SDValue In = N->getOperand(0);
51177 SDLoc DL(N);
51179 if (SDValue SSatVal = detectSSatPattern(In, VT))
51180 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
51181 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
51182 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
51184 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51185 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
51186 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
51187 return SDValue(N, 0);
51189 return SDValue();
51192 /// Returns the negated value if the node \p N flips sign of FP value.
51194 /// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
51195 /// or FSUB(0, x)
51196 /// AVX512F does not have FXOR, so FNEG is lowered as
51197 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
51198 // In this case we go through all bitcasts.
51199 /// This also recognizes splat of a negated value and returns the splat of that
51200 /// value.
51201 static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
51202 if (N->getOpcode() == ISD::FNEG)
51203 return N->getOperand(0);
51205 // Don't recurse exponentially.
51206 if (Depth > SelectionDAG::MaxRecursionDepth)
51207 return SDValue();
51209 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
51211 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
51212 EVT VT = Op->getValueType(0);
51214 // Make sure the element size doesn't change.
51215 if (VT.getScalarSizeInBits() != ScalarSize)
51216 return SDValue();
51218 unsigned Opc = Op.getOpcode();
51219 switch (Opc) {
51220 case ISD::VECTOR_SHUFFLE: {
51221 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
51222 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
51223 if (!Op.getOperand(1).isUndef())
51224 return SDValue();
51225 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
51226 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
51227 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
51228 cast<ShuffleVectorSDNode>(Op)->getMask());
51229 break;
51231 case ISD::INSERT_VECTOR_ELT: {
51232 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
51233 // -V, INDEX).
51234 SDValue InsVector = Op.getOperand(0);
51235 SDValue InsVal = Op.getOperand(1);
51236 if (!InsVector.isUndef())
51237 return SDValue();
51238 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
51239 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
51240 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
51241 NegInsVal, Op.getOperand(2));
51242 break;
51244 case ISD::FSUB:
51245 case ISD::XOR:
51246 case X86ISD::FXOR: {
51247 SDValue Op1 = Op.getOperand(1);
51248 SDValue Op0 = Op.getOperand(0);
51250 // For XOR and FXOR, we want to check if constant
51251 // bits of Op1 are sign bit masks. For FSUB, we
51252 // have to check if constant bits of Op0 are sign
51253 // bit masks and hence we swap the operands.
51254 if (Opc == ISD::FSUB)
51255 std::swap(Op0, Op1);
51257 APInt UndefElts;
51258 SmallVector<APInt, 16> EltBits;
51259 // Extract constant bits and see if they are all
51260 // sign bit masks. Ignore the undef elements.
51261 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
51262 /* AllowWholeUndefs */ true,
51263 /* AllowPartialUndefs */ false)) {
51264 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
51265 if (!UndefElts[I] && !EltBits[I].isSignMask())
51266 return SDValue();
51268 // Only allow bitcast from correctly-sized constant.
51269 Op0 = peekThroughBitcasts(Op0);
51270 if (Op0.getScalarValueSizeInBits() == ScalarSize)
51271 return Op0;
51273 break;
51274 } // case
51275 } // switch
51277 return SDValue();
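/// Given an FMA-family opcode, return the opcode produced by negating the
/// multiplication (\p NegMul), the accumulator (\p NegAcc), and/or the whole
/// result (\p NegRes).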
51280 static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
51281 bool NegRes) {
51282 if (NegMul) {
51283 switch (Opcode) {
51284 default: llvm_unreachable("Unexpected opcode");
51285 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
51286 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
51287 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
51288 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
51289 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
51290 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
51291 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
51292 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
51293 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
51294 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
51295 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
51296 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
51300 if (NegAcc) {
51301 switch (Opcode) {
51302 default: llvm_unreachable("Unexpected opcode");
51303 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
51304 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
51305 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
51306 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
51307 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
51308 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
51309 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
51310 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
51311 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
51312 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
51313 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
51314 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
51315 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
51316 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
51317 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
51318 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
51322 if (NegRes) {
51323 switch (Opcode) {
51324 // For accuracy reasons, we never combine fneg and fma under strict FP.
51325 default: llvm_unreachable("Unexpected opcode");
51326 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
51327 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
51328 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
51329 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
51330 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
51331 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
51332 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
51333 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
51337 return Opcode;
51340 /// Do target-specific dag combines on floating point negations.
51341 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
51342 TargetLowering::DAGCombinerInfo &DCI,
51343 const X86Subtarget &Subtarget) {
51344 EVT OrigVT = N->getValueType(0);
51345 SDValue Arg = isFNEG(DAG, N);
51346 if (!Arg)
51347 return SDValue();
51349 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51350 EVT VT = Arg.getValueType();
51351 EVT SVT = VT.getScalarType();
51352 SDLoc DL(N);
51354 // Let legalize expand this if it isn't a legal type yet.
51355 if (!TLI.isTypeLegal(VT))
51356 return SDValue();
51358 // If we're negating a FMUL node on a target with FMA, then we can avoid the
51359 // use of a constant by performing (-0 - A*B) instead.
51360 // FIXME: Check rounding control flags as well once it becomes available.
51361 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
51362 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
51363 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
51364 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
51365 Arg.getOperand(1), Zero);
51366 return DAG.getBitcast(OrigVT, NewNode);
51369 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
51370 bool LegalOperations = !DCI.isBeforeLegalizeOps();
51371 if (SDValue NegArg =
51372 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
51373 return DAG.getBitcast(OrigVT, NegArg);
51375 return SDValue();
51378 SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
51379 bool LegalOperations,
51380 bool ForCodeSize,
51381 NegatibleCost &Cost,
51382 unsigned Depth) const {
51383 // fneg patterns are removable even if they have multiple uses.
51384 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
51385 Cost = NegatibleCost::Cheaper;
51386 return DAG.getBitcast(Op.getValueType(), Arg);
51389 EVT VT = Op.getValueType();
51390 EVT SVT = VT.getScalarType();
51391 unsigned Opc = Op.getOpcode();
51392 SDNodeFlags Flags = Op.getNode()->getFlags();
51393 switch (Opc) {
51394 case ISD::FMA:
51395 case X86ISD::FMSUB:
51396 case X86ISD::FNMADD:
51397 case X86ISD::FNMSUB:
51398 case X86ISD::FMADD_RND:
51399 case X86ISD::FMSUB_RND:
51400 case X86ISD::FNMADD_RND:
51401 case X86ISD::FNMSUB_RND: {
51402 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
51403 !(SVT == MVT::f32 || SVT == MVT::f64) ||
51404 !isOperationLegal(ISD::FMA, VT))
51405 break;
51407 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
51408 // if it may have signed zeros.
51409 if (!Flags.hasNoSignedZeros())
51410 break;
51412 // This is always negatible for free but we might be able to remove some
51413 // extra operand negations as well.
51414 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
51415 for (int i = 0; i != 3; ++i)
51416 NewOps[i] = getCheaperNegatedExpression(
51417 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
51419 bool NegA = !!NewOps[0];
51420 bool NegB = !!NewOps[1];
51421 bool NegC = !!NewOps[2];
51422 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
51424 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
51425 : NegatibleCost::Neutral;
51427 // Fill in the non-negated ops with the original values.
51428 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
51429 if (!NewOps[i])
51430 NewOps[i] = Op.getOperand(i);
51431 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
51433 case X86ISD::FRCP:
51434 if (SDValue NegOp0 =
51435 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
51436 ForCodeSize, Cost, Depth + 1))
51437 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
51438 break;
51441 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
51442 ForCodeSize, Cost, Depth);
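/// Replace a vector X86ISD FP logic op (FAND/FANDN/FOR/FXOR) with the
/// equivalent integer logic op when integer vector types are available.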
51445 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
51446 const X86Subtarget &Subtarget) {
51447 MVT VT = N->getSimpleValueType(0);
51448 // If we have integer vector types available, use the integer opcodes.
51449 if (!VT.isVector() || !Subtarget.hasSSE2())
51450 return SDValue();
51452 SDLoc dl(N);
51454 unsigned IntBits = VT.getScalarSizeInBits();
51455 MVT IntSVT = MVT::getIntegerVT(IntBits);
51456 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
51458 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
51459 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
51460 unsigned IntOpcode;
51461 switch (N->getOpcode()) {
51462 default: llvm_unreachable("Unexpected FP logic op");
51463 case X86ISD::FOR: IntOpcode = ISD::OR; break;
51464 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
51465 case X86ISD::FAND: IntOpcode = ISD::AND; break;
51466 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
51468 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
51469 return DAG.getBitcast(VT, IntOp);
51473 /// Fold xor (setcc cond, val), 1 --> setcc (inverted(cond), val)
51474 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
51475 if (N->getOpcode() != ISD::XOR)
51476 return SDValue();
51478 SDValue LHS = N->getOperand(0);
51479 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
51480 return SDValue();
51482 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
51483 X86::CondCode(LHS->getConstantOperandVal(0)));
51484 SDLoc DL(N);
51485 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
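// Fold (xor (ctlz_zero_undef X), BitWidth - 1) and the equivalent
// (sub BitWidth - 1, (ctlz_zero_undef X)) into X86ISD::BSR, which computes
// the index of the most significant set bit directly. Skipped when the
// target has fast LZCNT.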
51488 static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
51489 const X86Subtarget &Subtarget) {
51490 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
51491 "Invalid opcode for combing with CTLZ");
51492 if (Subtarget.hasFastLZCNT())
51493 return SDValue();
51495 EVT VT = N->getValueType(0);
51496 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
51497 (VT != MVT::i64 || !Subtarget.is64Bit()))
51498 return SDValue();
51500 SDValue N0 = N->getOperand(0);
51501 SDValue N1 = N->getOperand(1);
51503 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
51504 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
51505 return SDValue();
51507 SDValue OpCTLZ;
51508 SDValue OpSizeTM1;
51510 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
51511 OpCTLZ = N1;
51512 OpSizeTM1 = N0;
51513 } else if (N->getOpcode() == ISD::SUB) {
51514 return SDValue();
51515 } else {
51516 OpCTLZ = N0;
51517 OpSizeTM1 = N1;
51520 if (!OpCTLZ.hasOneUse())
51521 return SDValue();
51522 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
51523 if (!C)
51524 return SDValue();
51526 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
51527 return SDValue();
51528 SDLoc DL(N);
51529 EVT OpVT = VT;
51530 SDValue Op = OpCTLZ.getOperand(0);
51531 if (VT == MVT::i8) {
51532 // Zero extend to i32 since there is no i8 bsr.
51533 OpVT = MVT::i32;
51534 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
51537 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
51538 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
51539 if (VT == MVT::i8)
51540 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
51542 return Op;
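/// Do target-specific DAG combines on ISD::XOR nodes.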
51545 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
51546 TargetLowering::DAGCombinerInfo &DCI,
51547 const X86Subtarget &Subtarget) {
51548 SDValue N0 = N->getOperand(0);
51549 SDValue N1 = N->getOperand(1);
51550 EVT VT = N->getValueType(0);
51552 // If this is SSE1-only, convert to FXOR to avoid scalarization.
51553 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51554 return DAG.getBitcast(MVT::v4i32,
51555 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
51556 DAG.getBitcast(MVT::v4f32, N0),
51557 DAG.getBitcast(MVT::v4f32, N1)));
51560 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
51561 return Cmp;
51563 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
51564 return R;
51566 if (SDValue R = combineBitOpWithShift(N, DAG))
51567 return R;
51569 if (SDValue R = combineBitOpWithPACK(N, DAG))
51570 return R;
51572 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
51573 return FPLogic;
51575 if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
51576 return R;
51578 if (DCI.isBeforeLegalizeOps())
51579 return SDValue();
51581 if (SDValue SetCC = foldXor1SetCC(N, DAG))
51582 return SetCC;
51584 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
51585 return R;
51587 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
51588 return RV;
51590 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
51591 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51592 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
51593 N0.getOperand(0).getValueType().isVector() &&
51594 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
51595 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
51596 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
51597 N0.getOperand(0).getValueType()));
51600 // Handle AVX512 mask widening.
51601 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
51602 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
51603 VT.getVectorElementType() == MVT::i1 &&
51604 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
51605 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
51606 return DAG.getNode(
51607 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
51608 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
51609 N0.getOperand(2));
51612 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
51613 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
51614 // TODO: Under what circumstances could this be performed in DAGCombine?
51615 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
51616 N0.getOperand(0).getOpcode() == N->getOpcode()) {
51617 SDValue TruncExtSrc = N0.getOperand(0);
51618 auto *N1C = dyn_cast<ConstantSDNode>(N1);
51619 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
51620 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
51621 SDLoc DL(N);
51622 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
51623 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
51624 return DAG.getNode(ISD::XOR, DL, VT, LHS,
51625 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
51629 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
51630 return R;
51632 return combineFneg(N, DAG, DCI, Subtarget);
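/// Do target-specific DAG combines on X86ISD::BEXTR nodes by simplifying
/// their inputs via SimplifyDemandedBits.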
51635 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
51636 TargetLowering::DAGCombinerInfo &DCI,
51637 const X86Subtarget &Subtarget) {
51638 EVT VT = N->getValueType(0);
51639 unsigned NumBits = VT.getSizeInBits();
51641 // TODO - Constant Folding.
51643 // Simplify the inputs.
51644 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51645 APInt DemandedMask(APInt::getAllOnes(NumBits));
51646 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
51647 return SDValue(N, 0);
51649 return SDValue();
51652 static bool isNullFPScalarOrVectorConst(SDValue V) {
51653 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
51656 /// If a value is a scalar FP zero or a vector FP zero (potentially including
51657 /// undefined elements), return a zero constant that may be used to fold away
51658 /// that value. In the case of a vector, the returned constant will not contain
51659 /// undefined elements even if the input parameter does. This makes it suitable
51660 /// to be used as a replacement operand with operations (eg, bitwise-and) where
51661 /// an undef should not propagate.
51662 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
51663 const X86Subtarget &Subtarget) {
51664 if (!isNullFPScalarOrVectorConst(V))
51665 return SDValue();
51667 if (V.getValueType().isVector())
51668 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
51670 return V;
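/// Fold an FAND where one operand is an FXOR with an all-ones constant into a
/// single FANDN node (scalar types, plus v4f32 on SSE1-only targets).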
51673 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
51674 const X86Subtarget &Subtarget) {
51675 SDValue N0 = N->getOperand(0);
51676 SDValue N1 = N->getOperand(1);
51677 EVT VT = N->getValueType(0);
51678 SDLoc DL(N);
51680 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
51681 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
51682 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
51683 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
51684 return SDValue();
51686 auto isAllOnesConstantFP = [](SDValue V) {
51687 if (V.getSimpleValueType().isVector())
51688 return ISD::isBuildVectorAllOnes(V.getNode());
51689 auto *C = dyn_cast<ConstantFPSDNode>(V);
51690 return C && C->getConstantFPValue()->isAllOnesValue();
51693 // fand (fxor X, -1), Y --> fandn X, Y
51694 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
51695 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
51697 // fand X, (fxor Y, -1) --> fandn Y, X
51698 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
51699 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
51701 return SDValue();
51704 /// Do target-specific dag combines on X86ISD::FAND nodes.
51705 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
51706 const X86Subtarget &Subtarget) {
51707 // FAND(0.0, x) -> 0.0
51708 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
51709 return V;
51711 // FAND(x, 0.0) -> 0.0
51712 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
51713 return V;
51715 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
51716 return V;
51718 return lowerX86FPLogicOp(N, DAG, Subtarget);
51721 /// Do target-specific dag combines on X86ISD::FANDN nodes.
51722 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
51723 const X86Subtarget &Subtarget) {
51724 // FANDN(0.0, x) -> x
51725 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
51726 return N->getOperand(1);
51728 // FANDN(x, 0.0) -> 0.0
51729 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
51730 return V;
51732 return lowerX86FPLogicOp(N, DAG, Subtarget);
51735 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
51736 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
51737 TargetLowering::DAGCombinerInfo &DCI,
51738 const X86Subtarget &Subtarget) {
51739 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
51741 // F[X]OR(0.0, x) -> x
51742 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
51743 return N->getOperand(1);
51745 // F[X]OR(x, 0.0) -> x
51746 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
51747 return N->getOperand(0);
51749 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
51750 return NewVal;
51752 return lowerX86FPLogicOp(N, DAG, Subtarget);
51755 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
51756 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
51757 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
51759 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
51760 if (!DAG.getTarget().Options.NoNaNsFPMath ||
51761 !DAG.getTarget().Options.NoSignedZerosFPMath)
51762 return SDValue();
51764 // Since no NaNs and no signed zeros are allowed, convert the FMAX and FMIN
51765 // nodes into FMAXC and FMINC, which are commutative operations.
51766 unsigned NewOp = 0;
51767 switch (N->getOpcode()) {
51768 default: llvm_unreachable("unknown opcode");
51769 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
51770 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
51773 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
51774 N->getOperand(0), N->getOperand(1));
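/// Lower ISD::FMINNUM/FMAXNUM to X86ISD::FMIN/FMAX, inserting a select to
/// handle a possibly-NaN operand when NaNs cannot be ignored.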
51777 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
51778 const X86Subtarget &Subtarget) {
51779 EVT VT = N->getValueType(0);
51780 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
51781 return SDValue();
51783 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51785 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
51786 (Subtarget.hasSSE2() && VT == MVT::f64) ||
51787 (Subtarget.hasFP16() && VT == MVT::f16) ||
51788 (VT.isVector() && TLI.isTypeLegal(VT))))
51789 return SDValue();
51791 SDValue Op0 = N->getOperand(0);
51792 SDValue Op1 = N->getOperand(1);
51793 SDLoc DL(N);
51794 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
51796 // If we don't have to respect NaN inputs, this is a direct translation to x86
51797 // min/max instructions.
51798 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
51799 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
51801 // If one of the operands is known non-NaN use the native min/max instructions
51802 // with the non-NaN input as second operand.
51803 if (DAG.isKnownNeverNaN(Op1))
51804 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
51805 if (DAG.isKnownNeverNaN(Op0))
51806 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
51808 // If we have to respect NaN inputs, this takes at least 3 instructions.
51809 // Favor a library call when operating on a scalar and minimizing code size.
51810 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
51811 return SDValue();
51813 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
51814 VT);
51816 // There are 4 possibilities involving NaN inputs, and these are the required
51817 // outputs:
51818 // Op1
51819 // Num NaN
51820 // ----------------
51821 // Num | Max | Op0 |
51822 // Op0 ----------------
51823 // NaN | Op1 | NaN |
51824 // ----------------
51826 // The SSE FP max/min instructions were not designed for this case, but rather
51827 // to implement:
51828 // Min = Op1 < Op0 ? Op1 : Op0
51829 // Max = Op1 > Op0 ? Op1 : Op0
51831 // So they always return Op0 if either input is a NaN. However, we can still
51832 // use those instructions for fmaxnum by selecting away a NaN input.
51834 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
51835 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
51836 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
51838 // If Op0 is a NaN, select Op1. Otherwise, select the min/max result. If both
51839 // operands are NaN, the NaN value of Op1 is the result.
51840 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
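/// Do target-specific dag combines on X86 vector int-to-fp conversion nodes
/// (such as CVTSI2P/CVTUI2P): simplify demanded elements and narrow a full
/// vector load to a vzload when only the low elements are converted.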
51843 static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
51844 TargetLowering::DAGCombinerInfo &DCI) {
51845 EVT VT = N->getValueType(0);
51846 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51848 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
51849 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
51850 return SDValue(N, 0);
51852 // Convert a full vector load into vzload when not all bits are needed.
51853 SDValue In = N->getOperand(0);
51854 MVT InVT = In.getSimpleValueType();
51855 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
51856 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
51857 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
51858 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
51859 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
51860 MVT MemVT = MVT::getIntegerVT(NumBits);
51861 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
51862 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
51863 SDLoc dl(N);
51864 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
51865 DAG.getBitcast(InVT, VZLoad));
51866 DCI.CombineTo(N, Convert);
51867 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
51868 DCI.recursivelyDeleteUnusedNodes(LN);
51869 return SDValue(N, 0);
51873 return SDValue();
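/// Do target-specific dag combines on X86 vector fp-to-int conversion nodes:
/// narrow a full vector load to a vzload when only the low elements are
/// converted.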
51876 static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
51877 TargetLowering::DAGCombinerInfo &DCI) {
51878 bool IsStrict = N->isTargetStrictFPOpcode();
51879 EVT VT = N->getValueType(0);
51881 // Convert a full vector load into vzload when not all bits are needed.
51882 SDValue In = N->getOperand(IsStrict ? 1 : 0);
51883 MVT InVT = In.getSimpleValueType();
51884 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
51885 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
51886 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
51887 LoadSDNode *LN = cast<LoadSDNode>(In);
51888 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
51889 MVT MemVT = MVT::getFloatingPointVT(NumBits);
51890 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
51891 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
51892 SDLoc dl(N);
51893 if (IsStrict) {
51894 SDValue Convert =
51895 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
51896 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
51897 DCI.CombineTo(N, Convert, Convert.getValue(1));
51898 } else {
51899 SDValue Convert =
51900 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
51901 DCI.CombineTo(N, Convert);
51903 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
51904 DCI.recursivelyDeleteUnusedNodes(LN);
51905 return SDValue(N, 0);
51909 return SDValue();
51912 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
51913 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
51914 TargetLowering::DAGCombinerInfo &DCI,
51915 const X86Subtarget &Subtarget) {
51916 SDValue N0 = N->getOperand(0);
51917 SDValue N1 = N->getOperand(1);
51918 MVT VT = N->getSimpleValueType(0);
51919 int NumElts = VT.getVectorNumElements();
51920 unsigned EltSizeInBits = VT.getScalarSizeInBits();
51921 SDLoc DL(N);
51923 // ANDNP(undef, x) -> 0
51924 // ANDNP(x, undef) -> 0
51925 if (N0.isUndef() || N1.isUndef())
51926 return DAG.getConstant(0, DL, VT);
51928 // ANDNP(0, x) -> x
51929 if (ISD::isBuildVectorAllZeros(N0.getNode()))
51930 return N1;
51932 // ANDNP(x, 0) -> 0
51933 if (ISD::isBuildVectorAllZeros(N1.getNode()))
51934 return DAG.getConstant(0, DL, VT);
51936 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
51937 if (ISD::isBuildVectorAllOnes(N1.getNode()))
51938 return DAG.getNOT(DL, N0, VT);
51940 // Turn ANDNP back to AND if input is inverted.
51941 if (SDValue Not = IsNOT(N0, DAG))
51942 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
51944 // Fold for better commutativity:
51945 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
51946 if (N1->hasOneUse())
51947 if (SDValue Not = IsNOT(N1, DAG))
51948 return DAG.getNOT(
51949 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
51951 // Constant Folding
51952 APInt Undefs0, Undefs1;
51953 SmallVector<APInt> EltBits0, EltBits1;
51954 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0)) {
51955 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
51956 SmallVector<APInt> ResultBits;
51957 for (int I = 0; I != NumElts; ++I)
51958 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
51959 return getConstVector(ResultBits, VT, DAG, DL);
51962 // Constant fold NOT(N0) to allow us to use AND.
51963 // Ensure this is only performed if we can confirm that the bitcasted source
51964 // has one use, to prevent an infinite loop with canonicalizeBitSelect.
51965 if (N0->hasOneUse()) {
51966 SDValue BC0 = peekThroughOneUseBitcasts(N0);
51967 if (BC0.getOpcode() != ISD::BITCAST) {
51968 for (APInt &Elt : EltBits0)
51969 Elt = ~Elt;
51970 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
51971 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
51976 // Attempt to recursively combine a bitmask ANDNP with shuffles.
51977 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51978 SDValue Op(N, 0);
51979 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51980 return Res;
51982 // If either operand is a constant mask, then only the elements that aren't
51983 // zero are actually demanded by the other operand.
51984 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
51985 APInt UndefElts;
51986 SmallVector<APInt> EltBits;
51987 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51988 APInt DemandedElts = APInt::getAllOnes(NumElts);
51989 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51990 EltBits)) {
51991 DemandedBits.clearAllBits();
51992 DemandedElts.clearAllBits();
51993 for (int I = 0; I != NumElts; ++I) {
51994 if (UndefElts[I]) {
51995 // We can't assume an undef src element gives an undef dst - the
51996 // other src might be zero.
51997 DemandedBits.setAllBits();
51998 DemandedElts.setBit(I);
51999 } else if ((Invert && !EltBits[I].isAllOnes()) ||
52000 (!Invert && !EltBits[I].isZero())) {
52001 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
52002 DemandedElts.setBit(I);
52006 return std::make_pair(DemandedBits, DemandedElts);
52008 APInt Bits0, Elts0;
52009 APInt Bits1, Elts1;
52010 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
52011 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
52013 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52014 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
52015 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
52016 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
52017 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52018 if (N->getOpcode() != ISD::DELETED_NODE)
52019 DCI.AddToWorklist(N);
52020 return SDValue(N, 0);
52024 return SDValue();
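/// Do target-specific dag combines on X86ISD::BT nodes.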
52027 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
52028 TargetLowering::DAGCombinerInfo &DCI) {
52029 SDValue N1 = N->getOperand(1);
52031 // BT ignores high bits in the bit index operand.
52032 unsigned BitWidth = N1.getValueSizeInBits();
52033 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
52034 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
52035 if (N->getOpcode() != ISD::DELETED_NODE)
52036 DCI.AddToWorklist(N);
52037 return SDValue(N, 0);
52040 return SDValue();
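/// Do target-specific dag combines on (STRICT_)CVTPH2PS nodes: only the low
/// half of the v8i16 source is demanded for a v4f32 result, and a full vector
/// load can be narrowed to a vzload.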
52043 static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
52044 TargetLowering::DAGCombinerInfo &DCI) {
52045 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
52046 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
52048 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
52049 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52050 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
52051 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
52052 if (N->getOpcode() != ISD::DELETED_NODE)
52053 DCI.AddToWorklist(N);
52054 return SDValue(N, 0);
52057 // Convert a full vector load into vzload when not all bits are needed.
52058 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
52059 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
52060 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
52061 SDLoc dl(N);
52062 if (IsStrict) {
52063 SDValue Convert = DAG.getNode(
52064 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
52065 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
52066 DCI.CombineTo(N, Convert, Convert.getValue(1));
52067 } else {
52068 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
52069 DAG.getBitcast(MVT::v8i16, VZLoad));
52070 DCI.CombineTo(N, Convert);
52073 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52074 DCI.recursivelyDeleteUnusedNodes(LN);
52075 return SDValue(N, 0);
52080 return SDValue();
52083 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
52084 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
52085 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
52087 EVT DstVT = N->getValueType(0);
52089 SDValue N0 = N->getOperand(0);
52090 SDValue N1 = N->getOperand(1);
52091 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
52093 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
52094 return SDValue();
52096 // Look through single use any_extends / truncs.
52097 SDValue IntermediateBitwidthOp;
52098 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
52099 N0.hasOneUse()) {
52100 IntermediateBitwidthOp = N0;
52101 N0 = N0.getOperand(0);
52104 // See if we have a single use cmov.
52105 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
52106 return SDValue();
52108 SDValue CMovOp0 = N0.getOperand(0);
52109 SDValue CMovOp1 = N0.getOperand(1);
52111 // Make sure both operands are constants.
52112 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
52113 !isa<ConstantSDNode>(CMovOp1.getNode()))
52114 return SDValue();
52116 SDLoc DL(N);
52118 // If we looked through an any_extend/trunc above, apply the same op to the constants.
52119 if (IntermediateBitwidthOp) {
52120 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
52121 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
52122 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
52125 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
52126 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
52128 EVT CMovVT = DstVT;
52129 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
52130 if (DstVT == MVT::i16) {
52131 CMovVT = MVT::i32;
52132 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
52133 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
52136 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
52137 N0.getOperand(2), N0.getOperand(3));
52139 if (CMovVT != DstVT)
52140 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
52142 return CMov;
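/// Do target-specific dag combines on ISD::SIGN_EXTEND_INREG nodes.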
52145 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
52146 const X86Subtarget &Subtarget) {
52147 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
52149 if (SDValue V = combineSextInRegCmov(N, DAG))
52150 return V;
52152 EVT VT = N->getValueType(0);
52153 SDValue N0 = N->getOperand(0);
52154 SDValue N1 = N->getOperand(1);
52155 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
52156 SDLoc dl(N);
52158 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
52159 // since there is no sign-extended shift right operation on a vector with
52160 // 64-bit elements.
52161 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
52162 //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
52163 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
52164 N0.getOpcode() == ISD::SIGN_EXTEND)) {
52165 SDValue N00 = N0.getOperand(0);
52167 // EXTLOAD has a better solution on AVX2: it may be replaced with an
52168 // X86ISD::VSEXT node.
52169 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
52170 if (!ISD::isNormalLoad(N00.getNode()))
52171 return SDValue();
52173 // Attempt to promote any comparison mask ops before the SIGN_EXTEND_INREG
52174 // gets in the way.
52175 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
52176 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
52178 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
52179 SDValue Tmp =
52180 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
52181 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
52184 return SDValue();
52187 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
52188 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
52189 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
52190 /// opportunities to combine math ops, use an LEA, or use a complex addressing
52191 /// mode. This can eliminate extend, add, and shift instructions.
52192 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
52193 const X86Subtarget &Subtarget) {
52194 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
52195 Ext->getOpcode() != ISD::ZERO_EXTEND)
52196 return SDValue();
52198 // TODO: This should be valid for other integer types.
52199 EVT VT = Ext->getValueType(0);
52200 if (VT != MVT::i64)
52201 return SDValue();
52203 SDValue Add = Ext->getOperand(0);
52204 if (Add.getOpcode() != ISD::ADD)
52205 return SDValue();
52207 SDValue AddOp0 = Add.getOperand(0);
52208 SDValue AddOp1 = Add.getOperand(1);
52209 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
52210 bool NSW = Add->getFlags().hasNoSignedWrap();
52211 bool NUW = Add->getFlags().hasNoUnsignedWrap();
52212 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
52213 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
52215 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
52216 // into the 'zext'
52217 if ((Sext && !NSW) || (!Sext && !NUW))
52218 return SDValue();
52220 // Having a constant operand to the 'add' ensures that we are not increasing
52221 // the instruction count because the constant is extended for free below.
52222 // A constant operand can also become the displacement field of an LEA.
52223 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
52224 if (!AddOp1C)
52225 return SDValue();
52227 // Don't make the 'add' bigger if there's no hope of combining it with some
52228 // other 'add' or 'shl' instruction.
52229 // TODO: It may be profitable to generate simpler LEA instructions in place
52230 // of single 'add' instructions, but the cost model for selecting an LEA
52231 // currently has a high threshold.
52232 bool HasLEAPotential = false;
52233 for (auto *User : Ext->uses()) {
52234 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
52235 HasLEAPotential = true;
52236 break;
52239 if (!HasLEAPotential)
52240 return SDValue();
52242 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
52243 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
52244 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
52245 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
52247 // The wider add is guaranteed not to wrap because both operands are
52248 // sign-extended (for nsw) or zero-extended (for nuw).
52249 SDNodeFlags Flags;
52250 Flags.setNoSignedWrap(NSW);
52251 Flags.setNoUnsignedWrap(NUW);
52252 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
52255 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
52256 // operands and the result of CMOV is not used anywhere else - promote CMOV
52257 // itself instead of promoting its result. This could be beneficial, because:
52258 // 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
52259 // (or more) pseudo-CMOVs only when they go one-after-another and
52260 // getting rid of result extension code after CMOV will help that.
52261 // 2) Promotion of constant CMOV arguments is free, hence the
52262 // {ANY,SIGN,ZERO}_EXTEND will just be deleted.
52263 // 3) The 16-bit CMOV encoding is 4 bytes and the 32-bit encoding is 3 bytes,
52264 // so this promotion is also good in terms of code size.
52265 // (A 64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
52266 // promotion.)
52267 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
52268 SDValue CMovN = Extend->getOperand(0);
52269 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
52270 return SDValue();
52272 EVT TargetVT = Extend->getValueType(0);
52273 unsigned ExtendOpcode = Extend->getOpcode();
52274 SDLoc DL(Extend);
52276 EVT VT = CMovN.getValueType();
52277 SDValue CMovOp0 = CMovN.getOperand(0);
52278 SDValue CMovOp1 = CMovN.getOperand(1);
52280 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
52281 !isa<ConstantSDNode>(CMovOp1.getNode()))
52282 return SDValue();
52284 // Only extend to i32 or i64.
52285 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
52286 return SDValue();
52288 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
52289 // are free.
52290 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
52291 return SDValue();
52293 // If this is a zero extend to i64, we should only extend to i32 and use a free
52294 // zero extend to finish.
52295 EVT ExtendVT = TargetVT;
52296 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
52297 ExtendVT = MVT::i32;
52299 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
52300 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
52302 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
52303 CMovN.getOperand(2), CMovN.getOperand(3));
52305 // Finish extending if needed.
52306 if (ExtendVT != TargetVT)
52307 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
52309 return Res;
52312 // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
52313 // result type.
52314 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
52315 const X86Subtarget &Subtarget) {
52316 SDValue N0 = N->getOperand(0);
52317 EVT VT = N->getValueType(0);
52318 SDLoc dl(N);
52320 // Only do this combine with AVX512 for vector extends.
52321 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
52322 return SDValue();
52324 // Only combine legal element types.
52325 EVT SVT = VT.getVectorElementType();
52326 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
52327 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
52328 return SDValue();
52330 // We don't have a CMPP instruction for vXf16.
52331 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
52332 return SDValue();
52333 // We can only do this if the vector size is 256 bits or less.
52334 unsigned Size = VT.getSizeInBits();
52335 if (Size > 256 && Subtarget.useAVX512Regs())
52336 return SDValue();
52338 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
52339 // those are the only integer compares we have.
52340 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
52341 if (ISD::isUnsignedIntSetCC(CC))
52342 return SDValue();
52344 // Only do this combine if the extension will be fully consumed by the setcc.
52345 EVT N00VT = N0.getOperand(0).getValueType();
52346 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
52347 if (Size != MatchingVecType.getSizeInBits())
52348 return SDValue();
52350 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
52352 if (N->getOpcode() == ISD::ZERO_EXTEND)
52353 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
52355 return Res;
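/// Do target-specific dag combines on ISD::SIGN_EXTEND nodes.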
52358 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
52359 TargetLowering::DAGCombinerInfo &DCI,
52360 const X86Subtarget &Subtarget) {
52361 SDValue N0 = N->getOperand(0);
52362 EVT VT = N->getValueType(0);
52363 SDLoc DL(N);
52365 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
52366 if (!DCI.isBeforeLegalizeOps() &&
52367 N0.getOpcode() == X86ISD::SETCC_CARRY) {
52368 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
52369 N0->getOperand(1));
52370 bool ReplaceOtherUses = !N0.hasOneUse();
52371 DCI.CombineTo(N, Setcc);
52372 // Replace other uses with a truncate of the widened setcc_carry.
52373 if (ReplaceOtherUses) {
52374 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
52375 N0.getValueType(), Setcc);
52376 DCI.CombineTo(N0.getNode(), Trunc);
52379 return SDValue(N, 0);
52382 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
52383 return NewCMov;
52385 if (!DCI.isBeforeLegalizeOps())
52386 return SDValue();
52388 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
52389 return V;
52391 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
52392 DAG, DCI, Subtarget))
52393 return V;
52395 if (VT.isVector()) {
52396 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
52397 return R;
52399 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
52400 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
52403 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
52404 return NewAdd;
52406 return SDValue();
52409 // Inverting a constant vector is profitable if it can be eliminated and the
52410 // inverted vector is already present in the DAG. Otherwise, it will be loaded
52411 // anyway.
52413 // We determine which of the values can be completely eliminated and invert it.
52414 // If both are eliminable, select a vector with the first negative element.
52415 static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
52416 assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
52417 "ConstantFP build vector expected");
52418 // Check if we can eliminate V. We assume that if a value is only used in
52419 // FMAs, we can eliminate it, since this function is invoked for each FMA
52420 // with this vector.
52421 auto IsNotFMA = [](SDNode *Use) {
52422 return Use->getOpcode() != ISD::FMA && Use->getOpcode() != ISD::STRICT_FMA;
52424 if (llvm::any_of(V->uses(), IsNotFMA))
52425 return SDValue();
52427 SmallVector<SDValue, 8> Ops;
52428 EVT VT = V.getValueType();
52429 EVT EltVT = VT.getVectorElementType();
52430 for (auto Op : V->op_values()) {
52431 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
52432 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
52433 } else {
52434 assert(Op.isUndef());
52435 Ops.push_back(DAG.getUNDEF(EltVT));
52439 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
52440 if (!NV)
52441 return SDValue();
52443 // If an inverted version cannot be eliminated, choose it instead of the
52444 // original version.
52445 if (llvm::any_of(NV->uses(), IsNotFMA))
52446 return SDValue(NV, 0);
52448 // If the inverted version can also be eliminated, we have to consistently
52449 // prefer one of the values: we prefer the constant whose first (non-undef)
52450 // element is negative.
52451 // N.B. We need to skip undefs that may precede a value.
52452 for (auto op : V->op_values()) {
52453 if (auto *Cst = dyn_cast<ConstantFPSDNode>(op)) {
52454 if (Cst->isNegative())
52455 return SDValue();
52456 break;
52459 return SDValue(NV, 0);
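/// Do target-specific dag combines on FMA nodes: split a reassociable FMA into
/// FMUL+FADD when FMA is unavailable, and fold negated operands into the
/// appropriate FMA opcode variant.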
52462 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
52463 TargetLowering::DAGCombinerInfo &DCI,
52464 const X86Subtarget &Subtarget) {
52465 SDLoc dl(N);
52466 EVT VT = N->getValueType(0);
52467 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
52469 // Let legalize expand this if it isn't a legal type yet.
52470 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52471 if (!TLI.isTypeLegal(VT))
52472 return SDValue();
52474 SDValue A = N->getOperand(IsStrict ? 1 : 0);
52475 SDValue B = N->getOperand(IsStrict ? 2 : 1);
52476 SDValue C = N->getOperand(IsStrict ? 3 : 2);
52478 // If the operation allows fast-math and the target does not support FMA,
52479 // split this into mul+add to avoid libcall(s).
52480 SDNodeFlags Flags = N->getFlags();
52481 if (!IsStrict && Flags.hasAllowReassociation() &&
52482 TLI.isOperationExpand(ISD::FMA, VT)) {
52483 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
52484 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
52487 EVT ScalarVT = VT.getScalarType();
52488 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
52489 !Subtarget.hasAnyFMA()) &&
52490 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
52491 return SDValue();
52493 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
52494 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
52495 bool LegalOperations = !DCI.isBeforeLegalizeOps();
52496 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
52497 CodeSize)) {
52498 V = NegV;
52499 return true;
52501 // Look through extract_vector_elts. If it comes from an FNEG, create a
52502 // new extract from the FNEG input.
52503 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
52504 isNullConstant(V.getOperand(1))) {
52505 SDValue Vec = V.getOperand(0);
52506 if (SDValue NegV = TLI.getCheaperNegatedExpression(
52507 Vec, DAG, LegalOperations, CodeSize)) {
52508 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
52509 NegV, V.getOperand(1));
52510 return true;
52513 // Lookup if there is an inverted version of constant vector V in DAG.
52514 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
52515 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
52516 V = NegV;
52517 return true;
52520 return false;
52523 // Do not convert the passthru input of scalar intrinsics.
52524 // FIXME: We could allow negations of the lower element only.
52525 bool NegA = invertIfNegative(A);
52526 bool NegB = invertIfNegative(B);
52527 bool NegC = invertIfNegative(C);
52529 if (!NegA && !NegB && !NegC)
52530 return SDValue();
52532 unsigned NewOpcode =
52533 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
52535 // Propagate fast-math-flags to new FMA node.
52536 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
52537 if (IsStrict) {
52538 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
52539 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
52540 {N->getOperand(0), A, B, C});
52541 } else {
52542 if (N->getNumOperands() == 4)
52543 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
52544 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
52548 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
52549 // Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
52550 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
52551 TargetLowering::DAGCombinerInfo &DCI) {
52552 SDLoc dl(N);
52553 EVT VT = N->getValueType(0);
52554 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52555 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
52556 bool LegalOperations = !DCI.isBeforeLegalizeOps();
52558 SDValue N2 = N->getOperand(2);
52560 SDValue NegN2 =
52561 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
52562 if (!NegN2)
52563 return SDValue();
52564 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
52566 if (N->getNumOperands() == 4)
52567 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
52568 NegN2, N->getOperand(3));
52569 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
52570 NegN2);
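/// Do target-specific dag combines on ISD::ZERO_EXTEND and ISD::ANY_EXTEND
/// nodes.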
52573 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
52574 TargetLowering::DAGCombinerInfo &DCI,
52575 const X86Subtarget &Subtarget) {
52576 SDLoc dl(N);
52577 SDValue N0 = N->getOperand(0);
52578 EVT VT = N->getValueType(0);
52580 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
52581 // FIXME: Is this needed? We don't seem to have any tests for it.
52582 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
52583 N0.getOpcode() == X86ISD::SETCC_CARRY) {
52584 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
52585 N0->getOperand(1));
52586 bool ReplaceOtherUses = !N0.hasOneUse();
52587 DCI.CombineTo(N, Setcc);
52588 // Replace other uses with a truncate of the widened setcc_carry.
52589 if (ReplaceOtherUses) {
52590 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
52591 N0.getValueType(), Setcc);
52592 DCI.CombineTo(N0.getNode(), Trunc);
52595 return SDValue(N, 0);
52598 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
52599 return NewCMov;
52601 if (DCI.isBeforeLegalizeOps())
52602 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
52603 return V;
52605 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
52606 DAG, DCI, Subtarget))
52607 return V;
52609 if (VT.isVector())
52610 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
52611 return R;
52613 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
52614 return NewAdd;
52616 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
52617 return R;
52619 // TODO: Combine with any target/faux shuffle.
52620 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
52621 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
52622 SDValue N00 = N0.getOperand(0);
52623 SDValue N01 = N0.getOperand(1);
52624 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
52625 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
52626 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
52627 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
52628 return concatSubVectors(N00, N01, DAG, dl);
52632 return SDValue();
52635 /// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
52636 /// pre-promote its result type since vXi1 vectors don't get promoted
52637 /// during type legalization.
52638 static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
52639 SDValue RHS, ISD::CondCode CC,
52640 const SDLoc &DL, SelectionDAG &DAG,
52641 const X86Subtarget &Subtarget) {
52642 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
52643 VT.getVectorElementType() == MVT::i1 &&
52644 (OpVT.getVectorElementType() == MVT::i8 ||
52645 OpVT.getVectorElementType() == MVT::i16)) {
52646 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
52647 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
52649 return SDValue();
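/// Do target-specific dag combines on ISD::SETCC nodes.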
52652 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
52653 TargetLowering::DAGCombinerInfo &DCI,
52654 const X86Subtarget &Subtarget) {
52655 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
52656 const SDValue LHS = N->getOperand(0);
52657 const SDValue RHS = N->getOperand(1);
52658 EVT VT = N->getValueType(0);
52659 EVT OpVT = LHS.getValueType();
52660 SDLoc DL(N);
52662 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
52663 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
52664 Subtarget))
52665 return V;
52667 if (VT == MVT::i1) {
52668 X86::CondCode X86CC;
52669 if (SDValue V =
52670 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
52671 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
52674 if (OpVT.isScalarInteger()) {
52675 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
52676 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
52677 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
52678 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
52679 if (N0.getOperand(0) == N1)
52680 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
52681 N0.getOperand(1));
52682 if (N0.getOperand(1) == N1)
52683 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
52684 N0.getOperand(0));
52686 return SDValue();
52688 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
52689 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
52690 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
52691 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
52693 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
52694 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
52695 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
52696 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
52697 if (N0.getOperand(0) == N1)
52698 return DAG.getNode(ISD::AND, DL, OpVT, N1,
52699 DAG.getNOT(DL, N0.getOperand(1), OpVT));
52700 if (N0.getOperand(1) == N1)
52701 return DAG.getNode(ISD::AND, DL, OpVT, N1,
52702 DAG.getNOT(DL, N0.getOperand(0), OpVT));
52704 return SDValue();
52706 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
52707 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
52708 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
52709 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
52711 // cmpeq(trunc(x),C) --> cmpeq(x,C)
52712 // cmpne(trunc(x),C) --> cmpne(x,C)
52713 // iff the upper bits of x are known zero.
52714 if (LHS.getOpcode() == ISD::TRUNCATE &&
52715 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
52716 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
52717 EVT SrcVT = LHS.getOperand(0).getValueType();
52718 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
52719 OpVT.getScalarSizeInBits());
52720 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52721 auto *C = cast<ConstantSDNode>(RHS);
52722 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
52723 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
52724 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
52725 DAG.getConstant(C->getAPIntValue().zextOrTrunc(
52726 SrcVT.getScalarSizeInBits()),
52727 DL, SrcVT),
52728 CC);
52731 // With C as a power of 2 and C != 0 and C != INT_MIN:
52732 // icmp eq Abs(X) C ->
52733 // (icmp eq X, C) | (icmp eq X, -C)
52734 // icmp ne Abs(X) C ->
52735 // (icmp ne X, C) & (icmp ne X, -C)
52736 // Both of these patterns can be better optimized in
52737 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
52738 // integers which is checked above.
52739 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
52740 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
52741 const APInt &CInt = C->getAPIntValue();
52742 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
52743 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
52744 SDValue BaseOp = LHS.getOperand(0);
52745 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
52746 SDValue SETCC1 = DAG.getSetCC(
52747 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
52748 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
52749 SETCC0, SETCC1);
52756 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
52757 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
52758 // Using temporaries to avoid messing up operand ordering for later
52759 // transformations if this doesn't work.
52760 SDValue Op0 = LHS;
52761 SDValue Op1 = RHS;
52762 ISD::CondCode TmpCC = CC;
52763 // Put build_vector on the right.
52764 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
52765 std::swap(Op0, Op1);
52766 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
52769 bool IsSEXT0 =
52770 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
52771 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
52772 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
52774 if (IsSEXT0 && IsVZero1) {
52775 assert(VT == Op0.getOperand(0).getValueType() &&
52776 "Unexpected operand type");
52777 if (TmpCC == ISD::SETGT)
52778 return DAG.getConstant(0, DL, VT);
52779 if (TmpCC == ISD::SETLE)
52780 return DAG.getConstant(1, DL, VT);
52781 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
52782 return DAG.getNOT(DL, Op0.getOperand(0), VT);
52784 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
52785 "Unexpected condition code!");
52786 return Op0.getOperand(0);
52790 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets
52791 // there are only signed comparisons (`PCMPGT`), and on AVX512 it's often
52792 // better to use `PCMPGT` if the result is meant to stay in a vector (if it's
52793 // going to a mask, AVX512 also provides unsigned comparisons).
52794 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
52795 bool CanMakeSigned = false;
52796 if (ISD::isUnsignedIntSetCC(CC)) {
52797 KnownBits CmpKnown =
52798 DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
52799 // If we know LHS/RHS share the same sign bit at each element we can
52800 // make this signed.
52801 // NOTE: `computeKnownBits` on a vector type aggregates common bits
52802 // across all lanes. So a pattern where the sign varies from lane to
52803 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
52804 // missed. We could get around this by demanding each lane
52805 // independently, but this isn't the most important optimization and
52806 // that may eat into compile time.
52807 CanMakeSigned =
52808 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
52810 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
52811 SDValue LHSOut = LHS;
52812 SDValue RHSOut = RHS;
52813 ISD::CondCode NewCC = CC;
52814 switch (CC) {
52815 case ISD::SETGE:
52816 case ISD::SETUGE:
52817 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
52818 /*NSW*/ true))
52819 LHSOut = NewLHS;
52820 else if (SDValue NewRHS = incDecVectorConstant(
52821 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
52822 RHSOut = NewRHS;
52823 else
52824 break;
52826 [[fallthrough]];
52827 case ISD::SETUGT:
52828 NewCC = ISD::SETGT;
52829 break;
52831 case ISD::SETLE:
52832 case ISD::SETULE:
52833 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
52834 /*NSW*/ true))
52835 LHSOut = NewLHS;
52836 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
52837 /*NSW*/ true))
52838 RHSOut = NewRHS;
52839 else
52840 break;
52842 [[fallthrough]];
52843 case ISD::SETULT:
52844 // Will be swapped to SETGT in LowerVSETCC*.
52845 NewCC = ISD::SETLT;
52846 break;
52847 default:
52848 break;
52850 if (NewCC != CC) {
52851 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
52852 NewCC, DL, DAG, Subtarget))
52853 return R;
52854 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
52859 if (SDValue R =
52860 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
52861 return R;
52863 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
52864 // to avoid scalarization via legalization because v4i32 is not a legal type.
52865 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
52866 LHS.getValueType() == MVT::v4f32)
52867 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
52869 // X pred 0.0 --> X pred -X
52870 // If the negation of X already exists, use it in the comparison. This removes
52871 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
52872 // instructions in patterns with a 'select' node.
52873 if (isNullFPScalarOrVectorConst(RHS)) {
52874 SDVTList FNegVT = DAG.getVTList(OpVT);
52875 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
52876 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
52879 return SDValue();
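/// Do target-specific dag combines on X86ISD::MOVMSK nodes.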
52882 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
52883 TargetLowering::DAGCombinerInfo &DCI,
52884 const X86Subtarget &Subtarget) {
52885 SDValue Src = N->getOperand(0);
52886 MVT SrcVT = Src.getSimpleValueType();
52887 MVT VT = N->getSimpleValueType(0);
52888 unsigned NumBits = VT.getScalarSizeInBits();
52889 unsigned NumElts = SrcVT.getVectorNumElements();
52890 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
52891 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
52893 // Perform constant folding.
52894 APInt UndefElts;
52895 SmallVector<APInt, 32> EltBits;
52896 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) {
52897 APInt Imm(32, 0);
52898 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
52899 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
52900 Imm.setBit(Idx);
52902 return DAG.getConstant(Imm, SDLoc(N), VT);
52905 // Look through int->fp bitcasts that don't change the element width.
52906 unsigned EltWidth = SrcVT.getScalarSizeInBits();
52907 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
52908 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
52909 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
52911 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
52912 // with scalar comparisons.
52913 if (SDValue NotSrc = IsNOT(Src, DAG)) {
52914 SDLoc DL(N);
52915 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
52916 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
52917 return DAG.getNode(ISD::XOR, DL, VT,
52918 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
52919 DAG.getConstant(NotMask, DL, VT));
52922 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
52923 // results with scalar comparisons.
52924 if (Src.getOpcode() == X86ISD::PCMPGT &&
52925 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
52926 SDLoc DL(N);
52927 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
52928 return DAG.getNode(ISD::XOR, DL, VT,
52929 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
52930 DAG.getConstant(NotMask, DL, VT));
52933 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
52934 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
52935 // iff pow2splat(c1).
52936 // Use KnownBits to determine if only a single bit is non-zero
52937 // in each element (pow2 or zero), and shift that bit to the msb.
52938 if (Src.getOpcode() == X86ISD::PCMPEQ) {
52939 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
52940 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
52941 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
52942 if (KnownLHS.countMaxPopulation() == 1 &&
52943 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
52944 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
52945 SDLoc DL(N);
52946 MVT ShiftVT = SrcVT;
52947 SDValue ShiftLHS = Src.getOperand(0);
52948 SDValue ShiftRHS = Src.getOperand(1);
52949 if (ShiftVT.getScalarType() == MVT::i8) {
52950 // vXi8 shifts - we only care about the sign bit, so we can use PSLLW.
52951 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
52952 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
52953 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
52955 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
52956 ShiftLHS, ShiftAmt, DAG);
52957 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
52958 ShiftRHS, ShiftAmt, DAG);
52959 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
52960 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
52961 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
52962 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
52966 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
52967 if (N->isOnlyUserOf(Src.getNode())) {
52968 SDValue SrcBC = peekThroughOneUseBitcasts(Src);
52969 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
52970 APInt UndefElts;
52971 SmallVector<APInt, 32> EltBits;
52972 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
52973 UndefElts, EltBits)) {
52974 APInt Mask = APInt::getZero(NumBits);
52975 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
52976 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
52977 Mask.setBit(Idx);
52979 SDLoc DL(N);
52980 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
52981 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
52982 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
52983 DAG.getConstant(Mask, DL, VT));
52988 // Simplify the inputs.
52989 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52990 APInt DemandedMask(APInt::getAllOnes(NumBits));
52991 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52992 return SDValue(N, 0);
52994 return SDValue();
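/// Do target-specific dag combines on X86ISD::TESTP nodes by simplifying the
/// demanded bits of the inputs.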
52997 static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
52998 TargetLowering::DAGCombinerInfo &DCI,
52999 const X86Subtarget &Subtarget) {
53000 MVT VT = N->getSimpleValueType(0);
53001 unsigned NumBits = VT.getScalarSizeInBits();
53003 // Simplify the inputs.
53004 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53005 APInt DemandedMask(APInt::getAllOnes(NumBits));
53006 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53007 return SDValue(N, 0);
53009 return SDValue();
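// X86 masked gather/scatter nodes only test the sign bit of each mask
// element, so only that bit needs to be demanded.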
53012 static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
53013 TargetLowering::DAGCombinerInfo &DCI) {
53014 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
53015 SDValue Mask = MemOp->getMask();
53017 // With vector masks we only demand the upper bit of the mask.
53018 if (Mask.getScalarValueSizeInBits() != 1) {
53019 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53020 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
53021 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
53022 if (N->getOpcode() != ISD::DELETED_NODE)
53023 DCI.AddToWorklist(N);
53024 return SDValue(N, 0);
53028 return SDValue();
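// Rebuild a masked gather/scatter node with updated index, base and scale
// operands.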
53031 static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
53032 SDValue Index, SDValue Base, SDValue Scale,
53033 SelectionDAG &DAG) {
53034 SDLoc DL(GorS);
53036 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
53037 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
53038 Gather->getMask(), Base, Index, Scale } ;
53039 return DAG.getMaskedGather(Gather->getVTList(),
53040 Gather->getMemoryVT(), DL, Ops,
53041 Gather->getMemOperand(),
53042 Gather->getIndexType(),
53043 Gather->getExtensionType());
53045 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
53046 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
53047 Scatter->getMask(), Base, Index, Scale };
53048 return DAG.getMaskedScatter(Scatter->getVTList(),
53049 Scatter->getMemoryVT(), DL,
53050 Ops, Scatter->getMemOperand(),
53051 Scatter->getIndexType(),
53052 Scatter->isTruncatingStore());
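// Combine generic MGATHER/MSCATTER nodes: shrink oversized index types, fold
// splat constant adders into the base pointer, and simplify the mask.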
53055 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
53056 TargetLowering::DAGCombinerInfo &DCI) {
53057 SDLoc DL(N);
53058 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
53059 SDValue Index = GorS->getIndex();
53060 SDValue Base = GorS->getBasePtr();
53061 SDValue Scale = GorS->getScale();
53063 if (DCI.isBeforeLegalize()) {
53064 unsigned IndexWidth = Index.getScalarValueSizeInBits();
53066 // Shrink constant indices if they are larger than 32-bits.
53067 // Only do this before legalize types since v2i64 could become v2i32.
53068 // FIXME: We could check that the type is legal if we're after legalize
53069 // types, but then we would need to construct test cases where that happens.
53070 // FIXME: We could support more than just constant vectors, but we need to be
53071 // careful with costing. A truncate that can be optimized out would be fine.
53072 // Otherwise we might only want to create a truncate if it avoids a split.
53073 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
53074 if (BV->isConstant() && IndexWidth > 32 &&
53075 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
53076 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
53077 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
53078 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53082 // Shrink any sign/zero extend from a type of 32 bits or smaller to a type
53083 // larger than 32 bits if there are sufficient sign bits. Only do this before
53084 // legalize types to avoid creating illegal types in the truncate.
53085 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
53086 Index.getOpcode() == ISD::ZERO_EXTEND) &&
53087 IndexWidth > 32 &&
53088 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
53089 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
53090 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
53091 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
53092 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53096 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53097 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53098 // Try to move splat constant adders from the index operand to the base
53099 // pointer operand, taking care to multiply by the scale. We can only do
53100 // this when the index element type is the same as the pointer type.
53101 // Otherwise we need to be sure the math doesn't wrap before the scale.
53102 if (Index.getOpcode() == ISD::ADD &&
53103 Index.getValueType().getVectorElementType() == PtrVT &&
53104 isa<ConstantSDNode>(Scale)) {
53105 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
53106 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
53107 BitVector UndefElts;
53108 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
53109 // FIXME: Allow non-constant?
53110 if (UndefElts.none()) {
53111 // Apply the scale.
53112 APInt Adder = C->getAPIntValue() * ScaleAmt;
53113 // Add it to the existing base.
53114 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
53115 DAG.getConstant(Adder, DL, PtrVT));
53116 Index = Index.getOperand(0);
53117 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53121 // It's also possible that the base is just a constant. In that case, just
53122 // replace it with 0 and move the displacement into the index.
53123 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
53124 isOneConstant(Scale)) {
53125 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
53126 // Combine the constant build_vector and the constant base.
53127 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
53128 Index.getOperand(1), Splat);
53129 // Add to the LHS of the original Index add.
53130 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
53131 Index.getOperand(0), Splat);
53132 Base = DAG.getConstant(0, DL, Base.getValueType());
53133 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53138 if (DCI.isBeforeLegalizeOps()) {
53139 unsigned IndexWidth = Index.getScalarValueSizeInBits();
53141 // Make sure the index is either i32 or i64
53142 if (IndexWidth != 32 && IndexWidth != 64) {
53143 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
53144 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
53145 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
53146 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53150 // With vector masks we only demand the upper bit of the mask.
53151 SDValue Mask = GorS->getMask();
53152 if (Mask.getScalarValueSizeInBits() != 1) {
53153 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53154 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
53155 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
53156 if (N->getOpcode() != ISD::DELETED_NODE)
53157 DCI.AddToWorklist(N);
53158 return SDValue(N, 0);
53162 return SDValue();
53165 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
53166 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
53167 const X86Subtarget &Subtarget) {
53168 SDLoc DL(N);
53169 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
53170 SDValue EFLAGS = N->getOperand(1);
53172 // Try to simplify the EFLAGS and condition code operands.
53173 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
53174 return getSETCC(CC, Flags, DL, DAG);
53176 return SDValue();
53179 /// Optimize branch condition evaluation.
53180 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
53181 const X86Subtarget &Subtarget) {
53182 SDLoc DL(N);
53183 SDValue EFLAGS = N->getOperand(3);
53184 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
53186 // Try to simplify the EFLAGS and condition code operands.
53187 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
53188 // RAUW them under us.
53189 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
53190 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
53191 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
53192 N->getOperand(1), Cond, Flags);
53195 return SDValue();
53198 // TODO: Could we move this to DAGCombine?
53199 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
53200 SelectionDAG &DAG) {
53201 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
53202 // to optimize away the operation when its operand is effectively a constant.
53204 // The general transformation is:
53205 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
53206 // AND(VECTOR_CMP(x,y), constant2)
53207 // constant2 = UNARYOP(constant)
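// For example (illustrative, v4i32 -> v4f32):
//   sitofp (and (vector_cmp x, y), <4 x i32> <1, 2, 3, 4>)
//     --> bitcast (and (vector_cmp x, y),
//                      bitcast (<4 x float> <1.0, 2.0, 3.0, 4.0>))
// Each lane of the compare is all-ones or all-zeros, so masking the converted
// constant gives the same result as converting the masked constant.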
53209 // Early exit if this isn't a vector operation, the operand of the
53210 // unary operation isn't a bitwise AND, or if the sizes of the operations
53211 // aren't the same.
53212 EVT VT = N->getValueType(0);
53213 bool IsStrict = N->isStrictFPOpcode();
53214 unsigned NumEltBits = VT.getScalarSizeInBits();
53215 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
53216 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
53217 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
53218 VT.getSizeInBits() != Op0.getValueSizeInBits())
53219 return SDValue();
53221 // Now check that the other operand of the AND is a constant. We could
53222 // make the transformation for non-constant splats as well, but it's unclear
53223 // that would be a benefit as it would not eliminate any operations, just
53224 // perform one more step in scalar code before moving to the vector unit.
53225 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
53226 // Bail out if the vector isn't a constant.
53227 if (!BV->isConstant())
53228 return SDValue();
53230 // Everything checks out. Build up the new and improved node.
53231 SDLoc DL(N);
53232 EVT IntVT = BV->getValueType(0);
53233 // Create a new constant of the appropriate type for the transformed
53234 // DAG.
53235 SDValue SourceConst;
53236 if (IsStrict)
53237 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
53238 {N->getOperand(0), SDValue(BV, 0)});
53239 else
53240 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
53241 // The AND node needs bitcasts to/from an integer vector type around it.
53242 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
53243 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
53244 MaskConst);
53245 SDValue Res = DAG.getBitcast(VT, NewAnd);
53246 if (IsStrict)
53247 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
53248 return Res;
53251 return SDValue();
53254 /// If we are converting a value to floating-point, try to replace scalar
53255 /// truncate of an extracted vector element with a bitcast. This tries to keep
53256 /// the sequence on XMM registers rather than moving between vector and GPRs.
53257 static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
53258 // TODO: This is currently only used by combineSIntToFP, but it is generalized
53259 // to allow being called by any similar cast opcode.
53260 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
53261 SDValue Trunc = N->getOperand(0);
53262 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
53263 return SDValue();
53265 SDValue ExtElt = Trunc.getOperand(0);
53266 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53267 !isNullConstant(ExtElt.getOperand(1)))
53268 return SDValue();
53270 EVT TruncVT = Trunc.getValueType();
53271 EVT SrcVT = ExtElt.getValueType();
53272 unsigned DestWidth = TruncVT.getSizeInBits();
53273 unsigned SrcWidth = SrcVT.getSizeInBits();
53274 if (SrcWidth % DestWidth != 0)
53275 return SDValue();
53277 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
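// For example (illustrative), with X : v2i64 and an i64 -> i32 truncate:
//   sitofp (trunc (extractelt X, 0))
//     --> sitofp (extractelt (bitcast X to v4i32), 0)
// On little-endian x86, lane 0 of the bitcast is the low 32 bits of element 0.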
53278 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
53279 unsigned VecWidth = SrcVecVT.getSizeInBits();
53280 unsigned NumElts = VecWidth / DestWidth;
53281 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
53282 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
53283 SDLoc DL(N);
53284 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
53285 BitcastVec, ExtElt.getOperand(1));
53286 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
53289 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
53290 const X86Subtarget &Subtarget) {
53291 bool IsStrict = N->isStrictFPOpcode();
53292 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
53293 EVT VT = N->getValueType(0);
53294 EVT InVT = Op0.getValueType();
53296 // Using i16 as an intermediate type is a bad idea, unless we have HW support
53297 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
53298 // if hasFP16 support:
53299 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
53300 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
53301 // else
53302 // UINT_TO_FP(vXi1~31) -> UINT_TO_FP(ZEXT(vXi1~31 to vXi32))
53303 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
53304 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
53305 unsigned ScalarSize = InVT.getScalarSizeInBits();
53306 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
53307 ScalarSize >= 64)
53308 return SDValue();
53309 SDLoc dl(N);
53310 EVT DstVT =
53311 EVT::getVectorVT(*DAG.getContext(),
53312 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
53313 : ScalarSize < 32 ? MVT::i32
53314 : MVT::i64,
53315 InVT.getVectorNumElements());
53316 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
53317 if (IsStrict)
53318 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
53319 {N->getOperand(0), P});
53320 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
53323 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
53324 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
53325 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
53326 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
53327 VT.getScalarType() != MVT::f16) {
53328 SDLoc dl(N);
53329 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
53330 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
53332 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
53333 if (IsStrict)
53334 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53335 {N->getOperand(0), P});
53336 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
53339 // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
53340 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
53341 // the optimization here.
53342 if (DAG.SignBitIsZero(Op0)) {
53343 if (IsStrict)
53344 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
53345 {N->getOperand(0), Op0});
53346 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
53349 return SDValue();
53352 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
53353 TargetLowering::DAGCombinerInfo &DCI,
53354 const X86Subtarget &Subtarget) {
53355 // First try to optimize away the conversion entirely when it's
53356 // conditionally from a constant. Vectors only.
53357 bool IsStrict = N->isStrictFPOpcode();
53358 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
53359 return Res;
53361 // Now move on to more general possibilities.
53362 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
53363 EVT VT = N->getValueType(0);
53364 EVT InVT = Op0.getValueType();
53366 // Using i16 as an intermediate type is a bad idea, unless we have HW support
53367 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
53368 // if hasFP16 support:
53369 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
53370 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
53371 // else
53372 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(SEXT(vXi1~31 to vXi32))
53373 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
53374 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
53375 unsigned ScalarSize = InVT.getScalarSizeInBits();
53376 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
53377 ScalarSize >= 64)
53378 return SDValue();
53379 SDLoc dl(N);
53380 EVT DstVT =
53381 EVT::getVectorVT(*DAG.getContext(),
53382 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
53383 : ScalarSize < 32 ? MVT::i32
53384 : MVT::i64,
53385 InVT.getVectorNumElements());
53386 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
53387 if (IsStrict)
53388 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53389 {N->getOperand(0), P});
53390 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
53393 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
53394 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
53395 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
53396 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
53397 VT.getScalarType() != MVT::f16) {
53398 SDLoc dl(N);
53399 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
53400 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
53401 if (IsStrict)
53402 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53403 {N->getOperand(0), P});
53404 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
53407 // Without AVX512DQ we only support i64 to float scalar conversion. For both
53408 // vectors and scalars, see if we know that the upper bits are all the sign
53409 // bit, in which case we can truncate the input to i32 and convert from that.
53410 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
53411 unsigned BitWidth = InVT.getScalarSizeInBits();
53412 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
53413 if (NumSignBits >= (BitWidth - 31)) {
53414 EVT TruncVT = MVT::i32;
53415 if (InVT.isVector())
53416 TruncVT = InVT.changeVectorElementType(TruncVT);
53417 SDLoc dl(N);
53418 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
53419 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
53420 if (IsStrict)
53421 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53422 {N->getOperand(0), Trunc});
53423 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
53425 // If we're after legalize and the type is v2i32 we need to shuffle and
53426 // use CVTSI2P.
53427 assert(InVT == MVT::v2i64 && "Unexpected VT!");
53428 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
53429 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
53430 { 0, 2, -1, -1 });
53431 if (IsStrict)
53432 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
53433 {N->getOperand(0), Shuf});
53434 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
53438 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
53439 // a 32-bit target where SSE doesn't support i64->FP operations.
53440 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
53441 Op0.getOpcode() == ISD::LOAD) {
53442 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
53444 // This transformation is not supported if the result type is f16 or f128.
53445 if (VT == MVT::f16 || VT == MVT::f128)
53446 return SDValue();
53448 // If we have AVX512DQ we can use packed conversion instructions unless
53449 // the VT is f80.
53450 if (Subtarget.hasDQI() && VT != MVT::f80)
53451 return SDValue();
53453 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
53454 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
53455 std::pair<SDValue, SDValue> Tmp =
53456 Subtarget.getTargetLowering()->BuildFILD(
53457 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
53458 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
53459 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
53460 return Tmp.first;
53464 if (IsStrict)
53465 return SDValue();
53467 if (SDValue V = combineToFPTruncExtElt(N, DAG))
53468 return V;
53470 return SDValue();
53473 static bool needCarryOrOverflowFlag(SDValue Flags) {
53474 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
53476 for (const SDNode *User : Flags->uses()) {
53477 X86::CondCode CC;
53478 switch (User->getOpcode()) {
53479 default:
53480 // Be conservative.
53481 return true;
53482 case X86ISD::SETCC:
53483 case X86ISD::SETCC_CARRY:
53484 CC = (X86::CondCode)User->getConstantOperandVal(0);
53485 break;
53486 case X86ISD::BRCOND:
53487 case X86ISD::CMOV:
53488 CC = (X86::CondCode)User->getConstantOperandVal(2);
53489 break;
53492 switch (CC) {
53493 default: break;
53494 case X86::COND_A: case X86::COND_AE:
53495 case X86::COND_B: case X86::COND_BE:
53496 case X86::COND_O: case X86::COND_NO:
53497 case X86::COND_G: case X86::COND_GE:
53498 case X86::COND_L: case X86::COND_LE:
53499 return true;
53503 return false;
53506 static bool onlyZeroFlagUsed(SDValue Flags) {
53507 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
53509 for (const SDNode *User : Flags->uses()) {
53510 unsigned CCOpNo;
53511 switch (User->getOpcode()) {
53512 default:
53513 // Be conservative.
53514 return false;
53515 case X86ISD::SETCC:
53516 case X86ISD::SETCC_CARRY:
53517 CCOpNo = 0;
53518 break;
53519 case X86ISD::BRCOND:
53520 case X86ISD::CMOV:
53521 CCOpNo = 2;
53522 break;
53525 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
53526 if (CC != X86::COND_E && CC != X86::COND_NE)
53527 return false;
53530 return true;
53533 static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
53534 const X86Subtarget &Subtarget) {
53535 // Only handle test patterns.
53536 if (!isNullConstant(N->getOperand(1)))
53537 return SDValue();
53539 // If we have a CMP of a truncated binop, see if we can make a smaller binop
53540 // and use its flags directly.
53541 // TODO: Maybe we should try promoting compares that only use the zero flag
53542 // first if we can prove the upper bits with computeKnownBits?
53543 SDLoc dl(N);
53544 SDValue Op = N->getOperand(0);
53545 EVT VT = Op.getValueType();
53546 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53548 // If we have a constant logical shift that's only used in a comparison
53549 // against zero turn it into an equivalent AND. This allows turning it into
53550 // a TEST instruction later.
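// For example (illustrative, i32):
//   cmp (srl X, 8), 0  -->  cmp (and X, 0xFFFFFF00), 0
//   cmp (shl X, 8), 0  -->  cmp (and X, 0x00FFFFFF), 0
// which isel can then select as a TEST with an immediate.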
53551 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
53552 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
53553 onlyZeroFlagUsed(SDValue(N, 0))) {
53554 unsigned BitWidth = VT.getSizeInBits();
53555 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
53556 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
53557 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
53558 APInt Mask = Op.getOpcode() == ISD::SRL
53559 ? APInt::getHighBitsSet(BitWidth, MaskBits)
53560 : APInt::getLowBitsSet(BitWidth, MaskBits);
53561 if (Mask.isSignedIntN(32)) {
53562 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
53563 DAG.getConstant(Mask, dl, VT));
53564 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
53565 DAG.getConstant(0, dl, VT));
53570 // If we're extracting from an AVX512 bool vector and comparing against zero,
53571 // then try to just bitcast the vector to an integer to use TEST/BT directly.
53572 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
53573 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
53574 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
53575 SDValue Src = Op.getOperand(0);
53576 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
53577 isNullConstant(Src.getOperand(1)) &&
53578 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
53579 SDValue BoolVec = Src.getOperand(0);
53580 unsigned ShAmt = 0;
53581 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
53582 ShAmt = BoolVec.getConstantOperandVal(1);
53583 BoolVec = BoolVec.getOperand(0);
53585 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
53586 EVT VecVT = BoolVec.getValueType();
53587 unsigned BitWidth = VecVT.getVectorNumElements();
53588 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
53589 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
53590 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
53591 Op = DAG.getBitcast(BCVT, BoolVec);
53592 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
53593 DAG.getConstant(Mask, dl, BCVT));
53594 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
53595 DAG.getConstant(0, dl, BCVT));
53600 // Peek through any zero-extend if we're only testing for a zero result.
53601 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
53602 SDValue Src = Op.getOperand(0);
53603 EVT SrcVT = Src.getValueType();
53604 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
53605 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
53606 DAG.getConstant(0, dl, SrcVT));
53609 // Look for a truncate.
53610 if (Op.getOpcode() != ISD::TRUNCATE)
53611 return SDValue();
53613 SDValue Trunc = Op;
53614 Op = Op.getOperand(0);
53616 // See if we can compare with zero against the truncation source,
53617 // which should help using the Z flag from many ops. Only do this for an
53618 // i32 truncated op to prevent partial-reg compares of promoted ops.
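// For example (illustrative): if the upper 24 bits of an i32 value Y are known
// to be zero, then
//   cmp (trunc Y to i8), 0  -->  cmp Y, 0
// which avoids a compare on the 8-bit subregister.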
53619 EVT OpVT = Op.getValueType();
53620 APInt UpperBits =
53621 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
53622 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
53623 onlyZeroFlagUsed(SDValue(N, 0))) {
53624 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
53625 DAG.getConstant(0, dl, OpVT));
53628 // After this the truncate and arithmetic op must have a single use.
53629 if (!Trunc.hasOneUse() || !Op.hasOneUse())
53630 return SDValue();
53632 unsigned NewOpc;
53633 switch (Op.getOpcode()) {
53634 default: return SDValue();
53635 case ISD::AND:
53636 // Skip AND with a constant. We have special handling for AND with an
53637 // immediate during isel to generate TEST instructions.
53638 if (isa<ConstantSDNode>(Op.getOperand(1)))
53639 return SDValue();
53640 NewOpc = X86ISD::AND;
53641 break;
53642 case ISD::OR: NewOpc = X86ISD::OR; break;
53643 case ISD::XOR: NewOpc = X86ISD::XOR; break;
53644 case ISD::ADD:
53645 // If the carry or overflow flag is used, we can't truncate.
53646 if (needCarryOrOverflowFlag(SDValue(N, 0)))
53647 return SDValue();
53648 NewOpc = X86ISD::ADD;
53649 break;
53650 case ISD::SUB:
53651 // If the carry or overflow flag is used, we can't truncate.
53652 if (needCarryOrOverflowFlag(SDValue(N, 0)))
53653 return SDValue();
53654 NewOpc = X86ISD::SUB;
53655 break;
53658 // We found an op we can narrow. Truncate its inputs.
53659 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
53660 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
53662 // Use a X86 specific opcode to avoid DAG combine messing with it.
53663 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
53664 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
53666 // For AND, keep a CMP so that we can match the test pattern.
53667 if (NewOpc == X86ISD::AND)
53668 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
53669 DAG.getConstant(0, dl, VT));
53671 // Return the flags.
53672 return Op.getValue(1);
53675 static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
53676 TargetLowering::DAGCombinerInfo &DCI) {
53677 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
53678 "Expected X86ISD::ADD or X86ISD::SUB");
53680 SDLoc DL(N);
53681 SDValue LHS = N->getOperand(0);
53682 SDValue RHS = N->getOperand(1);
53683 MVT VT = LHS.getSimpleValueType();
53684 bool IsSub = X86ISD::SUB == N->getOpcode();
53685 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
53687 // If we don't use the flag result, simplify back to a generic ADD/SUB.
53688 if (!N->hasAnyUseOfValue(1)) {
53689 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
53690 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
53693 // Fold any similar generic ADD/SUB opcodes to reuse this node.
53694 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
53695 SDValue Ops[] = {N0, N1};
53696 SDVTList VTs = DAG.getVTList(N->getValueType(0));
53697 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
53698 SDValue Op(N, 0);
53699 if (Negate)
53700 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
53701 DCI.CombineTo(GenericAddSub, Op);
53704 MatchGeneric(LHS, RHS, false);
53705 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
53707 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
53708 // EFLAGS result doesn't change.
53709 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
53710 /*ZeroSecondOpOnly*/ true);
53713 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
53714 SDValue LHS = N->getOperand(0);
53715 SDValue RHS = N->getOperand(1);
53716 SDValue BorrowIn = N->getOperand(2);
53718 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
53719 MVT VT = N->getSimpleValueType(0);
53720 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
53721 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
53724 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
53725 // iff the flag result is dead.
53726 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
53727 !N->hasAnyUseOfValue(1))
53728 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
53729 LHS.getOperand(1), BorrowIn);
53731 return SDValue();
53734 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
53735 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
53736 TargetLowering::DAGCombinerInfo &DCI) {
53737 SDValue LHS = N->getOperand(0);
53738 SDValue RHS = N->getOperand(1);
53739 SDValue CarryIn = N->getOperand(2);
53740 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
53741 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
53743 // Canonicalize constant to RHS.
53744 if (LHSC && !RHSC)
53745 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
53746 CarryIn);
53748 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
53749 // the result is either zero or one (depending on the input carry bit).
53750 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
53751 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
53752 // We don't have a good way to replace an EFLAGS use, so only do this when
53753 // dead right now.
53754 SDValue(N, 1).use_empty()) {
53755 SDLoc DL(N);
53756 EVT VT = N->getValueType(0);
53757 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
53758 SDValue Res1 = DAG.getNode(
53759 ISD::AND, DL, VT,
53760 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
53761 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
53762 DAG.getConstant(1, DL, VT));
53763 return DCI.CombineTo(N, Res1, CarryOut);
53766 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
53767 // iff the flag result is dead.
53768 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
53769 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
53770 SDLoc DL(N);
53771 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
53772 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
53773 DAG.getConstant(0, DL, LHS.getValueType()),
53774 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
53777 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
53778 MVT VT = N->getSimpleValueType(0);
53779 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
53780 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
53783 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
53784 // iff the flag result is dead.
53785 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
53786 !N->hasAnyUseOfValue(1))
53787 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
53788 LHS.getOperand(1), CarryIn);
53790 return SDValue();
53793 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
53794 const SDLoc &DL, EVT VT,
53795 const X86Subtarget &Subtarget) {
53796 // Example of pattern we try to detect:
53797 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
53798 //(add (build_vector (extract_elt t, 0),
53799 // (extract_elt t, 2),
53800 // (extract_elt t, 4),
53801 // (extract_elt t, 6)),
53802 // (build_vector (extract_elt t, 1),
53803 // (extract_elt t, 3),
53804 // (extract_elt t, 5),
53805 // (extract_elt t, 7)))
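// When the multiply inputs fit in 16 bits (as above, where they are sign
// extensions of v8i16 values), the whole pattern can be replaced by a single
// PMADDWD, roughly (sketch):
//   (v4i32 X86ISD::VPMADDWD x0, x1)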
53807 if (!Subtarget.hasSSE2())
53808 return SDValue();
53810 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
53811 Op1.getOpcode() != ISD::BUILD_VECTOR)
53812 return SDValue();
53814 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
53815 VT.getVectorNumElements() < 4 ||
53816 !isPowerOf2_32(VT.getVectorNumElements()))
53817 return SDValue();
53819 // Check if one of Op0,Op1 is of the form:
53820 // (build_vector (extract_elt Mul, 0),
53821 // (extract_elt Mul, 2),
53822 // (extract_elt Mul, 4),
53823 // ...
53824 // the other is of the form:
53825 // (build_vector (extract_elt Mul, 1),
53826 // (extract_elt Mul, 3),
53827 // (extract_elt Mul, 5),
53828 // ...
53829 // and identify Mul.
53830 SDValue Mul;
53831 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
53832 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
53833 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
53834 // TODO: Be more tolerant to undefs.
53835 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53836 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53837 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53838 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
53839 return SDValue();
53840 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
53841 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
53842 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
53843 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
53844 if (!Const0L || !Const1L || !Const0H || !Const1H)
53845 return SDValue();
53846 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
53847 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
53848 // Commutativity of mul allows factors of a product to reorder.
53849 if (Idx0L > Idx1L)
53850 std::swap(Idx0L, Idx1L);
53851 if (Idx0H > Idx1H)
53852 std::swap(Idx0H, Idx1H);
53853 // Commutativity of add allows pairs of factors to reorder.
53854 if (Idx0L > Idx0H) {
53855 std::swap(Idx0L, Idx0H);
53856 std::swap(Idx1L, Idx1H);
53858 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
53859 Idx1H != 2 * i + 3)
53860 return SDValue();
53861 if (!Mul) {
53862 // First time an extract_elt's source vector is visited. Must be a MUL
53863 // with twice as many vector elements as the BUILD_VECTOR.
53864 // Both extracts must be from the same MUL.
53865 Mul = Op0L->getOperand(0);
53866 if (Mul->getOpcode() != ISD::MUL ||
53867 Mul.getValueType().getVectorNumElements() != 2 * e)
53868 return SDValue();
53870 // Check that the extract is from the same MUL previously seen.
53871 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
53872 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
53873 return SDValue();
53876 // Check if the Mul source can be safely shrunk.
53877 ShrinkMode Mode;
53878 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
53879 Mode == ShrinkMode::MULU16)
53880 return SDValue();
53882 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
53883 VT.getVectorNumElements() * 2);
53884 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
53885 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
53887 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
53888 ArrayRef<SDValue> Ops) {
53889 EVT InVT = Ops[0].getValueType();
53890 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
53891 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
53892 InVT.getVectorNumElements() / 2);
53893 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
53895 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
53898 // Attempt to turn this pattern into PMADDWD.
53899 // (add (mul (sext (build_vector)), (sext (build_vector))),
53900 // (mul (sext (build_vector)), (sext (build_vector)))
53901 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
53902 const SDLoc &DL, EVT VT,
53903 const X86Subtarget &Subtarget) {
53904 if (!Subtarget.hasSSE2())
53905 return SDValue();
53907 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
53908 return SDValue();
53910 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
53911 VT.getVectorNumElements() < 4 ||
53912 !isPowerOf2_32(VT.getVectorNumElements()))
53913 return SDValue();
53915 SDValue N00 = N0.getOperand(0);
53916 SDValue N01 = N0.getOperand(1);
53917 SDValue N10 = N1.getOperand(0);
53918 SDValue N11 = N1.getOperand(1);
53920 // All inputs need to be sign extends.
53921 // TODO: Support ZERO_EXTEND from known positive?
53922 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
53923 N01.getOpcode() != ISD::SIGN_EXTEND ||
53924 N10.getOpcode() != ISD::SIGN_EXTEND ||
53925 N11.getOpcode() != ISD::SIGN_EXTEND)
53926 return SDValue();
53928 // Peek through the extends.
53929 N00 = N00.getOperand(0);
53930 N01 = N01.getOperand(0);
53931 N10 = N10.getOperand(0);
53932 N11 = N11.getOperand(0);
53934 // Must be extending from vXi16.
53935 EVT InVT = N00.getValueType();
53936 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
53937 N10.getValueType() != InVT || N11.getValueType() != InVT)
53938 return SDValue();
53940 // All inputs should be build_vectors.
53941 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
53942 N01.getOpcode() != ISD::BUILD_VECTOR ||
53943 N10.getOpcode() != ISD::BUILD_VECTOR ||
53944 N11.getOpcode() != ISD::BUILD_VECTOR)
53945 return SDValue();
53947 // For each element, we need to ensure we have an odd element from one vector
53948 // multiplied by the odd element of another vector and the even element from
53949 // one of the same vectors being multiplied by the even element from the
53950 // other vector. So we need to make sure for each element i, this operator
53951 // is being performed:
53952 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
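// For example (illustrative), a v4i32 result matches
//   <A[0]*B[0] + A[1]*B[1], A[2]*B[2] + A[3]*B[3], ...>
// which is exactly what PMADDWD computes from v8i16 inputs A and B.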
53953 SDValue In0, In1;
53954 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
53955 SDValue N00Elt = N00.getOperand(i);
53956 SDValue N01Elt = N01.getOperand(i);
53957 SDValue N10Elt = N10.getOperand(i);
53958 SDValue N11Elt = N11.getOperand(i);
53959 // TODO: Be more tolerant to undefs.
53960 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53961 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53962 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53963 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
53964 return SDValue();
53965 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
53966 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
53967 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
53968 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
53969 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
53970 return SDValue();
53971 unsigned IdxN00 = ConstN00Elt->getZExtValue();
53972 unsigned IdxN01 = ConstN01Elt->getZExtValue();
53973 unsigned IdxN10 = ConstN10Elt->getZExtValue();
53974 unsigned IdxN11 = ConstN11Elt->getZExtValue();
53975 // Add is commutative so indices can be reordered.
53976 if (IdxN00 > IdxN10) {
53977 std::swap(IdxN00, IdxN10);
53978 std::swap(IdxN01, IdxN11);
53980 // N0 indices must be the even element. N1 indices must be the next odd element.
53981 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
53982 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
53983 return SDValue();
53984 SDValue N00In = N00Elt.getOperand(0);
53985 SDValue N01In = N01Elt.getOperand(0);
53986 SDValue N10In = N10Elt.getOperand(0);
53987 SDValue N11In = N11Elt.getOperand(0);
53989 // First time we find an input capture it.
53990 if (!In0) {
53991 In0 = N00In;
53992 In1 = N01In;
53994 // The input vectors must be at least as wide as the output.
53996 // If they are larger than the output, we extract a subvector below.
53996 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
53997 In1.getValueSizeInBits() < VT.getSizeInBits())
53998 return SDValue();
54000 // Mul is commutative so the input vectors can be in any order.
54001 // Canonicalize to make the compares easier.
54002 if (In0 != N00In)
54003 std::swap(N00In, N01In);
54004 if (In0 != N10In)
54005 std::swap(N10In, N11In);
54006 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
54007 return SDValue();
54010 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54011 ArrayRef<SDValue> Ops) {
54012 EVT OpVT = Ops[0].getValueType();
54013 assert(OpVT.getScalarType() == MVT::i16 &&
54014 "Unexpected scalar element type");
54015 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
54016 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
54017 OpVT.getVectorNumElements() / 2);
54018 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
54021 // If the output is narrower than an input, extract the low part of the input
54022 // vector.
54023 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54024 VT.getVectorNumElements() * 2);
54025 if (OutVT16.bitsLT(In0.getValueType())) {
54026 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
54027 DAG.getIntPtrConstant(0, DL));
54029 if (OutVT16.bitsLT(In1.getValueType())) {
54030 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
54031 DAG.getIntPtrConstant(0, DL));
54033 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
54034 PMADDBuilder);
54037 // ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
54038 // If the upper element in each pair of both VPMADDWD operands is zero then we can merge
54039 // the operand elements and use the implicit add of VPMADDWD.
54040 // TODO: Add support for VPMADDUBSW (which isn't commutable).
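// For example (illustrative), with 128-bit operands whose odd i16 elements are
// known zero:
//   add (vpmaddwd X, Y), (vpmaddwd Z, W)
//     --> vpmaddwd (shuffle X, Z, <0,8,2,10,4,12,6,14>),
//                  (shuffle Y, W, <0,8,2,10,4,12,6,14>)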
54041 static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
54042 const SDLoc &DL, EVT VT) {
54043 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
54044 return SDValue();
54046 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
54047 if (VT.getSizeInBits() > 128)
54048 return SDValue();
54050 unsigned NumElts = VT.getVectorNumElements();
54051 MVT OpVT = N0.getOperand(0).getSimpleValueType();
54052 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
54053 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
54055 bool Op0HiZero =
54056 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
54057 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
54058 bool Op1HiZero =
54059 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
54060 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
54062 // TODO: Check for zero lower elements once we have actual codegen that
54063 // creates them.
54064 if (!Op0HiZero || !Op1HiZero)
54065 return SDValue();
54067 // Create a shuffle mask packing the lower elements from each VPMADDWD.
54068 SmallVector<int> Mask;
54069 for (int i = 0; i != (int)NumElts; ++i) {
54070 Mask.push_back(2 * i);
54071 Mask.push_back(2 * (i + NumElts));
54074 SDValue LHS =
54075 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
54076 SDValue RHS =
54077 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
54078 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
54081 /// CMOV of constants requires materializing constant operands in registers.
54082 /// Try to fold those constants into an 'add' instruction to reduce instruction
54083 /// count. We do this with CMOV rather than the generic 'select' because there are
54084 /// earlier folds that may be used to turn select-of-constants into logic hacks.
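/// For example (illustrative, constants chosen arbitrarily):
///   add (cmov 0, 42), X  -->  cmov X, (add X, 42)
/// which removes the need to materialize the constants in registers first.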
54085 static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
54086 const X86Subtarget &Subtarget) {
54087 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
54088 // better because we eliminate 1-2 instructions. This transform is still
54089 // an improvement without zero operands because we trade 2 move constants and
54090 // 1 add for 2 adds (LEA) as long as the constants can be represented as
54091 // immediate asm operands (fit in 32-bits).
54092 auto isSuitableCmov = [](SDValue V) {
54093 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
54094 return false;
54095 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
54096 !isa<ConstantSDNode>(V.getOperand(1)))
54097 return false;
54098 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
54099 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
54100 V.getConstantOperandAPInt(1).isSignedIntN(32));
54103 // Match an appropriate CMOV as the first operand of the add.
54104 SDValue Cmov = N->getOperand(0);
54105 SDValue OtherOp = N->getOperand(1);
54106 if (!isSuitableCmov(Cmov))
54107 std::swap(Cmov, OtherOp);
54108 if (!isSuitableCmov(Cmov))
54109 return SDValue();
54111 // Don't remove a load folding opportunity for the add. That would neutralize
54112 // any improvements from removing constant materializations.
54113 if (X86::mayFoldLoad(OtherOp, Subtarget))
54114 return SDValue();
54116 EVT VT = N->getValueType(0);
54117 SDLoc DL(N);
54118 SDValue FalseOp = Cmov.getOperand(0);
54119 SDValue TrueOp = Cmov.getOperand(1);
54121 // We will push the add through the select, but we can potentially do better
54122 // if we know there is another add in the sequence and this is pointer math.
54123 // In that case, we can absorb an add into the trailing memory op and avoid
54124 // a 3-operand LEA which is likely slower than a 2-operand LEA.
54125 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
54126 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
54127 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
54128 all_of(N->uses(), [&](SDNode *Use) {
54129 auto *MemNode = dyn_cast<MemSDNode>(Use);
54130 return MemNode && MemNode->getBasePtr().getNode() == N;
54131 })) {
54132 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
54133 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
54134 // it is possible that choosing op1 might be better.
54135 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
54136 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
54137 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
54138 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
54139 Cmov.getOperand(2), Cmov.getOperand(3));
54140 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
54143 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
54144 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
54145 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
54146 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
54147 Cmov.getOperand(3));
54150 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
54151 TargetLowering::DAGCombinerInfo &DCI,
54152 const X86Subtarget &Subtarget) {
54153 EVT VT = N->getValueType(0);
54154 SDValue Op0 = N->getOperand(0);
54155 SDValue Op1 = N->getOperand(1);
54156 SDLoc DL(N);
54158 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
54159 return Select;
54161 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
54162 return MAdd;
54163 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
54164 return MAdd;
54165 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
54166 return MAdd;
54168 // Try to synthesize horizontal adds from adds of shuffles.
54169 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
54170 return V;
54172 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
54173 // (sub Y, (sext (vXi1 X))).
54174 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
54175 // generic DAG combine without a legal type check, but adding this there
54176 // caused regressions.
54177 if (VT.isVector()) {
54178 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54179 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
54180 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
54181 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
54182 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
54183 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
54186 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
54187 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
54188 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
54189 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
54190 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
54194 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
54195 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
54196 X86::isZeroNode(Op0.getOperand(1))) {
54197 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
54198 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
54199 Op0.getOperand(0), Op0.getOperand(2));
54202 return combineAddOrSubToADCOrSBB(N, DAG);
54205 // Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
54206 // condition comes from the subtract node that produced -X. This matches the
54207 // cmov expansion for absolute value. By swapping the operands we convert abs
54208 // to nabs.
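// For example (illustrative):
//   sub Y, abs(X)  ==  add Y, nabs(X)
// so swapping the CMOV operands lets the sub become an add.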
54209 static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
54210 SDValue N0 = N->getOperand(0);
54211 SDValue N1 = N->getOperand(1);
54213 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
54214 return SDValue();
54216 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
54217 if (CC != X86::COND_S && CC != X86::COND_NS)
54218 return SDValue();
54220 // Condition should come from a negate operation.
54221 SDValue Cond = N1.getOperand(3);
54222 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
54223 return SDValue();
54224 assert(Cond.getResNo() == 1 && "Unexpected result number");
54226 // Get the X and -X from the negate.
54227 SDValue NegX = Cond.getValue(0);
54228 SDValue X = Cond.getOperand(1);
54230 SDValue FalseOp = N1.getOperand(0);
54231 SDValue TrueOp = N1.getOperand(1);
54233 // Cmov operands should be X and NegX. Order doesn't matter.
54234 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
54235 return SDValue();
54237 // Build a new CMOV with the operands swapped.
54238 SDLoc DL(N);
54239 MVT VT = N->getSimpleValueType(0);
54240 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
54241 N1.getOperand(2), Cond);
54242 // Convert sub to add.
54243 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
54246 static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
54247 SDValue Op0 = N->getOperand(0);
54248 SDValue Op1 = N->getOperand(1);
54250 // (sub C (zero_extend (setcc)))
54251 // =>
54252 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
54253 // Don't disturb (sub 0 setcc), which is easily done with neg.
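// For example (illustrative):
//   sub 5, (zext (setcc cond))  -->  add (zext (setcc !cond)), 4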
54254 EVT VT = N->getValueType(0);
54255 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
54256 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
54257 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
54258 Op1.getOperand(0).hasOneUse()) {
54259 SDValue SetCC = Op1.getOperand(0);
54260 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
54261 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
54262 APInt NewImm = Op0C->getAPIntValue() - 1;
54263 SDLoc DL(Op1);
54264 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
54265 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
54266 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
54267 DAG.getConstant(NewImm, DL, VT));
54270 return SDValue();
54273 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
54274 TargetLowering::DAGCombinerInfo &DCI,
54275 const X86Subtarget &Subtarget) {
54276 SDValue Op0 = N->getOperand(0);
54277 SDValue Op1 = N->getOperand(1);
54279 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
54280 auto IsNonOpaqueConstant = [&](SDValue Op) {
54281 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
54282 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
54283 return !Cst->isOpaque();
54284 return true;
54286 return false;
54289 // X86 can't encode an immediate LHS of a sub. See if we can push the
54290 // negation into a preceding instruction. If the RHS of the sub is an XOR with
54291 // one use and a constant, invert the immediate, saving one register.
54292 // However, ignore cases where C1 is 0, as those will become a NEG.
54293 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
54294 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
54295 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
54296 Op1->hasOneUse()) {
54297 SDLoc DL(N);
54298 EVT VT = Op0.getValueType();
54299 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
54300 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
54301 SDValue NewAdd =
54302 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
54303 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
54306 if (SDValue V = combineSubABS(N, DAG))
54307 return V;
54309 // Try to synthesize horizontal subs from subs of shuffles.
54310 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
54311 return V;
54313 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
54314 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
54315 X86::isZeroNode(Op1.getOperand(1))) {
54316 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
54317 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
54318 Op1.getOperand(0), Op1.getOperand(2));
54321 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
54322 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
54323 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
54324 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
54325 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
54326 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
54327 Op1.getOperand(1), Op1.getOperand(2));
54328 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
54329 Op1.getOperand(0));
54332 if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
54333 return V;
54335 if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
54336 return V;
54338 return combineSubSetcc(N, DAG);
54341 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
54342 const X86Subtarget &Subtarget) {
54343 MVT VT = N->getSimpleValueType(0);
54344 SDLoc DL(N);
54346 if (N->getOperand(0) == N->getOperand(1)) {
54347 if (N->getOpcode() == X86ISD::PCMPEQ)
54348 return DAG.getConstant(-1, DL, VT);
54349 if (N->getOpcode() == X86ISD::PCMPGT)
54350 return DAG.getConstant(0, DL, VT);
54353 return SDValue();
54356 /// Helper that combines an array of subvector ops as if they were the operands
54357 /// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
54358 /// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
54359 static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
54360 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
54361 TargetLowering::DAGCombinerInfo &DCI,
54362 const X86Subtarget &Subtarget) {
54363 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
54364 unsigned EltSizeInBits = VT.getScalarSizeInBits();
54366 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
54367 return DAG.getUNDEF(VT);
54369 if (llvm::all_of(Ops, [](SDValue Op) {
54370 return ISD::isBuildVectorAllZeros(Op.getNode());
54372 return getZeroVector(VT, Subtarget, DAG, DL);
54374 SDValue Op0 = Ops[0];
54375 bool IsSplat = llvm::all_equal(Ops);
54376 unsigned NumOps = Ops.size();
54378 // Repeated subvectors.
54379 if (IsSplat &&
54380 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
54381 // If this broadcast is inserted into both halves, use a larger broadcast.
54382 if (Op0.getOpcode() == X86ISD::VBROADCAST)
54383 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
54385 // If this simple subvector or scalar/subvector broadcast_load is inserted
54386 // into both halves, use a larger broadcast_load. Update other uses to use
54387 // an extracted subvector.
54388 if (ISD::isNormalLoad(Op0.getNode()) ||
54389 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
54390 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
54391 auto *Mem = cast<MemSDNode>(Op0);
54392 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
54393 ? X86ISD::VBROADCAST_LOAD
54394 : X86ISD::SUBV_BROADCAST_LOAD;
54395 if (SDValue BcastLd =
54396 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
54397 SDValue BcastSrc =
54398 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
54399 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
54400 return BcastLd;
54404 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
54405 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
54406 (Subtarget.hasAVX2() ||
54407 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
54408 VT.getScalarType(), Subtarget)))
54409 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
54410 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
54411 Op0.getOperand(0),
54412 DAG.getIntPtrConstant(0, DL)));
54414 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
54415 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
54416 (Subtarget.hasAVX2() ||
54417 (EltSizeInBits >= 32 &&
54418 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
54419 Op0.getOperand(0).getValueType() == VT.getScalarType())
54420 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
54422 // concat_vectors(extract_subvector(broadcast(x)),
54423 // extract_subvector(broadcast(x))) -> broadcast(x)
54424 // concat_vectors(extract_subvector(subv_broadcast(x)),
54425 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
54426 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
54427 Op0.getOperand(0).getValueType() == VT) {
54428 SDValue SrcVec = Op0.getOperand(0);
54429 if (SrcVec.getOpcode() == X86ISD::VBROADCAST ||
54430 SrcVec.getOpcode() == X86ISD::VBROADCAST_LOAD)
54431 return Op0.getOperand(0);
54432 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
54433 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
54434 return Op0.getOperand(0);
54438 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
54439 // Only handle concatenation of subvector high halves, which vperm2x128 is best at.
54440 // TODO: This should go in combineX86ShufflesRecursively eventually.
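// For example (illustrative), with A and B of type v8i32:
//   concat(extract_subvector(A, 4), extract_subvector(B, 4))
//     --> vperm2x128 A, B, 0x31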
54441 if (VT.is256BitVector() && NumOps == 2) {
54442 SDValue Src0 = peekThroughBitcasts(Ops[0]);
54443 SDValue Src1 = peekThroughBitcasts(Ops[1]);
54444 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
54445 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
54446 EVT SrcVT0 = Src0.getOperand(0).getValueType();
54447 EVT SrcVT1 = Src1.getOperand(0).getValueType();
54448 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
54449 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
54450 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
54451 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
54452 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
54453 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
54454 DAG.getBitcast(VT, Src0.getOperand(0)),
54455 DAG.getBitcast(VT, Src1.getOperand(0)),
54456 DAG.getTargetConstant(0x31, DL, MVT::i8));
54461 // Repeated opcode.
54462 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
54463 // but it currently struggles with different vector widths.
54464 if (llvm::all_of(Ops, [Op0](SDValue Op) {
54465 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
54466 })) {
54467 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
54468 SmallVector<SDValue> Subs;
54469 for (SDValue SubOp : SubOps)
54470 Subs.push_back(SubOp.getOperand(I));
54471 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
54473 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
54474 bool AllConstants = true;
54475 bool AllSubVectors = true;
54476 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
54477 SDValue Sub = SubOps[I].getOperand(Op);
54478 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
54479 SDValue BC = peekThroughBitcasts(Sub);
54480 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
54481 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
54482 AllSubVectors &= Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
54483 Sub.getOperand(0).getValueType() == VT &&
54484 Sub.getConstantOperandAPInt(1) == (I * NumSubElts);
54486 return AllConstants || AllSubVectors;
54489 switch (Op0.getOpcode()) {
54490 case X86ISD::VBROADCAST: {
54491 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
54492 return Op.getOperand(0).getValueType().is128BitVector();
54493 })) {
54494 if (VT == MVT::v4f64 || VT == MVT::v4i64)
54495 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
54496 ConcatSubOperand(VT, Ops, 0),
54497 ConcatSubOperand(VT, Ops, 0));
54498 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
54499 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
54500 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
54501 : X86ISD::PSHUFD,
54502 DL, VT, ConcatSubOperand(VT, Ops, 0),
54503 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
54505 break;
54507 case X86ISD::MOVDDUP:
54508 case X86ISD::MOVSHDUP:
54509 case X86ISD::MOVSLDUP: {
54510 if (!IsSplat)
54511 return DAG.getNode(Op0.getOpcode(), DL, VT,
54512 ConcatSubOperand(VT, Ops, 0));
54513 break;
54515 case X86ISD::SHUFP: {
54516 // Add SHUFPD support if/when necessary.
54517 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
54518 llvm::all_of(Ops, [Op0](SDValue Op) {
54519 return Op.getOperand(2) == Op0.getOperand(2);
54520 })) {
54521 return DAG.getNode(Op0.getOpcode(), DL, VT,
54522 ConcatSubOperand(VT, Ops, 0),
54523 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
54525 break;
54527 case X86ISD::UNPCKH:
54528 case X86ISD::UNPCKL: {
54529 // Don't concatenate build_vector patterns.
54530 if (!IsSplat && EltSizeInBits >= 32 &&
54531 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
54532 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
54533 none_of(Ops, [](SDValue Op) {
54534 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
54535 ISD::SCALAR_TO_VECTOR ||
54536 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
54537 ISD::SCALAR_TO_VECTOR;
54538 })) {
54539 return DAG.getNode(Op0.getOpcode(), DL, VT,
54540 ConcatSubOperand(VT, Ops, 0),
54541 ConcatSubOperand(VT, Ops, 1));
54543 break;
54545 case X86ISD::PSHUFHW:
54546 case X86ISD::PSHUFLW:
54547 case X86ISD::PSHUFD:
54548 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
54549 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
54550 return DAG.getNode(Op0.getOpcode(), DL, VT,
54551 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
54553 [[fallthrough]];
54554 case X86ISD::VPERMILPI:
54555 if (!IsSplat && EltSizeInBits == 32 &&
54556 (VT.is256BitVector() ||
54557 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
54558 all_of(Ops, [&Op0](SDValue Op) {
54559 return Op0.getOperand(1) == Op.getOperand(1);
54560 })) {
54561 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
54562 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
54563 Res =
54564 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
54565 return DAG.getBitcast(VT, Res);
54567 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
54568 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
54569 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
54570 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
54571 return DAG.getNode(Op0.getOpcode(), DL, VT,
54572 ConcatSubOperand(VT, Ops, 0),
54573 DAG.getTargetConstant(Idx, DL, MVT::i8));
54575 break;
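// PSHUFB indexes bytes only within its own 128-bit lane and PSADBW sums
// within 64-bit chunks, so both fold to the wide opcode applied to the
// concatenated sources.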
54576 case X86ISD::PSHUFB:
54577 case X86ISD::PSADBW:
54578 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
54579 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
54580 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
54581 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
54582 NumOps * SrcVT.getVectorNumElements());
54583 return DAG.getNode(Op0.getOpcode(), DL, VT,
54584 ConcatSubOperand(SrcVT, Ops, 0),
54585 ConcatSubOperand(SrcVT, Ops, 1));
54587 break;
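// VPERMV concatenation merges the shuffle masks: indices from the second
// sub-op are rebased by the per-source element count before building one
// wide mask over concat(src0, src1).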
54588 case X86ISD::VPERMV:
54589 if (!IsSplat && NumOps == 2 &&
54590 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
54591 MVT OpVT = Op0.getSimpleValueType();
54592 int NumSrcElts = OpVT.getVectorNumElements();
54593 SmallVector<int, 64> ConcatMask;
54594 for (unsigned i = 0; i != NumOps; ++i) {
54595 SmallVector<int, 64> SubMask;
54596 SmallVector<SDValue, 2> SubOps;
54597 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
54598 SubMask))
54599 break;
54600 for (int M : SubMask) {
54601 if (0 <= M)
54602 M += i * NumSrcElts;
54603 ConcatMask.push_back(M);
54606 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
54607 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
54608 Ops[1].getOperand(1), DAG, DL);
54609 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
54610 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
54611 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
54612 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
54615 break;
54616 case X86ISD::VPERMV3:
54617 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
54618 MVT OpVT = Op0.getSimpleValueType();
54619 int NumSrcElts = OpVT.getVectorNumElements();
54620 SmallVector<int, 64> ConcatMask;
54621 for (unsigned i = 0; i != NumOps; ++i) {
54622 SmallVector<int, 64> SubMask;
54623 SmallVector<SDValue, 2> SubOps;
54624 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
54625 SubMask))
54626 break;
54627 for (int M : SubMask) {
54628 if (0 <= M) {
54629 M += M < NumSrcElts ? 0 : NumSrcElts;
54630 M += i * NumSrcElts;
54632 ConcatMask.push_back(M);
54635 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
54636 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
54637 Ops[1].getOperand(0), DAG, DL);
54638 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
54639 Ops[1].getOperand(2), DAG, DL);
54640 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
54641 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
54642 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
54643 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
54646 break;
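// Two VPERM2X128 nodes (without zeroed lanes) can merge into one 512-bit
// SHUF128: each original immediate contributes two 128-bit lane selectors to
// the combined four-lane mask.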
54647 case X86ISD::VPERM2X128: {
54648 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
54649 assert(NumOps == 2 && "Bad concat_vectors operands");
54650 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
54651 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
54652 // TODO: Handle zero'd subvectors.
54653 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
54654 int Mask[4] = {(int)(Imm0 & 0x3), (int)((Imm0 >> 4) & 0x3),
54655 (int)(Imm1 & 0x3), (int)((Imm1 >> 4) & 0x3)};
54656 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
54657 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
54658 Ops[0].getOperand(1), DAG, DL);
54659 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
54660 Ops[1].getOperand(1), DAG, DL);
54661 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
54662 DAG.getBitcast(ShuffleVT, LHS),
54663 DAG.getBitcast(ShuffleVT, RHS),
54664 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
54665 return DAG.getBitcast(VT, Res);
54668 break;
54670 case X86ISD::SHUF128: {
54671 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
54672 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
54673 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
54674 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
54675 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
54676 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
54677 Ops[0].getOperand(1), DAG, DL);
54678 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
54679 Ops[1].getOperand(1), DAG, DL);
54680 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
54681 DAG.getTargetConstant(Imm, DL, MVT::i8));
54683 break;
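// concat(trunc(x), trunc(y)) can become trunc(concat(x, y)) when the 512-bit
// source truncate (e.g. the AVX512 VPMOV* forms) is available and preferred.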
54685 case ISD::TRUNCATE:
54686 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
54687 EVT SrcVT = Ops[0].getOperand(0).getValueType();
54688 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
54689 SrcVT == Ops[1].getOperand(0).getValueType() &&
54690 Subtarget.useAVX512Regs() &&
54691 Subtarget.getPreferVectorWidth() >= 512 &&
54692 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
54693 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
54694 return DAG.getNode(ISD::TRUNCATE, DL, VT,
54695 ConcatSubOperand(NewSrcVT, Ops, 0));
54698 break;
54699 case X86ISD::VSHLI:
54700 case X86ISD::VSRLI:
54701 // Special case: SHL/SRL AVX1 V4i64 by 32 bits can lower as a shuffle.
54702 // TODO: Move this to LowerShiftByScalarImmediate?
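// e.g. for v4i64 << 32, viewed as v8i32 the even (low) dwords become zero and
// the odd (high) dwords take the old low dwords, hence the
// {8,0,8,2,8,4,8,6} shuffle with a zero vector below.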
54703 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
54704 llvm::all_of(Ops, [](SDValue Op) {
54705 return Op.getConstantOperandAPInt(1) == 32;
54706 })) {
54707 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
54708 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
54709 if (Op0.getOpcode() == X86ISD::VSHLI) {
54710 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
54711 {8, 0, 8, 2, 8, 4, 8, 6});
54712 } else {
54713 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
54714 {1, 8, 3, 8, 5, 8, 7, 8});
54716 return DAG.getBitcast(VT, Res);
54718 [[fallthrough]];
54719 case X86ISD::VSRAI:
54720 case X86ISD::VSHL:
54721 case X86ISD::VSRL:
54722 case X86ISD::VSRA:
54723 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
54724 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
54725 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
54726 llvm::all_of(Ops, [Op0](SDValue Op) {
54727 return Op0.getOperand(1) == Op.getOperand(1);
54728 })) {
54729 return DAG.getNode(Op0.getOpcode(), DL, VT,
54730 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
54732 break;
54733 case X86ISD::VPERMI:
54734 case X86ISD::VROTLI:
54735 case X86ISD::VROTRI:
54736 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
54737 llvm::all_of(Ops, [Op0](SDValue Op) {
54738 return Op0.getOperand(1) == Op.getOperand(1);
54739 })) {
54740 return DAG.getNode(Op0.getOpcode(), DL, VT,
54741 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
54743 break;
54744 case ISD::AND:
54745 case ISD::OR:
54746 case ISD::XOR:
54747 case X86ISD::ANDNP:
54748 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
54749 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
54750 return DAG.getNode(Op0.getOpcode(), DL, VT,
54751 ConcatSubOperand(VT, Ops, 0),
54752 ConcatSubOperand(VT, Ops, 1));
54754 break;
54755 case X86ISD::PCMPEQ:
54756 case X86ISD::PCMPGT:
54757 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256() &&
54758 (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
54759 return DAG.getNode(Op0.getOpcode(), DL, VT,
54760 ConcatSubOperand(VT, Ops, 0),
54761 ConcatSubOperand(VT, Ops, 1));
54763 break;
54764 case ISD::CTPOP:
54765 case ISD::CTTZ:
54766 case ISD::CTLZ:
54767 case ISD::CTTZ_ZERO_UNDEF:
54768 case ISD::CTLZ_ZERO_UNDEF:
54769 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
54770 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
54771 return DAG.getNode(Op0.getOpcode(), DL, VT,
54772 ConcatSubOperand(VT, Ops, 0));
54774 break;
54775 case X86ISD::GF2P8AFFINEQB:
54776 if (!IsSplat &&
54777 (VT.is256BitVector() ||
54778 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
54779 llvm::all_of(Ops, [Op0](SDValue Op) {
54780 return Op0.getOperand(2) == Op.getOperand(2);
54781 })) {
54782 return DAG.getNode(Op0.getOpcode(), DL, VT,
54783 ConcatSubOperand(VT, Ops, 0),
54784 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
54786 break;
54787 case ISD::ADD:
54788 case ISD::SUB:
54789 case ISD::MUL:
54790 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
54791 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
54792 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
54793 return DAG.getNode(Op0.getOpcode(), DL, VT,
54794 ConcatSubOperand(VT, Ops, 0),
54795 ConcatSubOperand(VT, Ops, 1));
54797 break;
54798 // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
54799 // their latencies are short, we don't replace them here.
54800 case ISD::FDIV:
54801 if (!IsSplat && (VT.is256BitVector() ||
54802 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
54803 return DAG.getNode(Op0.getOpcode(), DL, VT,
54804 ConcatSubOperand(VT, Ops, 0),
54805 ConcatSubOperand(VT, Ops, 1));
54807 break;
54808 case X86ISD::HADD:
54809 case X86ISD::HSUB:
54810 case X86ISD::FHADD:
54811 case X86ISD::FHSUB:
54812 if (!IsSplat && VT.is256BitVector() &&
54813 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
54814 return DAG.getNode(Op0.getOpcode(), DL, VT,
54815 ConcatSubOperand(VT, Ops, 0),
54816 ConcatSubOperand(VT, Ops, 1));
54818 break;
54819 case X86ISD::PACKSS:
54820 case X86ISD::PACKUS:
54821 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
54822 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
54823 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
54824 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
54825 NumOps * SrcVT.getVectorNumElements());
54826 return DAG.getNode(Op0.getOpcode(), DL, VT,
54827 ConcatSubOperand(SrcVT, Ops, 0),
54828 ConcatSubOperand(SrcVT, Ops, 1));
54830 break;
54831 case X86ISD::PALIGNR:
54832 if (!IsSplat &&
54833 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
54834 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
54835 llvm::all_of(Ops, [Op0](SDValue Op) {
54836 return Op0.getOperand(2) == Op.getOperand(2);
54837 })) {
54838 return DAG.getNode(Op0.getOpcode(), DL, VT,
54839 ConcatSubOperand(VT, Ops, 0),
54840 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
54842 break;
54843 case ISD::VSELECT:
54844 if (!IsSplat && Subtarget.hasAVX512() &&
54845 (VT.is256BitVector() ||
54846 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
54847 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
54848 EVT SelVT = Ops[0].getOperand(0).getValueType();
54849 if (SelVT.getVectorElementType() == MVT::i1) {
54850 SelVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
54851 NumOps * SelVT.getVectorNumElements());
54852 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
54853 return DAG.getNode(Op0.getOpcode(), DL, VT,
54854 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
54855 ConcatSubOperand(VT, Ops, 1),
54856 ConcatSubOperand(VT, Ops, 2));
54859 [[fallthrough]];
54860 case X86ISD::BLENDV:
54861 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
54862 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
54863 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
54864 EVT SelVT = Ops[0].getOperand(0).getValueType();
54865 SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext());
54866 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
54867 return DAG.getNode(Op0.getOpcode(), DL, VT,
54868 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
54869 ConcatSubOperand(VT, Ops, 1),
54870 ConcatSubOperand(VT, Ops, 2));
54872 break;
54876 // Fold subvector loads into one.
54877 // If needed, look through bitcasts to get to the load.
54878 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
54879 unsigned Fast;
54880 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
54881 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
54882 *FirstLd->getMemOperand(), &Fast) &&
54883 Fast) {
54884 if (SDValue Ld =
54885 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
54886 return Ld;
54890 // Attempt to fold target constant loads.
54891 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
54892 SmallVector<APInt> EltBits;
54893 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
54894 for (unsigned I = 0; I != NumOps; ++I) {
54895 APInt OpUndefElts;
54896 SmallVector<APInt> OpEltBits;
54897 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
54898 OpEltBits, true, false))
54899 break;
54900 EltBits.append(OpEltBits);
54901 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
54903 if (EltBits.size() == VT.getVectorNumElements())
54904 return getConstVector(EltBits, UndefElts, VT, DAG, DL);
54907 // If we're splatting a 128-bit subvector to 512 bits, use SHUF128 directly.
54908 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
54909 Subtarget.useAVX512Regs()) {
54910 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
54911 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
54912 Res = DAG.getBitcast(ShuffleVT, Res);
54913 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
54914 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
54915 return DAG.getBitcast(VT, Res);
54918 return SDValue();
54921 static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
54922 TargetLowering::DAGCombinerInfo &DCI,
54923 const X86Subtarget &Subtarget) {
54924 EVT VT = N->getValueType(0);
54925 EVT SrcVT = N->getOperand(0).getValueType();
54926 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54927 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
54929 if (VT.getVectorElementType() == MVT::i1) {
54930 // Attempt to constant fold.
54931 unsigned SubSizeInBits = SrcVT.getSizeInBits();
54932 APInt Constant = APInt::getZero(VT.getSizeInBits());
54933 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
54934 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
54935 if (!C) break;
54936 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
54937 if (I == (E - 1)) {
54938 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
54939 if (TLI.isTypeLegal(IntVT))
54940 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
54944 // Don't do anything else for i1 vectors.
54945 return SDValue();
54948 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
54949 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
54950 DCI, Subtarget))
54951 return R;
54954 return SDValue();
54957 static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
54958 TargetLowering::DAGCombinerInfo &DCI,
54959 const X86Subtarget &Subtarget) {
54960 if (DCI.isBeforeLegalizeOps())
54961 return SDValue();
54963 MVT OpVT = N->getSimpleValueType(0);
54965 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
54967 SDLoc dl(N);
54968 SDValue Vec = N->getOperand(0);
54969 SDValue SubVec = N->getOperand(1);
54971 uint64_t IdxVal = N->getConstantOperandVal(2);
54972 MVT SubVecVT = SubVec.getSimpleValueType();
54974 if (Vec.isUndef() && SubVec.isUndef())
54975 return DAG.getUNDEF(OpVT);
54977 // Inserting undefs/zeros into zeros/undefs is a zero vector.
54978 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
54979 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
54980 return getZeroVector(OpVT, Subtarget, DAG, dl);
54982 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
54983 // If we're inserting into a zero vector and then into a larger zero vector,
54984 // just insert into the larger zero vector directly.
54985 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
54986 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
54987 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
54988 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
54989 getZeroVector(OpVT, Subtarget, DAG, dl),
54990 SubVec.getOperand(1),
54991 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
54994 // If we're inserting into a zero vector and our input was extracted from an
54995 // insert into a zero vector of the same type, and the extraction is at
54996 // least as large as the original insertion, just insert the original
54997 // subvector into a zero vector.
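// e.g. insert_subvector(zeros, (extract_subvector (insert_subvector zeros, Y, 0), 0), 0)
// simplifies to insert_subvector(zeros, Y, 0) as long as Y is no wider than
// the extracted subvector.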
54998 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
54999 isNullConstant(SubVec.getOperand(1)) &&
55000 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
55001 SDValue Ins = SubVec.getOperand(0);
55002 if (isNullConstant(Ins.getOperand(2)) &&
55003 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
55004 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
55005 SubVecVT.getFixedSizeInBits())
55006 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55007 getZeroVector(OpVT, Subtarget, DAG, dl),
55008 Ins.getOperand(1), N->getOperand(2));
55012 // Stop here if this is an i1 vector.
55013 if (IsI1Vector)
55014 return SDValue();
55016 // Eliminate an intermediate vector widening:
55017 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
55018 // insert_subvector X, Y, Idx
55019 // TODO: This is a more general version of a DAGCombiner fold, can we move it
55020 // there?
55021 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
55022 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
55023 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
55024 SubVec.getOperand(1), N->getOperand(2));
55026 // If this is an insert of an extract, combine to a shuffle. Don't do this
55027 // if the insert or extract can be represented with a subregister operation.
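// e.g. with v8i32 operands, inserting (extract_subvector V2, 4) at index 0
// of V1 becomes shuffle(V1, V2, {12,13,14,15,4,5,6,7}).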
55028 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55029 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
55030 (IdxVal != 0 ||
55031 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
55032 int ExtIdxVal = SubVec.getConstantOperandVal(1);
55033 if (ExtIdxVal != 0) {
55034 int VecNumElts = OpVT.getVectorNumElements();
55035 int SubVecNumElts = SubVecVT.getVectorNumElements();
55036 SmallVector<int, 64> Mask(VecNumElts);
55037 // First create an identity shuffle mask.
55038 for (int i = 0; i != VecNumElts; ++i)
55039 Mask[i] = i;
55040 // Now insert the extracted portion.
55041 for (int i = 0; i != SubVecNumElts; ++i)
55042 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
55044 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
55048 // Match concat_vector style patterns.
55049 SmallVector<SDValue, 2> SubVectorOps;
55050 if (collectConcatOps(N, SubVectorOps, DAG)) {
55051 if (SDValue Fold =
55052 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
55053 return Fold;
55055 // If we're inserting all zeros into the upper half, change this to
55056 // a concat with zero. We will match this to a move
55057 // with implicit upper bit zeroing during isel.
55058 // We do this here because we don't want combineConcatVectorOps to
55059 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
55060 if (SubVectorOps.size() == 2 &&
55061 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
55062 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55063 getZeroVector(OpVT, Subtarget, DAG, dl),
55064 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
55066 // Attempt to recursively combine to a shuffle.
55067 if (all_of(SubVectorOps, [](SDValue SubOp) {
55068 return isTargetShuffle(SubOp.getOpcode());
55069 })) {
55070 SDValue Op(N, 0);
55071 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55072 return Res;
55076 // If this is a broadcast insert into an upper undef, use a larger broadcast.
55077 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
55078 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
55080 // If this is a broadcast load inserted into an upper undef, use a larger
55081 // broadcast load.
55082 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
55083 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
55084 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
55085 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
55086 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
55087 SDValue BcastLd =
55088 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
55089 MemIntr->getMemoryVT(),
55090 MemIntr->getMemOperand());
55091 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
55092 return BcastLd;
55095 // If we're splatting the lower half subvector of a full vector load into the
55096 // upper half, attempt to create a subvector broadcast.
55097 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
55098 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
55099 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
55100 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
55101 if (VecLd && SubLd &&
55102 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
55103 SubVec.getValueSizeInBits() / 8, 0))
55104 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
55105 SubLd, 0, DAG);
55108 return SDValue();
55111 /// If we are extracting a subvector of a vector select and the select condition
55112 /// is composed of concatenated vectors, try to narrow the select width. This
55113 /// is a common pattern for AVX1 integer code because 256-bit selects may be
55114 /// legal, but there is almost no integer math/logic available for 256-bit vectors.
55115 /// This function should only be called with legal types (otherwise, the calls
55116 /// to get simple value types will assert).
55117 static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
55118 SDValue Sel = Ext->getOperand(0);
55119 if (Sel.getOpcode() != ISD::VSELECT ||
55120 !isFreeToSplitVector(Sel.getOperand(0).getNode(), DAG))
55121 return SDValue();
55123 // Note: We assume simple value types because this should only be called with
55124 // legal operations/types.
55125 // TODO: This can be extended to handle extraction to 256-bits.
55126 MVT VT = Ext->getSimpleValueType(0);
55127 if (!VT.is128BitVector())
55128 return SDValue();
55130 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
55131 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
55132 return SDValue();
55134 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
55135 MVT SelVT = Sel.getSimpleValueType();
55136 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
55137 "Unexpected vector type with legal operations");
55139 unsigned SelElts = SelVT.getVectorNumElements();
55140 unsigned CastedElts = WideVT.getVectorNumElements();
55141 unsigned ExtIdx = Ext->getConstantOperandVal(1);
55142 if (SelElts % CastedElts == 0) {
55143 // The select has the same or more (narrower) elements than the extract
55144 // operand. The extraction index gets scaled by that factor.
55145 ExtIdx *= (SelElts / CastedElts);
55146 } else if (CastedElts % SelElts == 0) {
55147 // The select has less (wider) elements than the extract operand. Make sure
55148 // that the extraction index can be divided evenly.
55149 unsigned IndexDivisor = CastedElts / SelElts;
55150 if (ExtIdx % IndexDivisor != 0)
55151 return SDValue();
55152 ExtIdx /= IndexDivisor;
55153 } else {
55154 llvm_unreachable("Element count of simple vector types are not divisible?");
55157 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
55158 unsigned NarrowElts = SelElts / NarrowingFactor;
55159 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
55160 SDLoc DL(Ext);
55161 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
55162 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
55163 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
55164 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
55165 return DAG.getBitcast(VT, NarrowSel);
55168 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
55169 TargetLowering::DAGCombinerInfo &DCI,
55170 const X86Subtarget &Subtarget) {
55171 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
55172 // eventually get combined/lowered into ANDNP) with a concatenated operand,
55173 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
55174 // We let generic combining take over from there to simplify the
55175 // insert/extract and 'not'.
55176 // This pattern emerges during AVX1 legalization. We handle it before lowering
55177 // to avoid complications like splitting constant vector loads.
55179 // Capture the original wide type in the likely case that we need to bitcast
55180 // back to this type.
55181 if (!N->getValueType(0).isSimple())
55182 return SDValue();
55184 MVT VT = N->getSimpleValueType(0);
55185 SDValue InVec = N->getOperand(0);
55186 unsigned IdxVal = N->getConstantOperandVal(1);
55187 SDValue InVecBC = peekThroughBitcasts(InVec);
55188 EVT InVecVT = InVec.getValueType();
55189 unsigned SizeInBits = VT.getSizeInBits();
55190 unsigned InSizeInBits = InVecVT.getSizeInBits();
55191 unsigned NumSubElts = VT.getVectorNumElements();
55192 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55194 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
55195 TLI.isTypeLegal(InVecVT) &&
55196 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
55197 auto isConcatenatedNot = [](SDValue V) {
55198 V = peekThroughBitcasts(V);
55199 if (!isBitwiseNot(V))
55200 return false;
55201 SDValue NotOp = V->getOperand(0);
55202 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
55204 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
55205 isConcatenatedNot(InVecBC.getOperand(1))) {
55206 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
55207 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
55208 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
55209 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
55213 if (DCI.isBeforeLegalizeOps())
55214 return SDValue();
55216 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
55217 return V;
55219 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
55220 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
55222 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
55223 if (VT.getScalarType() == MVT::i1)
55224 return DAG.getConstant(1, SDLoc(N), VT);
55225 return getOnesVector(VT, DAG, SDLoc(N));
55228 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
55229 return DAG.getBuildVector(VT, SDLoc(N),
55230 InVec->ops().slice(IdxVal, NumSubElts));
55232 // If we are extracting from an insert into a larger vector, replace with a
55233 // smaller insert if we don't access less than the original subvector. Don't
55234 // do this for i1 vectors.
55235 // TODO: Relax the matching indices requirement?
55236 if (VT.getVectorElementType() != MVT::i1 &&
55237 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
55238 IdxVal == InVec.getConstantOperandVal(2) &&
55239 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
55240 SDLoc DL(N);
55241 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
55242 InVec.getOperand(0), N->getOperand(1));
55243 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
55244 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
55245 InVec.getOperand(1),
55246 DAG.getVectorIdxConstant(NewIdxVal, DL));
55249 // If we're extracting an upper subvector from a broadcast, we should just
55250 // extract the lowest subvector instead, which should allow
55251 // SimplifyDemandedVectorElts to do more simplifications.
55252 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
55253 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
55254 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
55255 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
55257 // If we're extracting a broadcasted subvector, just use the lowest subvector.
55258 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
55259 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
55260 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
55262 // Attempt to extract from the source of a shuffle vector.
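// e.g. if the input is a shuffle whose (scaled) mask moves whole subvectors,
// extracting subvector N reduces to undef, zero, or a direct extract of the
// matching subvector from one of the shuffle's sources.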
55263 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
55264 SmallVector<int, 32> ShuffleMask;
55265 SmallVector<int, 32> ScaledMask;
55266 SmallVector<SDValue, 2> ShuffleInputs;
55267 unsigned NumSubVecs = InSizeInBits / SizeInBits;
55268 // Decode the shuffle mask and scale it so it's shuffling whole subvectors.
55269 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
55270 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
55271 unsigned SubVecIdx = IdxVal / NumSubElts;
55272 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
55273 return DAG.getUNDEF(VT);
55274 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
55275 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
55276 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
55277 if (Src.getValueSizeInBits() == InSizeInBits) {
55278 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
55279 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
55280 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
55281 SDLoc(N), SizeInBits);
55286 // If we're extracting the lowest subvector and we're the only user of the
55287 // source, we may be able to perform this with a smaller vector width.
55288 unsigned InOpcode = InVec.getOpcode();
55289 if (InVec.hasOneUse()) {
55290 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
55291 // v2f64 CVTDQ2PD(v4i32).
55292 if (InOpcode == ISD::SINT_TO_FP &&
55293 InVec.getOperand(0).getValueType() == MVT::v4i32) {
55294 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
55296 // v2f64 CVTUDQ2PD(v4i32).
55297 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
55298 InVec.getOperand(0).getValueType() == MVT::v4i32) {
55299 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
55301 // v2f64 CVTPS2PD(v4f32).
55302 if (InOpcode == ISD::FP_EXTEND &&
55303 InVec.getOperand(0).getValueType() == MVT::v4f32) {
55304 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
55307 if (IdxVal == 0 &&
55308 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
55309 (SizeInBits == 128 || SizeInBits == 256) &&
55310 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
55311 SDLoc DL(N);
55312 SDValue Ext = InVec.getOperand(0);
55313 if (Ext.getValueSizeInBits() > SizeInBits)
55314 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
55315 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
55316 return DAG.getNode(ExtOp, DL, VT, Ext);
55318 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
55319 InVec.getOperand(0).getValueType().is256BitVector() &&
55320 InVec.getOperand(1).getValueType().is256BitVector() &&
55321 InVec.getOperand(2).getValueType().is256BitVector()) {
55322 SDLoc DL(N);
55323 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
55324 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
55325 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
55326 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
55328 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
55329 (VT.is128BitVector() || VT.is256BitVector())) {
55330 SDLoc DL(N);
55331 SDValue InVecSrc = InVec.getOperand(0);
55332 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
55333 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
55334 return DAG.getNode(InOpcode, DL, VT, Ext);
55336 if (InOpcode == X86ISD::MOVDDUP &&
55337 (VT.is128BitVector() || VT.is256BitVector())) {
55338 SDLoc DL(N);
55339 SDValue Ext0 =
55340 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
55341 return DAG.getNode(InOpcode, DL, VT, Ext0);
55345 // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
55346 // as this is very likely to fold into a shuffle/truncation.
55347 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
55348 InVecVT.getScalarSizeInBits() == 64 &&
55349 InVec.getConstantOperandAPInt(1) == 32) {
55350 SDLoc DL(N);
55351 SDValue Ext =
55352 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
55353 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
55356 return SDValue();
55359 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
55360 EVT VT = N->getValueType(0);
55361 SDValue Src = N->getOperand(0);
55362 SDLoc DL(N);
55364 // If this is a scalar_to_vector to v1i1 from an AND with 1, bypass the AND.
55365 // This occurs frequently in our masked scalar intrinsic code and our
55366 // floating point select lowering with AVX512.
55367 // TODO: SimplifyDemandedBits instead?
55368 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
55369 isOneConstant(Src.getOperand(1)))
55370 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
55372 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
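// e.g. (v1i1 (scalar_to_vector (extract_vector_elt vXi1 Mask, 0))) becomes
// (v1i1 (extract_subvector Mask, 0)).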
55373 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
55374 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
55375 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55376 isNullConstant(Src.getOperand(1)))
55377 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
55378 Src.getOperand(1));
55380 // Reduce v2i64 to v4i32 if we don't need the upper bits, or they are known to be zero.
55381 // TODO: Move to DAGCombine/SimplifyDemandedBits?
55382 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
55383 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
55384 if (Op.getValueType() != MVT::i64)
55385 return SDValue();
55386 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
55387 if (Op.getOpcode() == Opc &&
55388 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
55389 return Op.getOperand(0);
55390 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
55391 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
55392 if (Ld->getExtensionType() == Ext &&
55393 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
55394 return Op;
55395 if (IsZeroExt) {
55396 KnownBits Known = DAG.computeKnownBits(Op);
55397 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
55398 return Op;
55400 return SDValue();
55403 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
55404 return DAG.getBitcast(
55405 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
55406 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
55408 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
55409 return DAG.getBitcast(
55410 VT,
55411 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
55412 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
55413 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
55416 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
55417 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
55418 Src.getOperand(0).getValueType() == MVT::x86mmx)
55419 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
55421 // See if we're broadcasting the scalar value, in which case just reuse that
55422 // broadcast. Make sure it uses this very SDValue, not another node result.
55423 if (VT.getScalarType() == Src.getValueType())
55424 for (SDNode *User : Src->uses())
55425 if (User->getOpcode() == X86ISD::VBROADCAST &&
55426 Src == User->getOperand(0)) {
55427 unsigned SizeInBits = VT.getFixedSizeInBits();
55428 unsigned BroadcastSizeInBits =
55429 User->getValueSizeInBits(0).getFixedValue();
55430 if (BroadcastSizeInBits == SizeInBits)
55431 return SDValue(User, 0);
55432 if (BroadcastSizeInBits > SizeInBits)
55433 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
55434 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
55435 // coverage.
55438 return SDValue();
55441 // Simplify PMULDQ and PMULUDQ operations.
55442 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
55443 TargetLowering::DAGCombinerInfo &DCI,
55444 const X86Subtarget &Subtarget) {
55445 SDValue LHS = N->getOperand(0);
55446 SDValue RHS = N->getOperand(1);
55448 // Canonicalize constant to RHS.
55449 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
55450 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
55451 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
55453 // Multiply by zero.
55454 // Don't return RHS as it may contain UNDEFs.
55455 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
55456 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
55458 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
55459 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55460 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
55461 return SDValue(N, 0);
55463 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
55464 // convert it to any_extend_invec, due to the LegalOperations check, do the
55465 // conversion directly to a vector shuffle manually. This exposes combine
55466 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
55467 // combineX86ShufflesRecursively on SSE4.1 targets.
55468 // FIXME: This is basically a hack around several other issues related to
55469 // ANY_EXTEND_VECTOR_INREG.
55470 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
55471 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
55472 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
55473 LHS.getOperand(0).getValueType() == MVT::v4i32) {
55474 SDLoc dl(N);
55475 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
55476 LHS.getOperand(0), { 0, -1, 1, -1 });
55477 LHS = DAG.getBitcast(MVT::v2i64, LHS);
55478 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
55480 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
55481 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
55482 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
55483 RHS.getOperand(0).getValueType() == MVT::v4i32) {
55484 SDLoc dl(N);
55485 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
55486 RHS.getOperand(0), { 0, -1, 1, -1 });
55487 RHS = DAG.getBitcast(MVT::v2i64, RHS);
55488 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
55491 return SDValue();
55494 // Simplify VPMADDUBSW/VPMADDWD operations.
55495 static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
55496 TargetLowering::DAGCombinerInfo &DCI) {
55497 EVT VT = N->getValueType(0);
55498 SDValue LHS = N->getOperand(0);
55499 SDValue RHS = N->getOperand(1);
55501 // Multiply by zero.
55502 // Don't return LHS/RHS as it may contain UNDEFs.
55503 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
55504 ISD::isBuildVectorAllZeros(RHS.getNode()))
55505 return DAG.getConstant(0, SDLoc(N), VT);
55507 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55508 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55509 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55510 return SDValue(N, 0);
55512 return SDValue();
55515 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
55516 TargetLowering::DAGCombinerInfo &DCI,
55517 const X86Subtarget &Subtarget) {
55518 EVT VT = N->getValueType(0);
55519 SDValue In = N->getOperand(0);
55520 unsigned Opcode = N->getOpcode();
55521 unsigned InOpcode = In.getOpcode();
55522 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55523 SDLoc DL(N);
55525 // Try to merge vector loads and extend_inreg to an extload.
55526 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
55527 In.hasOneUse()) {
55528 auto *Ld = cast<LoadSDNode>(In);
55529 if (Ld->isSimple()) {
55530 MVT SVT = In.getSimpleValueType().getVectorElementType();
55531 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
55532 ? ISD::SEXTLOAD
55533 : ISD::ZEXTLOAD;
55534 EVT MemVT = VT.changeVectorElementType(SVT);
55535 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
55536 SDValue Load = DAG.getExtLoad(
55537 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
55538 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
55539 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
55540 return Load;
55545 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
55546 if (Opcode == InOpcode)
55547 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
55549 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
55550 // -> EXTEND_VECTOR_INREG(X).
55551 // TODO: Handle non-zero subvector indices.
55552 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
55553 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
55554 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
55555 In.getValueSizeInBits())
55556 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
55558 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
55559 // TODO: Move to DAGCombine?
55560 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
55561 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
55562 In.getValueSizeInBits() == VT.getSizeInBits()) {
55563 unsigned NumElts = VT.getVectorNumElements();
55564 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
55565 EVT EltVT = In.getOperand(0).getValueType();
55566 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
55567 for (unsigned I = 0; I != NumElts; ++I)
55568 Elts[I * Scale] = In.getOperand(I);
55569 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
55572 // Attempt to combine as a shuffle on SSE41+ targets.
55573 if (Subtarget.hasSSE41()) {
55574 SDValue Op(N, 0);
55575 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
55576 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55577 return Res;
55580 return SDValue();
55583 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
55584 TargetLowering::DAGCombinerInfo &DCI) {
55585 EVT VT = N->getValueType(0);
55587 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
55588 return DAG.getConstant(0, SDLoc(N), VT);
55590 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55591 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55592 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55593 return SDValue(N, 0);
55595 return SDValue();
55598 // Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
55599 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16
55600 // produce extra instructions between the conversions due to going to scalar and back.
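// Roughly: f32 -> scalar_to_vector v4f32 -> CVTPS2PH (imm 4) -> CVTPH2PS ->
// extract element 0, keeping the value in vector registers throughout.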
55601 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
55602 const X86Subtarget &Subtarget) {
55603 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
55604 return SDValue();
55606 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
55607 return SDValue();
55609 if (N->getValueType(0) != MVT::f32 ||
55610 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
55611 return SDValue();
55613 SDLoc dl(N);
55614 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
55615 N->getOperand(0).getOperand(0));
55616 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
55617 DAG.getTargetConstant(4, dl, MVT::i32));
55618 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
55619 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
55620 DAG.getIntPtrConstant(0, dl));
55623 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
55624 const X86Subtarget &Subtarget) {
55625 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
55626 return SDValue();
55628 if (Subtarget.hasFP16())
55629 return SDValue();
55631 bool IsStrict = N->isStrictFPOpcode();
55632 EVT VT = N->getValueType(0);
55633 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55634 EVT SrcVT = Src.getValueType();
55636 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
55637 return SDValue();
55639 if (VT.getVectorElementType() != MVT::f32 &&
55640 VT.getVectorElementType() != MVT::f64)
55641 return SDValue();
55643 unsigned NumElts = VT.getVectorNumElements();
55644 if (NumElts == 1 || !isPowerOf2_32(NumElts))
55645 return SDValue();
55647 SDLoc dl(N);
55649 // Convert the input to vXi16.
55650 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
55651 Src = DAG.getBitcast(IntVT, Src);
55653 // Widen to at least 8 input elements.
55654 if (NumElts < 8) {
55655 unsigned NumConcats = 8 / NumElts;
55656 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
55657 : DAG.getConstant(0, dl, IntVT);
55658 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
55659 Ops[0] = Src;
55660 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
55663 // Destination is vXf32 with at least 4 elements.
55664 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
55665 std::max(4U, NumElts));
55666 SDValue Cvt, Chain;
55667 if (IsStrict) {
55668 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
55669 {N->getOperand(0), Src});
55670 Chain = Cvt.getValue(1);
55671 } else {
55672 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
55675 if (NumElts < 4) {
55676 assert(NumElts == 2 && "Unexpected size");
55677 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
55678 DAG.getIntPtrConstant(0, dl));
55681 if (IsStrict) {
55682 // Extend to the original VT if necessary.
55683 if (Cvt.getValueType() != VT) {
55684 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
55685 {Chain, Cvt});
55686 Chain = Cvt.getValue(1);
55688 return DAG.getMergeValues({Cvt, Chain}, dl);
55691 // Extend to the original VT if necessary.
55692 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
55695 // Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
55696 // from. Limit this to cases where the loads have the same input chain and the
55697 // output chains are unused. This avoids any memory ordering issues.
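// e.g. if another broadcast of the same kind, pointer and chain produces a
// wider result, extract our narrower value from it and reuse its chain.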
55698 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
55699 TargetLowering::DAGCombinerInfo &DCI) {
55700 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
55701 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
55702 "Unknown broadcast load type");
55704 // Only do this if the chain result is unused.
55705 if (N->hasAnyUseOfValue(1))
55706 return SDValue();
55708 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
55710 SDValue Ptr = MemIntrin->getBasePtr();
55711 SDValue Chain = MemIntrin->getChain();
55712 EVT VT = N->getSimpleValueType(0);
55713 EVT MemVT = MemIntrin->getMemoryVT();
55715 // Look at other users of our base pointer and try to find a wider broadcast.
55716 // The input chain and the size of the memory VT must match.
55717 for (SDNode *User : Ptr->uses())
55718 if (User != N && User->getOpcode() == N->getOpcode() &&
55719 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
55720 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
55721 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
55722 MemVT.getSizeInBits() &&
55723 !User->hasAnyUseOfValue(1) &&
55724 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
55725 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
55726 VT.getSizeInBits());
55727 Extract = DAG.getBitcast(VT, Extract);
55728 return DCI.CombineTo(N, Extract, SDValue(User, 1));
55731 return SDValue();
55734 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
55735 const X86Subtarget &Subtarget) {
55736 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
55737 return SDValue();
55739 bool IsStrict = N->isStrictFPOpcode();
55740 EVT VT = N->getValueType(0);
55741 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55742 EVT SrcVT = Src.getValueType();
55744 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
55745 SrcVT.getVectorElementType() != MVT::f32)
55746 return SDValue();
55748 SDLoc dl(N);
55750 SDValue Cvt, Chain;
55751 unsigned NumElts = VT.getVectorNumElements();
55752 if (Subtarget.hasFP16()) {
55753 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
55754 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
55755 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
55756 SDValue Cvt0, Cvt1;
55757 SDValue Op0 = Src.getOperand(0);
55758 SDValue Op1 = Src.getOperand(1);
55759 bool IsOp0Strict = Op0->isStrictFPOpcode();
55760 if (Op0.getOpcode() != Op1.getOpcode() ||
55761 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
55762 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
55763 return SDValue();
55765 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
55766 if (IsStrict) {
55767 assert(IsOp0Strict && "Op0 must be strict node");
55768 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
55769 ? X86ISD::STRICT_CVTSI2P
55770 : X86ISD::STRICT_CVTUI2P;
55771 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
55772 {Op0.getOperand(0), Op0.getOperand(1)});
55773 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
55774 {Op1.getOperand(0), Op1.getOperand(1)});
55775 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
55776 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
55778 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
55779 : X86ISD::CVTUI2P;
55780 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
55781 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
55782 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
55784 return SDValue();
55787 if (NumElts == 1 || !isPowerOf2_32(NumElts))
55788 return SDValue();
55790 // Widen to at least 4 input elements.
55791 if (NumElts < 4)
55792 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
55793 DAG.getConstantFP(0.0, dl, SrcVT));
55795 // Destination is v8i16 with at least 8 elements.
55796 EVT CvtVT =
55797 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
55798 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
55799 if (IsStrict) {
55800 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
55801 {N->getOperand(0), Src, Rnd});
55802 Chain = Cvt.getValue(1);
55803 } else {
55804 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
55807 // Extract down to real number of elements.
55808 if (NumElts < 8) {
55809 EVT IntVT = VT.changeVectorElementTypeToInteger();
55810 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
55811 DAG.getIntPtrConstant(0, dl));
55814 Cvt = DAG.getBitcast(VT, Cvt);
55816 if (IsStrict)
55817 return DAG.getMergeValues({Cvt, Chain}, dl);
55819 return Cvt;
55822 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
55823 SDValue Src = N->getOperand(0);
55825 // Turn MOVDQ2Q+simple_load into an mmx load.
55826 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55827 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
55829 if (LN->isSimple()) {
55830 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
55831 LN->getBasePtr(),
55832 LN->getPointerInfo(),
55833 LN->getOriginalAlign(),
55834 LN->getMemOperand()->getFlags());
55835 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
55836 return NewLd;
55840 return SDValue();
55843 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
55844 TargetLowering::DAGCombinerInfo &DCI) {
55845 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
55846 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55847 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
55848 return SDValue(N, 0);
55850 return SDValue();
55853 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
55854 DAGCombinerInfo &DCI) const {
55855 SelectionDAG &DAG = DCI.DAG;
55856 switch (N->getOpcode()) {
55857 default: break;
55858 case ISD::SCALAR_TO_VECTOR:
55859 return combineScalarToVector(N, DAG);
55860 case ISD::EXTRACT_VECTOR_ELT:
55861 case X86ISD::PEXTRW:
55862 case X86ISD::PEXTRB:
55863 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
55864 case ISD::CONCAT_VECTORS:
55865 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
55866 case ISD::INSERT_SUBVECTOR:
55867 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
55868 case ISD::EXTRACT_SUBVECTOR:
55869 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
55870 case ISD::VSELECT:
55871 case ISD::SELECT:
55872 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
55873 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
55874 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
55875 case X86ISD::CMP: return combineCMP(N, DAG, Subtarget);
55876 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
55877 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
55878 case X86ISD::ADD:
55879 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
55880 case X86ISD::SBB: return combineSBB(N, DAG);
55881 case X86ISD::ADC: return combineADC(N, DAG, DCI);
55882 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
55883 case ISD::SHL: return combineShiftLeft(N, DAG);
55884 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
55885 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
55886 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
55887 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
55888 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
55889 case X86ISD::BEXTR:
55890 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
55891 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
55892 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
55893 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
55894 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
55895 case X86ISD::VEXTRACT_STORE:
55896 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
55897 case ISD::SINT_TO_FP:
55898 case ISD::STRICT_SINT_TO_FP:
55899 return combineSIntToFP(N, DAG, DCI, Subtarget);
55900 case ISD::UINT_TO_FP:
55901 case ISD::STRICT_UINT_TO_FP:
55902 return combineUIntToFP(N, DAG, Subtarget);
55903 case ISD::FADD:
55904 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
55905 case X86ISD::VFCMULC:
55906 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
55907 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
55908 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
55909 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
55910 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
55911 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
55912 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
55913 case X86ISD::FXOR:
55914 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
55915 case X86ISD::FMIN:
55916 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
55917 case ISD::FMINNUM:
55918 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
55919 case X86ISD::CVTSI2P:
55920 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
55921 case X86ISD::CVTP2SI:
55922 case X86ISD::CVTP2UI:
55923 case X86ISD::STRICT_CVTTP2SI:
55924 case X86ISD::CVTTP2SI:
55925 case X86ISD::STRICT_CVTTP2UI:
55926 case X86ISD::CVTTP2UI:
55927 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
55928 case X86ISD::STRICT_CVTPH2PS:
55929 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
55930 case X86ISD::BT: return combineBT(N, DAG, DCI);
55931 case ISD::ANY_EXTEND:
55932 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
55933 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
55934 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
55935 case ISD::ANY_EXTEND_VECTOR_INREG:
55936 case ISD::SIGN_EXTEND_VECTOR_INREG:
55937 case ISD::ZERO_EXTEND_VECTOR_INREG:
55938 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
55939 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
55940 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
55941 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
55942 case X86ISD::PACKSS:
55943 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
55944 case X86ISD::HADD:
55945 case X86ISD::HSUB:
55946 case X86ISD::FHADD:
55947 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
55948 case X86ISD::VSHL:
55949 case X86ISD::VSRA:
55950 case X86ISD::VSRL:
55951 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
55952 case X86ISD::VSHLI:
55953 case X86ISD::VSRAI:
55954 case X86ISD::VSRLI:
55955 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
55956 case ISD::INSERT_VECTOR_ELT:
55957 case X86ISD::PINSRB:
55958 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
55959 case X86ISD::SHUFP: // Handle all target specific shuffles
55960 case X86ISD::INSERTPS:
55961 case X86ISD::EXTRQI:
55962 case X86ISD::INSERTQI:
55963 case X86ISD::VALIGN:
55964 case X86ISD::PALIGNR:
55965 case X86ISD::VSHLDQ:
55966 case X86ISD::VSRLDQ:
55967 case X86ISD::BLENDI:
55968 case X86ISD::UNPCKH:
55969 case X86ISD::UNPCKL:
55970 case X86ISD::MOVHLPS:
55971 case X86ISD::MOVLHPS:
55972 case X86ISD::PSHUFB:
55973 case X86ISD::PSHUFD:
55974 case X86ISD::PSHUFHW:
55975 case X86ISD::PSHUFLW:
55976 case X86ISD::MOVSHDUP:
55977 case X86ISD::MOVSLDUP:
55978 case X86ISD::MOVDDUP:
55979 case X86ISD::MOVSS:
55980 case X86ISD::MOVSD:
55981 case X86ISD::MOVSH:
55982 case X86ISD::VBROADCAST:
55983 case X86ISD::VPPERM:
55984 case X86ISD::VPERMI:
55985 case X86ISD::VPERMV:
55986 case X86ISD::VPERMV3:
55987 case X86ISD::VPERMIL2:
55988 case X86ISD::VPERMILPI:
55989 case X86ISD::VPERMILPV:
55990 case X86ISD::VPERM2X128:
55991 case X86ISD::SHUF128:
55992 case X86ISD::VZEXT_MOVL:
55993 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
55994 case X86ISD::FMADD_RND:
55995 case X86ISD::FMSUB:
55996 case X86ISD::STRICT_FMSUB:
55997 case X86ISD::FMSUB_RND:
55998 case X86ISD::FNMADD:
55999 case X86ISD::STRICT_FNMADD:
56000 case X86ISD::FNMADD_RND:
56001 case X86ISD::FNMSUB:
56002 case X86ISD::STRICT_FNMSUB:
56003 case X86ISD::FNMSUB_RND:
56004 case ISD::FMA:
56005 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
56006 case X86ISD::FMADDSUB_RND:
56007 case X86ISD::FMSUBADD_RND:
56008 case X86ISD::FMADDSUB:
56009 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
56010 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
56011 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
56012 case X86ISD::MGATHER:
56013 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
56014 case ISD::MGATHER:
56015 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
56016 case X86ISD::PCMPEQ:
56017 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
56018 case X86ISD::PMULDQ:
56019 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
56020 case X86ISD::VPMADDUBSW:
56021 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
56022 case X86ISD::KSHIFTL:
56023 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
56024 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
56025 case ISD::STRICT_FP_EXTEND:
56026 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
56027 case ISD::STRICT_FP_ROUND:
56028 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
56029 case X86ISD::VBROADCAST_LOAD:
56030 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
56031 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
56032 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
56035 return SDValue();
56038 bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
56039 return false;
56042 // Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
56043 bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
56044 EVT ExtVT) const {
56045 return Subtarget.hasAVX512() || !VT.isVector();
56048 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
56049 if (!isTypeLegal(VT))
56050 return false;
56052 // There are no vXi8 shifts.
56053 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
56054 return false;
56056 // TODO: Almost no 8-bit ops are desirable because they have no actual
56057 // size/speed advantages vs. 32-bit ops, but they do have a major
56058 // potential disadvantage by causing partial register stalls.
56060 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
56061 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
56062 // Also, see the comment in "IsDesirableToPromoteOp", where we additionally
56063 // check for a constant operand to the multiply.
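// For illustration, a 32-bit multiply by a small constant is typically
// re-materialized with LEA rather than IMUL, e.g. (roughly):
//   x * 5  -->  leal (%rax,%rax,4), %eax
// so keeping such operations at 32 bits costs nothing.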
56064 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
56065 return false;
56067 // i16 instruction encodings are longer and some i16 instructions are slow,
56068 // so those are not desirable.
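// For instance (illustrative encodings), 16-bit ALU forms carry the 0x66
// operand-size prefix in 32/64-bit code:
//   addw $1, %ax    ; 66 83 C0 01
//   addl $1, %eax   ;    83 C0 01
// so the 16-bit form is a byte longer for the same work.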
56069 if (VT == MVT::i16) {
56070 switch (Opc) {
56071 default:
56072 break;
56073 case ISD::LOAD:
56074 case ISD::SIGN_EXTEND:
56075 case ISD::ZERO_EXTEND:
56076 case ISD::ANY_EXTEND:
56077 case ISD::SHL:
56078 case ISD::SRA:
56079 case ISD::SRL:
56080 case ISD::SUB:
56081 case ISD::ADD:
56082 case ISD::MUL:
56083 case ISD::AND:
56084 case ISD::OR:
56085 case ISD::XOR:
56086 return false;
56090 // Any legal type not explicitly accounted for above is desirable.
56091 return true;
56094 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
56095 SDValue Value, SDValue Addr,
56096 int JTI,
56097 SelectionDAG &DAG) const {
56098 const Module *M = DAG.getMachineFunction().getMMI().getModule();
56099 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
56100 if (IsCFProtectionSupported) {
56101 // In case control-flow branch protection is enabled, we need to add
56102 // notrack prefix to the indirect branch.
56103 // In order to do that we create NT_BRIND SDNode.
56104 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
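// For example (sketch), with the "cf-protection-branch" module flag set
// (CET indirect branch tracking), the jump-table dispatch is expected to be
// emitted as something like:
//   notrack jmp *%rax
// i.e. an indirect jmp whose target is exempt from ENDBR checking.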
56105 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Value, dl);
56106 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, JTInfo, Addr);
56109 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
56112 TargetLowering::AndOrSETCCFoldKind
56113 X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
56114 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
56115 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
56116 EVT VT = LogicOp->getValueType(0);
56117 EVT OpVT = SETCC0->getOperand(0).getValueType();
56118 if (!VT.isInteger())
56119 return AndOrSETCCFoldKind::None;
56121 if (VT.isVector())
56122 return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
56123 (isOperationLegal(ISD::ABS, OpVT)
56124 ? AndOrSETCCFoldKind::ABS
56125 : AndOrSETCCFoldKind::None));
56127 // Don't use `NotAnd`: even though `not` generally has smaller code size than
56128 // `add`, `add` can lower to LEA, which can save moves / spills. Any case where
56129 // `NotAnd` applies, `AddAnd` does as well.
56130 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
56131 // if we change that to `andn Y, X`, it may be worth preferring `NotAnd` here.
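// To make the trade-off concrete (illustrative sequences, with X in %edi and
// Y in %esi): the current lowering mentioned above is roughly
//   notl  %edi
//   testl %esi, %edi
// whereas an ANDN-based lowering would fold the whole test into a single
//   andnl %esi, %edi, %eax
// which is why the TODO above suggests revisiting NotAnd in that case.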
56132 return AndOrSETCCFoldKind::AddAnd;
56135 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
56136 EVT VT = Op.getValueType();
56137 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
56138 isa<ConstantSDNode>(Op.getOperand(1));
56140 // i16 is legal, but undesirable since i16 instruction encodings are longer
56141 // and some i16 instructions are slow.
56142 // 8-bit multiply-by-constant can usually be expanded to something cheaper
56143 // using LEA and/or other ALU ops.
56144 if (VT != MVT::i16 && !Is8BitMulByConstant)
56145 return false;
56147 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
56148 if (!Op.hasOneUse())
56149 return false;
56150 SDNode *User = *Op->use_begin();
56151 if (!ISD::isNormalStore(User))
56152 return false;
56153 auto *Ld = cast<LoadSDNode>(Load);
56154 auto *St = cast<StoreSDNode>(User);
56155 return Ld->getBasePtr() == St->getBasePtr();
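// For example (sketch), a 16-bit read-modify-write such as
//   addw $5, (%rdi)        ; (store (add (load p), 5), p)
// can be selected as one memory-operand instruction; promoting the add to
// i32 first would break that folding, which is what this check guards.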
56158 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
56159 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
56160 return false;
56161 if (!Op.hasOneUse())
56162 return false;
56163 SDNode *User = *Op->use_begin();
56164 if (User->getOpcode() != ISD::ATOMIC_STORE)
56165 return false;
56166 auto *Ld = cast<AtomicSDNode>(Load);
56167 auto *St = cast<AtomicSDNode>(User);
56168 return Ld->getBasePtr() == St->getBasePtr();
56171 bool Commute = false;
56172 switch (Op.getOpcode()) {
56173 default: return false;
56174 case ISD::SIGN_EXTEND:
56175 case ISD::ZERO_EXTEND:
56176 case ISD::ANY_EXTEND:
56177 break;
56178 case ISD::SHL:
56179 case ISD::SRA:
56180 case ISD::SRL: {
56181 SDValue N0 = Op.getOperand(0);
56182 // Look out for (store (shl (load), x)).
56183 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
56184 return false;
56185 break;
56187 case ISD::ADD:
56188 case ISD::MUL:
56189 case ISD::AND:
56190 case ISD::OR:
56191 case ISD::XOR:
56192 Commute = true;
56193 [[fallthrough]];
56194 case ISD::SUB: {
56195 SDValue N0 = Op.getOperand(0);
56196 SDValue N1 = Op.getOperand(1);
56197 // Avoid disabling potential load folding opportunities.
56198 if (X86::mayFoldLoad(N1, Subtarget) &&
56199 (!Commute || !isa<ConstantSDNode>(N0) ||
56200 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
56201 return false;
56202 if (X86::mayFoldLoad(N0, Subtarget) &&
56203 ((Commute && !isa<ConstantSDNode>(N1)) ||
56204 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
56205 return false;
56206 if (IsFoldableAtomicRMW(N0, Op) ||
56207 (Commute && IsFoldableAtomicRMW(N1, Op)))
56208 return false;
56212 PVT = MVT::i32;
56213 return true;
56216 //===----------------------------------------------------------------------===//
56217 // X86 Inline Assembly Support
56218 //===----------------------------------------------------------------------===//
56220 // Helper to match a string against pieces separated by whitespace.
56221 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
56222 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
56224 for (StringRef Piece : Pieces) {
56225 if (!S.starts_with(Piece)) // Check if the piece matches.
56226 return false;
56228 S = S.substr(Piece.size());
56229 StringRef::size_type Pos = S.find_first_not_of(" \t");
56230 if (Pos == 0) // We only matched a prefix of a longer token.
56231 return false;
56233 S = S.substr(Pos);
56236 return S.empty();
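// Behavior sketch: matchAsm("bswap $0", {"bswap", "$0"}) matches, while
// matchAsm("bswapper $0", {"bswap", "$0"}) is rejected because "bswap" only
// matches a prefix of the first token.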
56239 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
56241 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
56242 if (llvm::is_contained(AsmPieces, "~{cc}") &&
56243 llvm::is_contained(AsmPieces, "~{flags}") &&
56244 llvm::is_contained(AsmPieces, "~{fpsr}")) {
56246 if (AsmPieces.size() == 3)
56247 return true;
56248 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
56249 return true;
56252 return false;
56255 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
56256 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
56258 const std::string &AsmStr = IA->getAsmString();
56260 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
56261 if (!Ty || Ty->getBitWidth() % 16 != 0)
56262 return false;
56264 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
56265 SmallVector<StringRef, 4> AsmPieces;
56266 SplitString(AsmStr, AsmPieces, ";\n");
56268 switch (AsmPieces.size()) {
56269 default: return false;
56270 case 1:
56271 // FIXME: this should verify that we are targeting a 486 or better. If not,
56272 // we will turn this bswap into something that will be lowered to logical
56273 // ops instead of emitting the bswap asm. For now, we don't support 486 or
56274 // lower so don't worry about this.
56275 // bswap $0
56276 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
56277 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
56278 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
56279 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
56280 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
56281 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
56282 // No need to check constraints; nothing other than the equivalent of
56283 // "=r,0" would be valid here.
56284 return IntrinsicLowering::LowerToByteSwap(CI);
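// As a concrete example (sketch), C code along the lines of
//   unsigned v; __asm__("bswap %0" : "=r"(v) : "0"(v));
// reaches this point as the single piece "bswap $0" with constraints "=r,0"
// and is rewritten into a call to llvm.bswap.i32 instead of emitting the asm.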
56287 // rorw $$8, ${0:w} --> llvm.bswap.i16
56288 if (CI->getType()->isIntegerTy(16) &&
56289 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
56290 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
56291 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
56292 AsmPieces.clear();
56293 StringRef ConstraintsStr = IA->getConstraintString();
56294 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
56295 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
56296 if (clobbersFlagRegisters(AsmPieces))
56297 return IntrinsicLowering::LowerToByteSwap(CI);
56299 break;
56300 case 3:
56301 if (CI->getType()->isIntegerTy(32) &&
56302 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
56303 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
56304 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
56305 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
56306 AsmPieces.clear();
56307 StringRef ConstraintsStr = IA->getConstraintString();
56308 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
56309 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
56310 if (clobbersFlagRegisters(AsmPieces))
56311 return IntrinsicLowering::LowerToByteSwap(CI);
56314 if (CI->getType()->isIntegerTy(64)) {
56315 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
56316 if (Constraints.size() >= 2 &&
56317 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
56318 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
56319 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
56320 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
56321 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
56322 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
56323 return IntrinsicLowering::LowerToByteSwap(CI);
56326 break;
56328 return false;
56331 static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
56332 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
56333 .Case("{@cca}", X86::COND_A)
56334 .Case("{@ccae}", X86::COND_AE)
56335 .Case("{@ccb}", X86::COND_B)
56336 .Case("{@ccbe}", X86::COND_BE)
56337 .Case("{@ccc}", X86::COND_B)
56338 .Case("{@cce}", X86::COND_E)
56339 .Case("{@ccz}", X86::COND_E)
56340 .Case("{@ccg}", X86::COND_G)
56341 .Case("{@ccge}", X86::COND_GE)
56342 .Case("{@ccl}", X86::COND_L)
56343 .Case("{@ccle}", X86::COND_LE)
56344 .Case("{@ccna}", X86::COND_BE)
56345 .Case("{@ccnae}", X86::COND_B)
56346 .Case("{@ccnb}", X86::COND_AE)
56347 .Case("{@ccnbe}", X86::COND_A)
56348 .Case("{@ccnc}", X86::COND_AE)
56349 .Case("{@ccne}", X86::COND_NE)
56350 .Case("{@ccnz}", X86::COND_NE)
56351 .Case("{@ccng}", X86::COND_LE)
56352 .Case("{@ccnge}", X86::COND_L)
56353 .Case("{@ccnl}", X86::COND_GE)
56354 .Case("{@ccnle}", X86::COND_G)
56355 .Case("{@ccno}", X86::COND_NO)
56356 .Case("{@ccnp}", X86::COND_NP)
56357 .Case("{@ccns}", X86::COND_NS)
56358 .Case("{@cco}", X86::COND_O)
56359 .Case("{@ccp}", X86::COND_P)
56360 .Case("{@ccs}", X86::COND_S)
56361 .Default(X86::COND_INVALID);
56362 return Cond;
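// These strings come from GCC-style flag-output constraints. For example
// (usage sketch), an output written as "=@ccz" in C inline asm shows up here
// as "{@ccz}" and maps to X86::COND_E, so the asm's ZF result can later be
// materialized with a SETcc.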
56365 /// Given a constraint letter, return the type of constraint for this target.
56366 X86TargetLowering::ConstraintType
56367 X86TargetLowering::getConstraintType(StringRef Constraint) const {
56368 if (Constraint.size() == 1) {
56369 switch (Constraint[0]) {
56370 case 'R':
56371 case 'q':
56372 case 'Q':
56373 case 'f':
56374 case 't':
56375 case 'u':
56376 case 'y':
56377 case 'x':
56378 case 'v':
56379 case 'l':
56380 case 'k': // AVX512 masking registers.
56381 return C_RegisterClass;
56382 case 'a':
56383 case 'b':
56384 case 'c':
56385 case 'd':
56386 case 'S':
56387 case 'D':
56388 case 'A':
56389 return C_Register;
56390 case 'I':
56391 case 'J':
56392 case 'K':
56393 case 'N':
56394 case 'G':
56395 case 'L':
56396 case 'M':
56397 return C_Immediate;
56398 case 'C':
56399 case 'e':
56400 case 'Z':
56401 return C_Other;
56402 default:
56403 break;
56406 else if (Constraint.size() == 2) {
56407 switch (Constraint[0]) {
56408 default:
56409 break;
56410 case 'Y':
56411 switch (Constraint[1]) {
56412 default:
56413 break;
56414 case 'z':
56415 return C_Register;
56416 case 'i':
56417 case 'm':
56418 case 'k':
56419 case 't':
56420 case '2':
56421 return C_RegisterClass;
56424 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
56425 return C_Other;
56426 return TargetLowering::getConstraintType(Constraint);
56429 /// Examine constraint type and operand type and determine a weight value.
56430 /// This object must already have been set up with the operand type
56431 /// and the current alternative constraint selected.
56432 TargetLowering::ConstraintWeight
56433 X86TargetLowering::getSingleConstraintMatchWeight(
56434 AsmOperandInfo &Info, const char *Constraint) const {
56435 ConstraintWeight Wt = CW_Invalid;
56436 Value *CallOperandVal = Info.CallOperandVal;
56437 // If we don't have a value, we can't do a match,
56438 // but allow it at the lowest weight.
56439 if (!CallOperandVal)
56440 return CW_Default;
56441 Type *Ty = CallOperandVal->getType();
56442 // Look at the constraint type.
56443 switch (*Constraint) {
56444 default:
56445 Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
56446 [[fallthrough]];
56447 case 'R':
56448 case 'q':
56449 case 'Q':
56450 case 'a':
56451 case 'b':
56452 case 'c':
56453 case 'd':
56454 case 'S':
56455 case 'D':
56456 case 'A':
56457 if (CallOperandVal->getType()->isIntegerTy())
56458 Wt = CW_SpecificReg;
56459 break;
56460 case 'f':
56461 case 't':
56462 case 'u':
56463 if (Ty->isFloatingPointTy())
56464 Wt = CW_SpecificReg;
56465 break;
56466 case 'y':
56467 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
56468 Wt = CW_SpecificReg;
56469 break;
56470 case 'Y':
56471 if (StringRef(Constraint).size() != 2)
56472 break;
56473 switch (Constraint[1]) {
56474 default:
56475 return CW_Invalid;
56476 // XMM0
56477 case 'z':
56478 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
56479 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
56480 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
56481 return CW_SpecificReg;
56482 return CW_Invalid;
56483 // Conditional OpMask regs (AVX512)
56484 case 'k':
56485 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
56486 return CW_Register;
56487 return CW_Invalid;
56488 // Any MMX reg
56489 case 'm':
56490 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
56491 return Wt;
56492 return CW_Invalid;
56493 // Any SSE reg when ISA >= SSE2, same as 'x'
56494 case 'i':
56495 case 't':
56496 case '2':
56497 if (!Subtarget.hasSSE2())
56498 return CW_Invalid;
56499 break;
56501 break;
56502 case 'v':
56503 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
56504 Wt = CW_Register;
56505 [[fallthrough]];
56506 case 'x':
56507 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
56508 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
56509 Wt = CW_Register;
56510 break;
56511 case 'k':
56512 // Enable conditional vector operations using %k<#> registers.
56513 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
56514 Wt = CW_Register;
56515 break;
56516 case 'I':
56517 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
56518 if (C->getZExtValue() <= 31)
56519 Wt = CW_Constant;
56520 break;
56521 case 'J':
56522 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
56523 if (C->getZExtValue() <= 63)
56524 Wt = CW_Constant;
56525 break;
56526 case 'K':
56527 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
56528 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
56529 Wt = CW_Constant;
56530 break;
56531 case 'L':
56532 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
56533 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
56534 Wt = CW_Constant;
56535 break;
56536 case 'M':
56537 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
56538 if (C->getZExtValue() <= 3)
56539 Wt = CW_Constant;
56540 break;
56541 case 'N':
56542 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
56543 if (C->getZExtValue() <= 0xff)
56544 Wt = CW_Constant;
56545 break;
56546 case 'G':
56547 case 'C':
56548 if (isa<ConstantFP>(CallOperandVal))
56549 Wt = CW_Constant;
56550 break;
56551 case 'e':
56552 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
56553 if ((C->getSExtValue() >= -0x80000000LL) &&
56554 (C->getSExtValue() <= 0x7fffffffLL))
56555 Wt = CW_Constant;
56556 break;
56557 case 'Z':
56558 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
56559 if (C->getZExtValue() <= 0xffffffff)
56560 Wt = CW_Constant;
56561 break;
56563 return Wt;
56566 /// Try to replace an X constraint, which matches anything, with another that
56567 /// has more specific requirements based on the type of the corresponding
56568 /// operand.
56569 const char *X86TargetLowering::
56570 LowerXConstraint(EVT ConstraintVT) const {
56571 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
56572 // 'f' like normal targets.
56573 if (ConstraintVT.isFloatingPoint()) {
56574 if (Subtarget.hasSSE1())
56575 return "x";
56578 return TargetLowering::LowerXConstraint(ConstraintVT);
56581 // Lower @cc targets via setcc.
56582 SDValue X86TargetLowering::LowerAsmOutputForConstraint(
56583 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
56584 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
56585 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
56586 if (Cond == X86::COND_INVALID)
56587 return SDValue();
56588 // Check that return type is valid.
56589 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
56590 OpInfo.ConstraintVT.getSizeInBits() < 8)
56591 report_fatal_error("Glue output operand is of invalid type");
56593 // Get EFLAGS register. Only update chain when copyfrom is glued.
56594 if (Glue.getNode()) {
56595 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
56596 Chain = Glue.getValue(1);
56597 } else
56598 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
56599 // Extract CC code.
56600 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
56601 // Zero-extend the setcc result to the constraint type.
56602 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
56604 return Result;
56607 /// Lower the specified operand into the Ops vector.
56608 /// If it is invalid, don't add anything to Ops.
56609 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
56610 StringRef Constraint,
56611 std::vector<SDValue> &Ops,
56612 SelectionDAG &DAG) const {
56613 SDValue Result;
56615 // Only support length 1 constraints for now.
56616 if (Constraint.size() > 1)
56617 return;
56619 char ConstraintLetter = Constraint[0];
56620 switch (ConstraintLetter) {
56621 default: break;
56622 case 'I':
56623 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56624 if (C->getZExtValue() <= 31) {
56625 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
56626 Op.getValueType());
56627 break;
56630 return;
56631 case 'J':
56632 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56633 if (C->getZExtValue() <= 63) {
56634 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
56635 Op.getValueType());
56636 break;
56639 return;
56640 case 'K':
56641 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56642 if (isInt<8>(C->getSExtValue())) {
56643 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
56644 Op.getValueType());
56645 break;
56648 return;
56649 case 'L':
56650 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56651 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
56652 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
56653 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
56654 Op.getValueType());
56655 break;
56658 return;
56659 case 'M':
56660 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56661 if (C->getZExtValue() <= 3) {
56662 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
56663 Op.getValueType());
56664 break;
56667 return;
56668 case 'N':
56669 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56670 if (C->getZExtValue() <= 255) {
56671 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
56672 Op.getValueType());
56673 break;
56676 return;
56677 case 'O':
56678 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56679 if (C->getZExtValue() <= 127) {
56680 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
56681 Op.getValueType());
56682 break;
56685 return;
56686 case 'e': {
56687 // 32-bit signed value
56688 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56689 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
56690 C->getSExtValue())) {
56691 // Widen to 64 bits here to get it sign extended.
56692 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
56693 break;
56695 // FIXME gcc accepts some relocatable values here too, but only in certain
56696 // memory models; it's complicated.
56698 return;
56700 case 'Z': {
56701 // 32-bit unsigned value
56702 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
56703 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
56704 C->getZExtValue())) {
56705 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
56706 Op.getValueType());
56707 break;
56710 // FIXME gcc accepts some relocatable values here too, but only in certain
56711 // memory models; it's complicated.
56712 return;
56714 case 'i': {
56715 // Literal immediates are always ok.
56716 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
56717 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
56718 BooleanContent BCont = getBooleanContents(MVT::i64);
56719 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
56720 : ISD::SIGN_EXTEND;
56721 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
56722 : CST->getSExtValue();
56723 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
56724 break;
56727 // In any sort of PIC mode addresses need to be computed at runtime by
56728 // adding in a register or some sort of table lookup. These can't
56729 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
56730 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
56731 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
56732 return;
56734 // If we are in non-pic codegen mode, we allow the address of a global (with
56735 // an optional displacement) to be used with 'i'.
56736 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
56737 // If we require an extra load to get this address, as in PIC mode, we
56738 // can't accept it.
56739 if (isGlobalStubReference(
56740 Subtarget.classifyGlobalReference(GA->getGlobal())))
56741 return;
56742 break;
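// For example (sketch), under GOT-style PIC something like
//   asm volatile ("" :: "i"(&some_global));   // some_global: hypothetical
// cannot be satisfied here, because the global's address is only available
// via a GOT load at runtime, so the operand is rejected rather than lowered.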
56746 if (Result.getNode()) {
56747 Ops.push_back(Result);
56748 return;
56750 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
56753 /// Check if \p RC is a general purpose register class.
56754 /// I.e., GR* or one of their variant.
56755 static bool isGRClass(const TargetRegisterClass &RC) {
56756 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
56757 RC.hasSuperClassEq(&X86::GR16RegClass) ||
56758 RC.hasSuperClassEq(&X86::GR32RegClass) ||
56759 RC.hasSuperClassEq(&X86::GR64RegClass) ||
56760 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
56763 /// Check if \p RC is a vector register class.
56764 /// I.e., FR* / VR* or one of their variant.
56765 static bool isFRClass(const TargetRegisterClass &RC) {
56766 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
56767 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
56768 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
56769 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
56770 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
56771 RC.hasSuperClassEq(&X86::VR512RegClass);
56774 /// Check if \p RC is a mask register class.
56775 /// I.e., VK* or one of their variant.
56776 static bool isVKClass(const TargetRegisterClass &RC) {
56777 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
56778 RC.hasSuperClassEq(&X86::VK2RegClass) ||
56779 RC.hasSuperClassEq(&X86::VK4RegClass) ||
56780 RC.hasSuperClassEq(&X86::VK8RegClass) ||
56781 RC.hasSuperClassEq(&X86::VK16RegClass) ||
56782 RC.hasSuperClassEq(&X86::VK32RegClass) ||
56783 RC.hasSuperClassEq(&X86::VK64RegClass);
56786 std::pair<unsigned, const TargetRegisterClass *>
56787 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
56788 StringRef Constraint,
56789 MVT VT) const {
56790 // First, see if this is a constraint that directly corresponds to an LLVM
56791 // register class.
56792 if (Constraint.size() == 1) {
56793 // GCC Constraint Letters
56794 switch (Constraint[0]) {
56795 default: break;
56796 // 'A' means [ER]AX + [ER]DX.
56797 case 'A':
56798 if (Subtarget.is64Bit())
56799 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
56800 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
56801 "Expecting 64, 32 or 16 bit subtarget");
56802 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
56804 // TODO: Slight differences here in allocation order and leaving
56805 // RIP in the class. Do they matter any more here than they do
56806 // in the normal allocation?
56807 case 'k':
56808 if (Subtarget.hasAVX512()) {
56809 if (VT == MVT::i1)
56810 return std::make_pair(0U, &X86::VK1RegClass);
56811 if (VT == MVT::i8)
56812 return std::make_pair(0U, &X86::VK8RegClass);
56813 if (VT == MVT::i16)
56814 return std::make_pair(0U, &X86::VK16RegClass);
56816 if (Subtarget.hasBWI()) {
56817 if (VT == MVT::i32)
56818 return std::make_pair(0U, &X86::VK32RegClass);
56819 if (VT == MVT::i64)
56820 return std::make_pair(0U, &X86::VK64RegClass);
56822 break;
56823 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
56824 if (Subtarget.is64Bit()) {
56825 if (VT == MVT::i8 || VT == MVT::i1)
56826 return std::make_pair(0U, &X86::GR8RegClass);
56827 if (VT == MVT::i16)
56828 return std::make_pair(0U, &X86::GR16RegClass);
56829 if (VT == MVT::i32 || VT == MVT::f32)
56830 return std::make_pair(0U, &X86::GR32RegClass);
56831 if (VT != MVT::f80 && !VT.isVector())
56832 return std::make_pair(0U, &X86::GR64RegClass);
56833 break;
56835 [[fallthrough]];
56836 // 32-bit fallthrough
56837 case 'Q': // Q_REGS
56838 if (VT == MVT::i8 || VT == MVT::i1)
56839 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
56840 if (VT == MVT::i16)
56841 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
56842 if (VT == MVT::i32 || VT == MVT::f32 ||
56843 (!VT.isVector() && !Subtarget.is64Bit()))
56844 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
56845 if (VT != MVT::f80 && !VT.isVector())
56846 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
56847 break;
56848 case 'r': // GENERAL_REGS
56849 case 'l': // INDEX_REGS
56850 if (VT == MVT::i8 || VT == MVT::i1)
56851 return std::make_pair(0U, &X86::GR8RegClass);
56852 if (VT == MVT::i16)
56853 return std::make_pair(0U, &X86::GR16RegClass);
56854 if (VT == MVT::i32 || VT == MVT::f32 ||
56855 (!VT.isVector() && !Subtarget.is64Bit()))
56856 return std::make_pair(0U, &X86::GR32RegClass);
56857 if (VT != MVT::f80 && !VT.isVector())
56858 return std::make_pair(0U, &X86::GR64RegClass);
56859 break;
56860 case 'R': // LEGACY_REGS
56861 if (VT == MVT::i8 || VT == MVT::i1)
56862 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
56863 if (VT == MVT::i16)
56864 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
56865 if (VT == MVT::i32 || VT == MVT::f32 ||
56866 (!VT.isVector() && !Subtarget.is64Bit()))
56867 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
56868 if (VT != MVT::f80 && !VT.isVector())
56869 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
56870 break;
56871 case 'f': // FP Stack registers.
56872 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
56873 // value to the correct fpstack register class.
56874 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
56875 return std::make_pair(0U, &X86::RFP32RegClass);
56876 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
56877 return std::make_pair(0U, &X86::RFP64RegClass);
56878 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
56879 return std::make_pair(0U, &X86::RFP80RegClass);
56880 break;
56881 case 'y': // MMX_REGS if MMX allowed.
56882 if (!Subtarget.hasMMX()) break;
56883 return std::make_pair(0U, &X86::VR64RegClass);
56884 case 'v':
56885 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
56886 if (!Subtarget.hasSSE1()) break;
56887 bool VConstraint = (Constraint[0] == 'v');
56889 switch (VT.SimpleTy) {
56890 default: break;
56891 // Scalar SSE types.
56892 case MVT::f16:
56893 if (VConstraint && Subtarget.hasFP16())
56894 return std::make_pair(0U, &X86::FR16XRegClass);
56895 break;
56896 case MVT::f32:
56897 case MVT::i32:
56898 if (VConstraint && Subtarget.hasVLX())
56899 return std::make_pair(0U, &X86::FR32XRegClass);
56900 return std::make_pair(0U, &X86::FR32RegClass);
56901 case MVT::f64:
56902 case MVT::i64:
56903 if (VConstraint && Subtarget.hasVLX())
56904 return std::make_pair(0U, &X86::FR64XRegClass);
56905 return std::make_pair(0U, &X86::FR64RegClass);
56906 case MVT::i128:
56907 if (Subtarget.is64Bit()) {
56908 if (VConstraint && Subtarget.hasVLX())
56909 return std::make_pair(0U, &X86::VR128XRegClass);
56910 return std::make_pair(0U, &X86::VR128RegClass);
56912 break;
56913 // Vector types and fp128.
56914 case MVT::v8f16:
56915 if (!Subtarget.hasFP16())
56916 break;
56917 if (VConstraint)
56918 return std::make_pair(0U, &X86::VR128XRegClass);
56919 return std::make_pair(0U, &X86::VR128RegClass);
56920 case MVT::v8bf16:
56921 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
56922 break;
56923 if (VConstraint)
56924 return std::make_pair(0U, &X86::VR128XRegClass);
56925 return std::make_pair(0U, &X86::VR128RegClass);
56926 case MVT::f128:
56927 case MVT::v16i8:
56928 case MVT::v8i16:
56929 case MVT::v4i32:
56930 case MVT::v2i64:
56931 case MVT::v4f32:
56932 case MVT::v2f64:
56933 if (VConstraint && Subtarget.hasVLX())
56934 return std::make_pair(0U, &X86::VR128XRegClass);
56935 return std::make_pair(0U, &X86::VR128RegClass);
56936 // AVX types.
56937 case MVT::v16f16:
56938 if (!Subtarget.hasFP16())
56939 break;
56940 if (VConstraint)
56941 return std::make_pair(0U, &X86::VR256XRegClass);
56942 return std::make_pair(0U, &X86::VR256RegClass);
56943 case MVT::v16bf16:
56944 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
56945 break;
56946 if (VConstraint)
56947 return std::make_pair(0U, &X86::VR256XRegClass);
56948 return std::make_pair(0U, &X86::VR256RegClass);
56949 case MVT::v32i8:
56950 case MVT::v16i16:
56951 case MVT::v8i32:
56952 case MVT::v4i64:
56953 case MVT::v8f32:
56954 case MVT::v4f64:
56955 if (VConstraint && Subtarget.hasVLX())
56956 return std::make_pair(0U, &X86::VR256XRegClass);
56957 if (Subtarget.hasAVX())
56958 return std::make_pair(0U, &X86::VR256RegClass);
56959 break;
56960 case MVT::v32f16:
56961 if (!Subtarget.hasFP16())
56962 break;
56963 if (VConstraint)
56964 return std::make_pair(0U, &X86::VR512RegClass);
56965 return std::make_pair(0U, &X86::VR512_0_15RegClass);
56966 case MVT::v32bf16:
56967 if (!Subtarget.hasBF16())
56968 break;
56969 if (VConstraint)
56970 return std::make_pair(0U, &X86::VR512RegClass);
56971 return std::make_pair(0U, &X86::VR512_0_15RegClass);
56972 case MVT::v64i8:
56973 case MVT::v32i16:
56974 case MVT::v8f64:
56975 case MVT::v16f32:
56976 case MVT::v16i32:
56977 case MVT::v8i64:
56978 if (!Subtarget.hasAVX512()) break;
56979 if (VConstraint)
56980 return std::make_pair(0U, &X86::VR512RegClass);
56981 return std::make_pair(0U, &X86::VR512_0_15RegClass);
56983 break;
56985 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
56986 switch (Constraint[1]) {
56987 default:
56988 break;
56989 case 'i':
56990 case 't':
56991 case '2':
56992 return getRegForInlineAsmConstraint(TRI, "x", VT);
56993 case 'm':
56994 if (!Subtarget.hasMMX()) break;
56995 return std::make_pair(0U, &X86::VR64RegClass);
56996 case 'z':
56997 if (!Subtarget.hasSSE1()) break;
56998 switch (VT.SimpleTy) {
56999 default: break;
57000 // Scalar SSE types.
57001 case MVT::f16:
57002 if (!Subtarget.hasFP16())
57003 break;
57004 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
57005 case MVT::f32:
57006 case MVT::i32:
57007 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
57008 case MVT::f64:
57009 case MVT::i64:
57010 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
57011 case MVT::v8f16:
57012 if (!Subtarget.hasFP16())
57013 break;
57014 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
57015 case MVT::v8bf16:
57016 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
57017 break;
57018 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
57019 case MVT::f128:
57020 case MVT::v16i8:
57021 case MVT::v8i16:
57022 case MVT::v4i32:
57023 case MVT::v2i64:
57024 case MVT::v4f32:
57025 case MVT::v2f64:
57026 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
57027 // AVX types.
57028 case MVT::v16f16:
57029 if (!Subtarget.hasFP16())
57030 break;
57031 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
57032 case MVT::v16bf16:
57033 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
57034 break;
57035 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
57036 case MVT::v32i8:
57037 case MVT::v16i16:
57038 case MVT::v8i32:
57039 case MVT::v4i64:
57040 case MVT::v8f32:
57041 case MVT::v4f64:
57042 if (Subtarget.hasAVX())
57043 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
57044 break;
57045 case MVT::v32f16:
57046 if (!Subtarget.hasFP16())
57047 break;
57048 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
57049 case MVT::v32bf16:
57050 if (!Subtarget.hasBF16())
57051 break;
57052 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
57053 case MVT::v64i8:
57054 case MVT::v32i16:
57055 case MVT::v8f64:
57056 case MVT::v16f32:
57057 case MVT::v16i32:
57058 case MVT::v8i64:
57059 if (Subtarget.hasAVX512())
57060 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
57061 break;
57063 break;
57064 case 'k':
57065 // This register class doesn't allocate k0 for masked vector operations.
57066 if (Subtarget.hasAVX512()) {
57067 if (VT == MVT::i1)
57068 return std::make_pair(0U, &X86::VK1WMRegClass);
57069 if (VT == MVT::i8)
57070 return std::make_pair(0U, &X86::VK8WMRegClass);
57071 if (VT == MVT::i16)
57072 return std::make_pair(0U, &X86::VK16WMRegClass);
57074 if (Subtarget.hasBWI()) {
57075 if (VT == MVT::i32)
57076 return std::make_pair(0U, &X86::VK32WMRegClass);
57077 if (VT == MVT::i64)
57078 return std::make_pair(0U, &X86::VK64WMRegClass);
57080 break;
57084 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
57085 return std::make_pair(0U, &X86::GR32RegClass);
57087 // Use the default implementation in TargetLowering to convert the register
57088 // constraint into a member of a register class.
57089 std::pair<Register, const TargetRegisterClass*> Res;
57090 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
57092 // Not found as a standard register?
57093 if (!Res.second) {
57094 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
57095 // to/from f80.
57096 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
57097 // Map st(0) -> st(7) -> ST0
57098 if (Constraint.size() == 7 && Constraint[0] == '{' &&
57099 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
57100 Constraint[3] == '(' &&
57101 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
57102 Constraint[5] == ')' && Constraint[6] == '}') {
57103 // st(7) is not allocatable and thus not a member of RFP80. Return
57104 // singleton class in cases where we have a reference to it.
57105 if (Constraint[4] == '7')
57106 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
57107 return std::make_pair(X86::FP0 + Constraint[4] - '0',
57108 &X86::RFP80RegClass);
57111 // GCC allows "st(0)" to be called just plain "st".
57112 if (StringRef("{st}").equals_insensitive(Constraint))
57113 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
57116 // flags -> EFLAGS
57117 if (StringRef("{flags}").equals_insensitive(Constraint))
57118 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
57120 // dirflag -> DF
57121 // Only allow for clobber.
57122 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
57123 VT == MVT::Other)
57124 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
57126 // fpsr -> FPSW
57127 if (StringRef("{fpsr}").equals_insensitive(Constraint))
57128 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
57130 return Res;
57133 // Make sure it isn't a register that requires 64-bit mode.
57134 if (!Subtarget.is64Bit() &&
57135 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
57136 TRI->getEncodingValue(Res.first) >= 8) {
57137 // Register requires REX prefix, but we're in 32-bit mode.
57138 return std::make_pair(0, nullptr);
57141 // Make sure it isn't a register that requires AVX512.
57142 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
57143 TRI->getEncodingValue(Res.first) & 0x10) {
57144 // Register requires EVEX prefix.
57145 return std::make_pair(0, nullptr);
57148 // Otherwise, check to see if this is a register class of the wrong value
57149 // type. For example, we want to map "{ax},i32" -> {eax}; we don't want it to
57150 // turn into {ax},{dx}.
57151 // MVT::Other is used to specify clobber names.
57152 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
57153 return Res; // Correct type already, nothing to do.
57155 // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
57156 // return "eax". This should even work for things like getting 64-bit integer
57157 // registers when given an f64 type.
57158 const TargetRegisterClass *Class = Res.second;
57159 // The generic code will match the first register class that contains the
57160 // given register. Thus, based on the ordering of the tablegened file,
57161 // the "plain" GR classes might not come first.
57162 // Therefore, use a helper method.
57163 if (isGRClass(*Class)) {
57164 unsigned Size = VT.getSizeInBits();
57165 if (Size == 1) Size = 8;
57166 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
57167 return std::make_pair(0, nullptr);
57168 Register DestReg = getX86SubSuperRegister(Res.first, Size);
57169 if (DestReg.isValid()) {
57170 bool is64Bit = Subtarget.is64Bit();
57171 const TargetRegisterClass *RC =
57172 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
57173 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
57174 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
57175 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
57176 if (Size == 64 && !is64Bit) {
57177 // Model GCC's behavior here and select a fixed pair of 32-bit
57178 // registers.
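// For example (sketch), an i64 operand constrained to "{eax}" (or "{rax}")
// in 32-bit mode comes back as EAX in the GR32_AD class, the same class used
// for the 'A' ([ER]AX + [ER]DX) constraint handled earlier, mirroring the
// register pair GCC would pick.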
57179 switch (DestReg) {
57180 case X86::RAX:
57181 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
57182 case X86::RDX:
57183 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
57184 case X86::RCX:
57185 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
57186 case X86::RBX:
57187 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
57188 case X86::RSI:
57189 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
57190 case X86::RDI:
57191 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
57192 case X86::RBP:
57193 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
57194 default:
57195 return std::make_pair(0, nullptr);
57198 if (RC && RC->contains(DestReg))
57199 return std::make_pair(DestReg, RC);
57200 return Res;
57202 // No register found/type mismatch.
57203 return std::make_pair(0, nullptr);
57204 } else if (isFRClass(*Class)) {
57205 // Handle references to XMM physical registers that got mapped into the
57206 // wrong class. This can happen with constraints like {xmm0} where the
57207 // target independent register mapper will just pick the first match it can
57208 // find, ignoring the required type.
57210 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
57211 if (VT == MVT::f16)
57212 Res.second = &X86::FR16XRegClass;
57213 else if (VT == MVT::f32 || VT == MVT::i32)
57214 Res.second = &X86::FR32XRegClass;
57215 else if (VT == MVT::f64 || VT == MVT::i64)
57216 Res.second = &X86::FR64XRegClass;
57217 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
57218 Res.second = &X86::VR128XRegClass;
57219 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
57220 Res.second = &X86::VR256XRegClass;
57221 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
57222 Res.second = &X86::VR512RegClass;
57223 else {
57224 // Type mismatch and not a clobber: return an error.
57225 Res.first = 0;
57226 Res.second = nullptr;
57228 } else if (isVKClass(*Class)) {
57229 if (VT == MVT::i1)
57230 Res.second = &X86::VK1RegClass;
57231 else if (VT == MVT::i8)
57232 Res.second = &X86::VK8RegClass;
57233 else if (VT == MVT::i16)
57234 Res.second = &X86::VK16RegClass;
57235 else if (VT == MVT::i32)
57236 Res.second = &X86::VK32RegClass;
57237 else if (VT == MVT::i64)
57238 Res.second = &X86::VK64RegClass;
57239 else {
57240 // Type mismatch and not a clobber: return an error.
57241 Res.first = 0;
57242 Res.second = nullptr;
57246 return Res;
57249 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
57250 // Integer division on x86 is expensive. However, when aggressively optimizing
57251 // for code size, we prefer to use a div instruction, as it is usually smaller
57252 // than the alternative sequence.
57253 // The exception to this is vector division. Since x86 doesn't have vector
57254 // integer division, leaving the division as-is is a loss even in terms of
57255 // size, because it will have to be scalarized, while the alternative code
57256 // sequence can be performed in vector form.
57257 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
57258 return OptSize && !VT.isVector();
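// For example, at minsize a scalar 'x / 10' is left as a real division
// (a short idiv plus setup) instead of the longer multiply-by-magic-constant
// expansion, while a vector division is still expanded since it would
// otherwise have to be scalarized.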
57261 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
57262 if (!Subtarget.is64Bit())
57263 return;
57265 // Update IsSplitCSR in X86MachineFunctionInfo.
57266 X86MachineFunctionInfo *AFI =
57267 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
57268 AFI->setIsSplitCSR(true);
57271 void X86TargetLowering::insertCopiesSplitCSR(
57272 MachineBasicBlock *Entry,
57273 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
57274 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
57275 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
57276 if (!IStart)
57277 return;
57279 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
57280 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
57281 MachineBasicBlock::iterator MBBI = Entry->begin();
57282 for (const MCPhysReg *I = IStart; *I; ++I) {
57283 const TargetRegisterClass *RC = nullptr;
57284 if (X86::GR64RegClass.contains(*I))
57285 RC = &X86::GR64RegClass;
57286 else
57287 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
57289 Register NewVR = MRI->createVirtualRegister(RC);
57290 // Create copy from CSR to a virtual register.
57291 // FIXME: this currently does not emit CFI pseudo-instructions, it works
57292 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
57293 // nounwind. If we want to generalize this later, we may need to emit
57294 // CFI pseudo-instructions.
57295 assert(
57296 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
57297 "Function should be nounwind in insertCopiesSplitCSR!");
57298 Entry->addLiveIn(*I);
57299 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
57300 .addReg(*I);
57302 // Insert the copy-back instructions right before the terminator.
57303 for (auto *Exit : Exits)
57304 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
57305 TII->get(TargetOpcode::COPY), *I)
57306 .addReg(NewVR);
57310 bool X86TargetLowering::supportSwiftError() const {
57311 return Subtarget.is64Bit();
57314 MachineInstr *
57315 X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
57316 MachineBasicBlock::instr_iterator &MBBI,
57317 const TargetInstrInfo *TII) const {
57318 assert(MBBI->isCall() && MBBI->getCFIType() &&
57319 "Invalid call instruction for a KCFI check");
57321 MachineFunction &MF = *MBB.getParent();
57322 // If the call target is a memory operand, unfold it and use R11 for the
57323 // call, so KCFI_CHECK won't have to recompute the address.
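// For example (sketch), a memory-operand call such as
//   callq *8(%rdi)
// is unfolded here into roughly
//   movq 8(%rdi), %r11
//   callq *%r11
// so the KCFI_CHECK emitted below can validate the target held in %r11.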
57324 switch (MBBI->getOpcode()) {
57325 case X86::CALL64m:
57326 case X86::CALL64m_NT:
57327 case X86::TAILJMPm64:
57328 case X86::TAILJMPm64_REX: {
57329 MachineBasicBlock::instr_iterator OrigCall = MBBI;
57330 SmallVector<MachineInstr *, 2> NewMIs;
57331 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
57332 /*UnfoldStore=*/false, NewMIs))
57333 report_fatal_error("Failed to unfold memory operand for a KCFI check");
57334 for (auto *NewMI : NewMIs)
57335 MBBI = MBB.insert(OrigCall, NewMI);
57336 assert(MBBI->isCall() &&
57337 "Unexpected instruction after memory operand unfolding");
57338 if (OrigCall->shouldUpdateCallSiteInfo())
57339 MF.moveCallSiteInfo(&*OrigCall, &*MBBI);
57340 MBBI->setCFIType(MF, OrigCall->getCFIType());
57341 OrigCall->eraseFromParent();
57342 break;
57344 default:
57345 break;
57348 MachineOperand &Target = MBBI->getOperand(0);
57349 Register TargetReg;
57350 switch (MBBI->getOpcode()) {
57351 case X86::CALL64r:
57352 case X86::CALL64r_NT:
57353 case X86::TAILJMPr64:
57354 case X86::TAILJMPr64_REX:
57355 assert(Target.isReg() && "Unexpected target operand for an indirect call");
57356 Target.setIsRenamable(false);
57357 TargetReg = Target.getReg();
57358 break;
57359 case X86::CALL64pcrel32:
57360 case X86::TAILJMPd64:
57361 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
57362 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
57363 // 64-bit indirect thunk calls.
57364 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
57365 "Unexpected register for an indirect thunk call");
57366 TargetReg = X86::R11;
57367 break;
57368 default:
57369 llvm_unreachable("Unexpected CFI call opcode");
57370 break;
57373 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
57374 .addReg(TargetReg)
57375 .addImm(MBBI->getCFIType())
57376 .getInstr();
57379 /// Returns true if stack probing through a function call is requested.
57380 bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
57381 return !getStackProbeSymbolName(MF).empty();
57384 /// Returns true if stack probing through inline assembly is requested.
57385 bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
57387 // No inline stack probe for Windows; they have their own mechanism.
57388 if (Subtarget.isOSWindows() ||
57389 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
57390 return false;
57392 // If the function specifically requests inline stack probes, emit them.
57393 if (MF.getFunction().hasFnAttribute("probe-stack"))
57394 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
57395 "inline-asm";
57397 return false;
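// For example (sketch), a function carrying the IR attribute
//   attributes #0 = { "probe-stack"="inline-asm" }
// answers true here and gets inline probes, while a value such as
// "probe-stack"="__chkstk" (or no attribute at all) falls through to the
// call-based probing path below.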
57400 /// Returns the name of the symbol used to emit stack probes or the empty
57401 /// string if not applicable.
57402 StringRef
57403 X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
57404 // Inline stack probes disable the stack probe call.
57405 if (hasInlineStackProbe(MF))
57406 return "";
57408 // If the function specifically requests stack probes, emit them.
57409 if (MF.getFunction().hasFnAttribute("probe-stack"))
57410 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
57412 // Generally, if we aren't on Windows, the platform ABI does not include
57413 // support for stack probes, so don't emit them.
57414 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
57415 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
57416 return "";
57418 // We need a stack probe to conform to the Windows ABI. Choose the right
57419 // symbol.
57420 if (Subtarget.is64Bit())
57421 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
57422 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
57425 unsigned
57426 X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
57427 // The default stack probe size is 4096 if the function has no "stack-probe-size"
57428 // attribute.
57429 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
57430 4096);
57433 Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
57434 if (ML->isInnermost() &&
57435 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
57436 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
57437 return TargetLowering::getPrefLoopAlignment();
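// For example, setting ExperimentalPrefInnermostLoopAlignment to 5 on the
// llc command line makes innermost loops request Align(1 << 5), i.e. 32-byte
// alignment, while outer loops keep the target's default preference.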