1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines a DAG pattern matching instruction selector for X86,
10 // converting from a legalized DAG to an X86 DAG.
12 //===----------------------------------------------------------------------===//
14 #include "X86ISelDAGToDAG.h"
15 #include "X86.h"
16 #include "X86MachineFunctionInfo.h"
17 #include "X86Subtarget.h"
18 #include "X86TargetMachine.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/CodeGen/MachineModuleInfo.h"
21 #include "llvm/CodeGen/SelectionDAGISel.h"
22 #include "llvm/Config/llvm-config.h"
23 #include "llvm/IR/ConstantRange.h"
24 #include "llvm/IR/Function.h"
25 #include "llvm/IR/Instructions.h"
26 #include "llvm/IR/Intrinsics.h"
27 #include "llvm/IR/IntrinsicsX86.h"
28 #include "llvm/IR/Module.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/Debug.h"
31 #include "llvm/Support/ErrorHandling.h"
32 #include "llvm/Support/KnownBits.h"
33 #include "llvm/Support/MathExtras.h"
34 #include <cstdint>
36 using namespace llvm;
38 #define DEBUG_TYPE "x86-isel"
39 #define PASS_NAME "X86 DAG->DAG Instruction Selection"
41 STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
43 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
44 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
45 cl::Hidden);
47 static cl::opt<bool> EnablePromoteAnyextLoad(
48 "x86-promote-anyext-load", cl::init(true),
49 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
51 extern cl::opt<bool> IndirectBranchTracking;
53 //===----------------------------------------------------------------------===//
54 // Pattern Matcher Implementation
55 //===----------------------------------------------------------------------===//
57 namespace {
58 /// This corresponds to X86AddressMode, but uses SDValue's instead of register
59 /// numbers for the leaves of the matched tree.
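/// (Illustrative, hypothetical values: the x86 memory operand
///   gs:[rbx + 4*rcx + 16]
/// would be captured as Base_Reg = RBX, Scale = 4, IndexReg = RCX, Disp = 16
/// and Segment = GS, with the remaining leaves left at their defaults.)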
60 struct X86ISelAddressMode {
61 enum {
62 RegBase,
63 FrameIndexBase
64 } BaseType = RegBase;
66 // This is really a union, discriminated by BaseType!
67 SDValue Base_Reg;
68 int Base_FrameIndex = 0;
70 unsigned Scale = 1;
71 SDValue IndexReg;
72 int32_t Disp = 0;
73 SDValue Segment;
74 const GlobalValue *GV = nullptr;
75 const Constant *CP = nullptr;
76 const BlockAddress *BlockAddr = nullptr;
77 const char *ES = nullptr;
78 MCSymbol *MCSym = nullptr;
79 int JT = -1;
80 Align Alignment; // CP alignment.
81 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
82 bool NegateIndex = false;
84 X86ISelAddressMode() = default;
86 bool hasSymbolicDisplacement() const {
87 return GV != nullptr || CP != nullptr || ES != nullptr ||
88 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
91 bool hasBaseOrIndexReg() const {
92 return BaseType == FrameIndexBase ||
93 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
96 /// Return true if this addressing mode is already RIP-relative.
97 bool isRIPRelative() const {
98 if (BaseType != RegBase) return false;
99 if (RegisterSDNode *RegNode =
100 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
101 return RegNode->getReg() == X86::RIP;
102 return false;
105 void setBaseReg(SDValue Reg) {
106 BaseType = RegBase;
107 Base_Reg = Reg;
110 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
111 void dump(SelectionDAG *DAG = nullptr) {
112 dbgs() << "X86ISelAddressMode " << this << '\n';
113 dbgs() << "Base_Reg ";
114 if (Base_Reg.getNode())
115 Base_Reg.getNode()->dump(DAG);
116 else
117 dbgs() << "nul\n";
118 if (BaseType == FrameIndexBase)
119 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
120 dbgs() << " Scale " << Scale << '\n'
121 << "IndexReg ";
122 if (NegateIndex)
123 dbgs() << "negate ";
124 if (IndexReg.getNode())
125 IndexReg.getNode()->dump(DAG);
126 else
127 dbgs() << "nul\n";
128 dbgs() << " Disp " << Disp << '\n'
129 << "GV ";
130 if (GV)
131 GV->dump();
132 else
133 dbgs() << "nul";
134 dbgs() << " CP ";
135 if (CP)
136 CP->dump();
137 else
138 dbgs() << "nul";
139 dbgs() << '\n'
140 << "ES ";
141 if (ES)
142 dbgs() << ES;
143 else
144 dbgs() << "nul";
145 dbgs() << " MCSym ";
146 if (MCSym)
147 dbgs() << MCSym;
148 else
149 dbgs() << "nul";
150 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
152 #endif
156 namespace {
157 //===--------------------------------------------------------------------===//
158 /// ISel - X86-specific code to select X86 machine instructions for
159 /// SelectionDAG operations.
161 class X86DAGToDAGISel final : public SelectionDAGISel {
162 /// Keep a pointer to the X86Subtarget around so that we can
163 /// make the right decision when generating code for different targets.
164 const X86Subtarget *Subtarget;
166 /// If true, selector should try to optimize for minimum code size.
167 bool OptForMinSize;
169 /// Disable direct TLS access through segment registers.
170 bool IndirectTlsSegRefs;
172 public:
173 X86DAGToDAGISel() = delete;
175 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
176 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
177 OptForMinSize(false), IndirectTlsSegRefs(false) {}
179 bool runOnMachineFunction(MachineFunction &MF) override {
180 // Reset the subtarget each time through.
181 Subtarget = &MF.getSubtarget<X86Subtarget>();
182 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
183 "indirect-tls-seg-refs");
185 // OptFor[Min]Size are used in pattern predicates that isel is matching.
186 OptForMinSize = MF.getFunction().hasMinSize();
187 assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
188 "OptForMinSize implies OptForSize");
189 return SelectionDAGISel::runOnMachineFunction(MF);
192 void emitFunctionEntryCode() override;
194 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
196 void PreprocessISelDAG() override;
197 void PostprocessISelDAG() override;
199 // Include the pieces autogenerated from the target description.
200 #include "X86GenDAGISel.inc"
202 private:
203 void Select(SDNode *N) override;
205 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
206 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
207 bool AllowSegmentRegForX32 = false);
208 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
209 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
210 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
211 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
212 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
213 unsigned Depth);
214 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
215 unsigned Depth);
216 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
217 unsigned Depth);
218 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
219 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
220 SDValue &Scale, SDValue &Index, SDValue &Disp,
221 SDValue &Segment);
222 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
223 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
224 SDValue &Index, SDValue &Disp, SDValue &Segment);
225 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
226 bool selectLEAAddr(SDValue N, SDValue &Base,
227 SDValue &Scale, SDValue &Index, SDValue &Disp,
228 SDValue &Segment);
229 bool selectLEA64_32Addr(SDValue N, SDValue &Base,
230 SDValue &Scale, SDValue &Index, SDValue &Disp,
231 SDValue &Segment);
232 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
233 SDValue &Scale, SDValue &Index, SDValue &Disp,
234 SDValue &Segment);
235 bool selectRelocImm(SDValue N, SDValue &Op);
237 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
238 SDValue &Base, SDValue &Scale,
239 SDValue &Index, SDValue &Disp,
240 SDValue &Segment);
242 // Convenience method where P is also root.
243 bool tryFoldLoad(SDNode *P, SDValue N,
244 SDValue &Base, SDValue &Scale,
245 SDValue &Index, SDValue &Disp,
246 SDValue &Segment) {
247 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
250 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
251 SDValue &Base, SDValue &Scale,
252 SDValue &Index, SDValue &Disp,
253 SDValue &Segment);
255 bool isProfitableToFormMaskedOp(SDNode *N) const;
257 /// Implement addressing mode selection for inline asm expressions.
258 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
259 InlineAsm::ConstraintCode ConstraintID,
260 std::vector<SDValue> &OutOps) override;
262 void emitSpecialCodeForMain();
264 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
265 MVT VT, SDValue &Base, SDValue &Scale,
266 SDValue &Index, SDValue &Disp,
267 SDValue &Segment) {
268 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
269 Base = CurDAG->getTargetFrameIndex(
270 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
271 else if (AM.Base_Reg.getNode())
272 Base = AM.Base_Reg;
273 else
274 Base = CurDAG->getRegister(0, VT);
276 Scale = getI8Imm(AM.Scale, DL);
278 #define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
279 // Negate the index if needed.
280 if (AM.NegateIndex) {
281 unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
282 : GET_ND_IF_ENABLED(X86::NEG32r);
283 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
284 AM.IndexReg), 0);
285 AM.IndexReg = Neg;
288 if (AM.IndexReg.getNode())
289 Index = AM.IndexReg;
290 else
291 Index = CurDAG->getRegister(0, VT);
293 // These are 32-bit even in 64-bit mode since RIP-relative offset
294 // is 32-bit.
295 if (AM.GV)
296 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
297 MVT::i32, AM.Disp,
298 AM.SymbolFlags);
299 else if (AM.CP)
300 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
301 AM.Disp, AM.SymbolFlags);
302 else if (AM.ES) {
303 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
304 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
305 } else if (AM.MCSym) {
306 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
307 assert(AM.SymbolFlags == 0 && "oo");
308 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
309 } else if (AM.JT != -1) {
310 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
311 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
312 } else if (AM.BlockAddr)
313 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
314 AM.SymbolFlags);
315 else
316 Disp = CurDAG->getSignedTargetConstant(AM.Disp, DL, MVT::i32);
318 if (AM.Segment.getNode())
319 Segment = AM.Segment;
320 else
321 Segment = CurDAG->getRegister(0, MVT::i16);
324 // Utility function to determine whether N is an AMX SDNode right after
325 // lowering but before ISEL.
326 bool isAMXSDNode(SDNode *N) const {
327 // Check if N is AMX SDNode:
328 // 1. check specific opcode since these carry MVT::Untyped instead of
329 // x86amx_type;
330 // 2. check result type;
331 // 3. check operand type;
332 switch (N->getOpcode()) {
333 default:
334 break;
335 case X86::PT2RPNTLVWZ0V:
336 case X86::PT2RPNTLVWZ0T1V:
337 case X86::PT2RPNTLVWZ1V:
338 case X86::PT2RPNTLVWZ1T1V:
339 case X86::PT2RPNTLVWZ0RSV:
340 case X86::PT2RPNTLVWZ0RST1V:
341 case X86::PT2RPNTLVWZ1RSV:
342 case X86::PT2RPNTLVWZ1RST1V:
343 return true;
345 for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
346 if (N->getValueType(Idx) == MVT::x86amx)
347 return true;
349 for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
350 SDValue Op = N->getOperand(Idx);
351 if (Op.getValueType() == MVT::x86amx)
352 return true;
354 return false;
357 // Utility function to determine whether we should avoid selecting
358 // immediate forms of instructions for better code size or not.
359 // At a high level, we'd like to avoid such instructions when
360 // we have similar constants used within the same basic block
361 // that can be kept in a register.
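// (Illustrative, assuming minsize: if the same 32-bit constant feeds several
// ALU instructions in one block, materializing it once in a register and
// reusing it avoids re-encoding a 4-byte immediate in every instruction.)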
363 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
364 uint32_t UseCount = 0;
366 // Do not want to hoist if we're not optimizing for size.
367 // TODO: We'd like to remove this restriction.
368 // See the comment in X86InstrInfo.td for more info.
369 if (!CurDAG->shouldOptForSize())
370 return false;
372 // Walk all the users of the immediate.
373 for (const SDNode *User : N->users()) {
374 if (UseCount >= 2)
375 break;
377 // This user is already selected. Count it as a legitimate use and
378 // move on.
379 if (User->isMachineOpcode()) {
380 UseCount++;
381 continue;
384 // We want to count stores of immediates as real uses.
385 if (User->getOpcode() == ISD::STORE &&
386 User->getOperand(1).getNode() == N) {
387 UseCount++;
388 continue;
391 // We don't currently match users that have > 2 operands (except
392 // for stores, which are handled above).
393 // Those instructions won't match in ISEL, for now, and would
394 // be counted incorrectly.
395 // This may change in the future as we add additional instruction
396 // types.
397 if (User->getNumOperands() != 2)
398 continue;
400 // If this is a sign-extended 8-bit integer immediate used in an ALU
401 // instruction, there is probably an opcode encoding to save space.
402 auto *C = dyn_cast<ConstantSDNode>(N);
403 if (C && isInt<8>(C->getSExtValue()))
404 continue;
406 // Immediates that are used for offsets as part of stack
407 // manipulation should be left alone. These are typically
408 // used to indicate SP offsets for argument passing and
409 // will get pulled into stores/pushes (implicitly).
410 if (User->getOpcode() == X86ISD::ADD ||
411 User->getOpcode() == ISD::ADD ||
412 User->getOpcode() == X86ISD::SUB ||
413 User->getOpcode() == ISD::SUB) {
415 // Find the other operand of the add/sub.
416 SDValue OtherOp = User->getOperand(0);
417 if (OtherOp.getNode() == N)
418 OtherOp = User->getOperand(1);
420 // Don't count if the other operand is SP.
421 RegisterSDNode *RegNode;
422 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
423 (RegNode = dyn_cast_or_null<RegisterSDNode>(
424 OtherOp->getOperand(1).getNode())))
425 if ((RegNode->getReg() == X86::ESP) ||
426 (RegNode->getReg() == X86::RSP))
427 continue;
430 // ... otherwise, count this and move on.
431 UseCount++;
434 // If we have more than 1 use, then recommend for hoisting.
435 return (UseCount > 1);
438 /// Return a target constant with the specified value of type i8.
439 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
440 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
443 /// Return a target constant with the specified value, of type i32.
444 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
445 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
448 /// Return a target constant with the specified value, of type i64.
449 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
450 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
453 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
454 const SDLoc &DL) {
455 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
456 uint64_t Index = N->getConstantOperandVal(1);
457 MVT VecVT = N->getOperand(0).getSimpleValueType();
458 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
461 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
462 const SDLoc &DL) {
463 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
464 uint64_t Index = N->getConstantOperandVal(2);
465 MVT VecVT = N->getSimpleValueType(0);
466 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
469 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
470 const SDLoc &DL) {
471 assert(VecWidth == 128 && "Unexpected vector width");
472 uint64_t Index = N->getConstantOperandVal(2);
473 MVT VecVT = N->getSimpleValueType(0);
474 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
475 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
476 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
477 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
478 return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
481 SDValue getSBBZero(SDNode *N) {
482 SDLoc dl(N);
483 MVT VT = N->getSimpleValueType(0);
485 // Create zero.
486 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
487 SDValue Zero =
488 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
489 if (VT == MVT::i64) {
490 Zero = SDValue(
491 CurDAG->getMachineNode(
492 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
493 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
494 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
498 // Copy flags to the EFLAGS register and glue it to next node.
499 unsigned Opcode = N->getOpcode();
500 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
501 "Unexpected opcode for SBB materialization");
502 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
503 SDValue EFLAGS =
504 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
505 N->getOperand(FlagOpIndex), SDValue());
507 // Create a 64-bit instruction if the result is 64 bits; otherwise use the
508 // 32-bit version.
509 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
510 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
511 VTs = CurDAG->getVTList(SBBVT, MVT::i32);
512 return SDValue(
513 CurDAG->getMachineNode(Opc, dl, VTs,
514 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
518 // Helper to detect unneeded 'and' instructions on shift amounts. Called
519 // from PatFrags in tablegen.
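// (Illustrative: for a 64-bit shift only the low 6 bits of the amount are
// used, so with Width == 6 a pattern like (srl X, (and Y, 63)) can drop the
// 'and' -- 63 has countr_one() == 6 >= Width.)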
520 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
521 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
522 const APInt &Val = N->getConstantOperandAPInt(1);
524 if (Val.countr_one() >= Width)
525 return true;
527 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
528 return Mask.countr_one() >= Width;
531 /// Return an SDNode that returns the value of the global base register.
532 /// Output instructions required to initialize the global base register,
533 /// if necessary.
534 SDNode *getGlobalBaseReg();
536 /// Return a reference to the TargetMachine, casted to the target-specific
537 /// type.
538 const X86TargetMachine &getTargetMachine() const {
539 return static_cast<const X86TargetMachine &>(TM);
542 /// Return a reference to the TargetInstrInfo, casted to the target-specific
543 /// type.
544 const X86InstrInfo *getInstrInfo() const {
545 return Subtarget->getInstrInfo();
548 /// Return the condition code of the given SDNode.
549 X86::CondCode getCondFromNode(SDNode *N) const;
551 /// Address-mode matching performs shift-of-and to and-of-shift
552 /// reassociation in order to expose more scaled addressing
553 /// opportunities.
554 bool ComplexPatternFuncMutatesDAG() const override {
555 return true;
558 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
560 // Indicates we should prefer to use a non-temporal load for this load.
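// (For reference: the size checks below line up with non-temporal load
// availability -- MOVNTDQA needs SSE4.1 for 128-bit, VMOVNTDQA needs AVX2
// for 256-bit, and 512-bit loads need AVX-512.)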
561 bool useNonTemporalLoad(LoadSDNode *N) const {
562 if (!N->isNonTemporal())
563 return false;
565 unsigned StoreSize = N->getMemoryVT().getStoreSize();
567 if (N->getAlign().value() < StoreSize)
568 return false;
570 switch (StoreSize) {
571 default: llvm_unreachable("Unsupported store size");
572 case 4:
573 case 8:
574 return false;
575 case 16:
576 return Subtarget->hasSSE41();
577 case 32:
578 return Subtarget->hasAVX2();
579 case 64:
580 return Subtarget->hasAVX512();
584 bool foldLoadStoreIntoMemOperand(SDNode *Node);
585 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
586 bool matchBitExtract(SDNode *Node);
587 bool shrinkAndImmediate(SDNode *N);
588 bool isMaskZeroExtended(SDNode *N) const;
589 bool tryShiftAmountMod(SDNode *N);
590 bool tryShrinkShlLogicImm(SDNode *N);
591 bool tryVPTERNLOG(SDNode *N);
592 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
593 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
594 uint8_t Imm);
595 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
596 bool tryMatchBitSelect(SDNode *N);
598 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
599 const SDLoc &dl, MVT VT, SDNode *Node);
600 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
601 const SDLoc &dl, MVT VT, SDNode *Node,
602 SDValue &InGlue);
604 bool tryOptimizeRem8Extend(SDNode *N);
606 bool onlyUsesZeroFlag(SDValue Flags) const;
607 bool hasNoSignFlagUses(SDValue Flags) const;
608 bool hasNoCarryFlagUses(SDValue Flags) const;
611 class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
612 public:
613 static char ID;
614 explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
615 CodeGenOptLevel OptLevel)
616 : SelectionDAGISelLegacy(
617 ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}
621 char X86DAGToDAGISelLegacy::ID = 0;
623 INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
625 // Returns true if this masked compare can be implemented legally with this
626 // type.
627 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
628 unsigned Opcode = N->getOpcode();
629 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
630 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
631 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
632 // We can get 256-bit 8 element types here without VLX being enabled. When
633 // this happens we will use 512-bit operations and the mask will not be
634 // zero extended.
635 EVT OpVT = N->getOperand(0).getValueType();
636 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
637 // second operand.
638 if (Opcode == X86ISD::STRICT_CMPM)
639 OpVT = N->getOperand(1).getValueType();
640 if (OpVT.is256BitVector() || OpVT.is128BitVector())
641 return Subtarget->hasVLX();
643 return true;
645 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
646 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
647 Opcode == X86ISD::FSETCCM_SAE)
648 return true;
650 return false;
653 // Returns true if we can assume the writer of the mask has zero extended it
654 // for us.
655 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
656 // If this is an AND, check if we have a compare on either side. As long as
657 // one side guarantees the mask is zero extended, the AND will preserve those
658 // zeros.
659 if (N->getOpcode() == ISD::AND)
660 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
661 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
663 return isLegalMaskCompare(N, Subtarget);
666 bool
667 X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
668 if (OptLevel == CodeGenOptLevel::None)
669 return false;
671 if (!N.hasOneUse())
672 return false;
674 if (N.getOpcode() != ISD::LOAD)
675 return true;
677 // Don't fold non-temporal loads if we have an instruction for them.
678 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
679 return false;
681 // If N is a load, do additional profitability checks.
682 if (U == Root) {
683 switch (U->getOpcode()) {
684 default: break;
685 case X86ISD::ADD:
686 case X86ISD::ADC:
687 case X86ISD::SUB:
688 case X86ISD::SBB:
689 case X86ISD::AND:
690 case X86ISD::XOR:
691 case X86ISD::OR:
692 case ISD::ADD:
693 case ISD::UADDO_CARRY:
694 case ISD::AND:
695 case ISD::OR:
696 case ISD::XOR: {
697 SDValue Op1 = U->getOperand(1);
699 // If the other operand is an 8-bit immediate we should fold the immediate
700 // instead. This reduces code size.
701 // e.g.
702 // movl 4(%esp), %eax
703 // addl $4, %eax
704 // vs.
705 // movl $4, %eax
706 // addl 4(%esp), %eax
707 // The former is 2 bytes shorter. In the case where the increment is 1,
708 // the saving can be 4 bytes (by using incl %eax).
709 if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
710 if (Imm->getAPIntValue().isSignedIntN(8))
711 return false;
713 // If this is a 64-bit AND with an immediate that fits in 32-bits,
714 // prefer using the smaller and over folding the load. This is needed to
715 // make sure immediates created by shrinkAndImmediate are always folded.
716 // Ideally we would narrow the load during DAG combine and get the
717 // best of both worlds.
718 if (U->getOpcode() == ISD::AND &&
719 Imm->getAPIntValue().getBitWidth() == 64 &&
720 Imm->getAPIntValue().isIntN(32))
721 return false;
723 // If this is really a zext_inreg that can be represented with a movzx
724 // instruction, prefer that.
725 // TODO: We could shrink the load and fold if it is non-volatile.
726 if (U->getOpcode() == ISD::AND &&
727 (Imm->getAPIntValue() == UINT8_MAX ||
728 Imm->getAPIntValue() == UINT16_MAX ||
729 Imm->getAPIntValue() == UINT32_MAX))
730 return false;
732 // For ADD/SUB we can negate the immediate and use the opposite operation
733 // to fit 128 into a sign-extended 8-bit immediate.
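// (Illustrative: "add eax, 128" needs an imm32 encoding, but "sub eax, -128"
// fits a sign-extended imm8, so we prefer keeping the immediate over folding
// the load.)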
734 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
735 (-Imm->getAPIntValue()).isSignedIntN(8))
736 return false;
738 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
739 (-Imm->getAPIntValue()).isSignedIntN(8) &&
740 hasNoCarryFlagUses(SDValue(U, 1)))
741 return false;
744 // If the other operand is a TLS address, we should fold it instead.
745 // This produces
746 // movl %gs:0, %eax
747 // leal i@NTPOFF(%eax), %eax
748 // instead of
749 // movl $i@NTPOFF, %eax
750 // addl %gs:0, %eax
751 // if the block also has an access to a second TLS address this will save
752 // a load.
753 // FIXME: This is probably also true for non-TLS addresses.
754 if (Op1.getOpcode() == X86ISD::Wrapper) {
755 SDValue Val = Op1.getOperand(0);
756 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
757 return false;
760 // Don't fold load if this matches the BTS/BTR/BTC patterns.
761 // BTS: (or X, (shl 1, n))
762 // BTR: (and X, (rotl -2, n))
763 // BTC: (xor X, (shl 1, n))
764 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
765 if (U->getOperand(0).getOpcode() == ISD::SHL &&
766 isOneConstant(U->getOperand(0).getOperand(0)))
767 return false;
769 if (U->getOperand(1).getOpcode() == ISD::SHL &&
770 isOneConstant(U->getOperand(1).getOperand(0)))
771 return false;
773 if (U->getOpcode() == ISD::AND) {
774 SDValue U0 = U->getOperand(0);
775 SDValue U1 = U->getOperand(1);
776 if (U0.getOpcode() == ISD::ROTL) {
777 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
778 if (C && C->getSExtValue() == -2)
779 return false;
782 if (U1.getOpcode() == ISD::ROTL) {
783 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
784 if (C && C->getSExtValue() == -2)
785 return false;
789 break;
791 case ISD::SHL:
792 case ISD::SRA:
793 case ISD::SRL:
794 // Don't fold a load into a shift by immediate. The BMI2 instructions
795 // support folding a load, but not an immediate. The legacy instructions
796 // support folding an immediate, but can't fold a load. Folding an
797 // immediate is preferable to folding a load.
798 if (isa<ConstantSDNode>(U->getOperand(1)))
799 return false;
801 break;
805 // Prevent folding a load if this can be implemented with an insert_subreg or
806 // a move that implicitly zeroes.
807 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
808 isNullConstant(Root->getOperand(2)) &&
809 (Root->getOperand(0).isUndef() ||
810 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
811 return false;
813 return true;
816 // Indicates it is profitable to form an AVX512 masked operation. Returning
817 // false will favor a masked register-register move or vblendm, and the
818 // operation will be selected separately.
819 bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
820 assert(
821 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
822 "Unexpected opcode!");
824 // If the operation has additional users, the operation will be duplicated.
825 // Check the use count to prevent that.
826 // FIXME: Are there cheap opcodes we might want to duplicate?
827 return N->getOperand(1).hasOneUse();
830 /// Replace the original chain operand of the call with the
831 /// load's chain operand and move the load below the call's chain operand.
832 static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
833 SDValue Call, SDValue OrigChain) {
834 SmallVector<SDValue, 8> Ops;
835 SDValue Chain = OrigChain.getOperand(0);
836 if (Chain.getNode() == Load.getNode())
837 Ops.push_back(Load.getOperand(0));
838 else {
839 assert(Chain.getOpcode() == ISD::TokenFactor &&
840 "Unexpected chain operand");
841 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
842 if (Chain.getOperand(i).getNode() == Load.getNode())
843 Ops.push_back(Load.getOperand(0));
844 else
845 Ops.push_back(Chain.getOperand(i));
846 SDValue NewChain =
847 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
848 Ops.clear();
849 Ops.push_back(NewChain);
851 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
852 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
853 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
854 Load.getOperand(1), Load.getOperand(2));
856 Ops.clear();
857 Ops.push_back(SDValue(Load.getNode(), 1));
858 Ops.append(Call->op_begin() + 1, Call->op_end());
859 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
862 /// Return true if call address is a load and it can be
863 /// moved below CALLSEQ_START and the chains leading up to the call.
864 /// Return the CALLSEQ_START by reference as a second output.
865 /// In the case of a tail call, there isn't a callseq node between the call
866 /// chain and the load.
867 static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
868 // The transformation is somewhat dangerous if the call's chain was glued to
869 // the call. After moveBelowOrigChain the load is moved between the call and
870 // the chain; this can create a cycle if the load is not folded. So it is
871 // *really* important that we are sure the load will be folded.
872 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
873 return false;
874 auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
875 if (!LD ||
876 !LD->isSimple() ||
877 LD->getAddressingMode() != ISD::UNINDEXED ||
878 LD->getExtensionType() != ISD::NON_EXTLOAD)
879 return false;
881 // Now let's find the callseq_start.
882 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
883 if (!Chain.hasOneUse())
884 return false;
885 Chain = Chain.getOperand(0);
888 if (!Chain.getNumOperands())
889 return false;
890 // Since we are not checking for AA here, conservatively abort if the chain
891 // writes to memory. It's not safe to move the callee (a load) across a store.
892 if (isa<MemSDNode>(Chain.getNode()) &&
893 cast<MemSDNode>(Chain.getNode())->writeMem())
894 return false;
895 if (Chain.getOperand(0).getNode() == Callee.getNode())
896 return true;
897 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
898 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
899 Callee.getValue(1).hasOneUse())
900 return true;
901 return false;
904 static bool isEndbrImm64(uint64_t Imm) {
905 // There may be some other prefix bytes between 0xF3 and 0x0F1EFA,
906 // e.g. 0xF3660F1EFA, 0xF3670F1EFA.
907 if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
908 return false;
910 uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
911 0x65, 0x66, 0x67, 0xf0, 0xf2};
912 int i = 24; // The low 24 bits (0x0F1EFA) have already matched.
913 while (i < 64) {
914 uint8_t Byte = (Imm >> i) & 0xFF;
915 if (Byte == 0xF3)
916 return true;
917 if (!llvm::is_contained(OptionalPrefixBytes, Byte))
918 return false;
919 i += 8;
922 return false;
925 static bool needBWI(MVT VT) {
926 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
929 void X86DAGToDAGISel::PreprocessISelDAG() {
930 bool MadeChange = false;
931 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
932 E = CurDAG->allnodes_end(); I != E; ) {
933 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
935 // This is for CET enhancement.
937 // ENDBR32 and ENDBR64 have specific opcodes:
938 // ENDBR32: F3 0F 1E FB
939 // ENDBR64: F3 0F 1E FA
940 // We want to make sure attackers won't find unintended ENDBR32/64
941 // opcode matches in the binary.
942 // Here's an example:
943 // If the compiler had to generate asm for the following code:
944 // a = 0xF30F1EFA
945 // it could, for example, generate:
946 // mov dword ptr [a], 0xF30F1EFA
947 // In such a case, the binary would include a gadget that starts
948 // with a fake ENDBR64 opcode. Therefore, we split such constants
949 // into multiple operations so the pattern does not show up in the binary.
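// (Illustrative result; exact codegen may differ: the complemented value
// 0x0CF0E105 is materialized and then inverted, e.g.
//   mov eax, 0x0CF0E105
//   not eax
// so the ENDBR byte pattern never appears literally in the encoding.)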
950 if (N->getOpcode() == ISD::Constant) {
951 MVT VT = N->getSimpleValueType(0);
952 int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
953 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
954 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
955 // Check that the cf-protection-branch is enabled.
956 Metadata *CFProtectionBranch =
957 MF->getFunction().getParent()->getModuleFlag(
958 "cf-protection-branch");
959 if (CFProtectionBranch || IndirectBranchTracking) {
960 SDLoc dl(N);
961 SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
962 Complement = CurDAG->getNOT(dl, Complement, VT);
963 --I;
964 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
965 ++I;
966 MadeChange = true;
967 continue;
972 // If this is a target specific AND node with no flag usages, turn it back
973 // into ISD::AND to enable test instruction matching.
974 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
975 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
976 N->getOperand(0), N->getOperand(1));
977 --I;
978 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
979 ++I;
980 MadeChange = true;
981 continue;
984 // Convert vector increment or decrement to sub/add with an all-ones
985 // constant:
986 // add X, <1, 1...> --> sub X, <-1, -1...>
987 // sub X, <1, 1...> --> add X, <-1, -1...>
988 // The all-ones vector constant can be materialized using a pcmpeq
989 // instruction that is commonly recognized as an idiom (has no register
990 // dependency), so that's better/smaller than loading a splat 1 constant.
992 // But don't do this if it would inhibit a potentially profitable load
993 // folding opportunity for the other operand. That only occurs with the
994 // intersection of:
995 // (1) The other operand (op0) is load foldable.
996 // (2) The op is an add (otherwise, we are *creating* an add and can still
997 // load fold the other op).
998 // (3) The target has AVX (otherwise, we have a destructive add and can't
999 // load fold the other op without killing the constant op).
1000 // (4) The constant 1 vector has multiple uses (so it is profitable to load
1001 // into a register anyway).
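// (Illustrative, assuming SSE2: "add X, <1,1,1,1>" can be emitted as
//   pcmpeqd %xmm1, %xmm1   ; all-ones without touching memory
//   psubd   %xmm1, %xmm0   ; X - (-1) == X + 1
// instead of loading a splat-1 constant.)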
1002 auto mayPreventLoadFold = [&]() {
1003 return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
1004 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
1005 !N->getOperand(1).hasOneUse();
1007 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
1008 N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
1009 APInt SplatVal;
1010 if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
1011 SplatVal.isOne()) {
1012 SDLoc DL(N);
1014 MVT VT = N->getSimpleValueType(0);
1015 unsigned NumElts = VT.getSizeInBits() / 32;
1016 SDValue AllOnes =
1017 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
1018 AllOnes = CurDAG->getBitcast(VT, AllOnes);
1020 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
1021 SDValue Res =
1022 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
1023 --I;
1024 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1025 ++I;
1026 MadeChange = true;
1027 continue;
1031 switch (N->getOpcode()) {
1032 case X86ISD::VBROADCAST: {
1033 MVT VT = N->getSimpleValueType(0);
1034 // Emulate v32i16/v64i8 broadcast without BWI.
1035 if (!Subtarget->hasBWI() && needBWI(VT)) {
1036 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1037 SDLoc dl(N);
1038 SDValue NarrowBCast =
1039 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1040 SDValue Res =
1041 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1042 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1043 unsigned Index = NarrowVT.getVectorMinNumElements();
1044 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1045 CurDAG->getIntPtrConstant(Index, dl));
1047 --I;
1048 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1049 ++I;
1050 MadeChange = true;
1051 continue;
1054 break;
1056 case X86ISD::VBROADCAST_LOAD: {
1057 MVT VT = N->getSimpleValueType(0);
1058 // Emulate v32i16/v64i8 broadcast without BWI.
1059 if (!Subtarget->hasBWI() && needBWI(VT)) {
1060 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1061 auto *MemNode = cast<MemSDNode>(N);
1062 SDLoc dl(N);
1063 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1064 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1065 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1066 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1067 MemNode->getMemOperand());
1068 SDValue Res =
1069 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1070 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1071 unsigned Index = NarrowVT.getVectorMinNumElements();
1072 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1073 CurDAG->getIntPtrConstant(Index, dl));
1075 --I;
1076 SDValue To[] = {Res, NarrowBCast.getValue(1)};
1077 CurDAG->ReplaceAllUsesWith(N, To);
1078 ++I;
1079 MadeChange = true;
1080 continue;
1083 break;
1085 case ISD::LOAD: {
1086 // If this is an XMM/YMM load of the same lower bits as another YMM/ZMM
1087 // load, then just extract the lower subvector and avoid the second load.
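// (Illustrative: a v4f32 load and a v16f32 load from the same pointer and
// chain can share one memory access -- the narrow value becomes an
// extract_subvector of the wider load's result.)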
1088 auto *Ld = cast<LoadSDNode>(N);
1089 MVT VT = N->getSimpleValueType(0);
1090 if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1091 !(VT.is128BitVector() || VT.is256BitVector()))
1092 break;
1094 MVT MaxVT = VT;
1095 SDNode *MaxLd = nullptr;
1096 SDValue Ptr = Ld->getBasePtr();
1097 SDValue Chain = Ld->getChain();
1098 for (SDNode *User : Ptr->users()) {
1099 auto *UserLd = dyn_cast<LoadSDNode>(User);
1100 MVT UserVT = User->getSimpleValueType(0);
1101 if (User != N && UserLd && ISD::isNormalLoad(User) &&
1102 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1103 !User->hasAnyUseOfValue(1) &&
1104 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1105 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1106 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1107 MaxLd = User;
1108 MaxVT = UserVT;
1111 if (MaxLd) {
1112 SDLoc dl(N);
1113 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1114 MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1115 SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1116 SDValue(MaxLd, 0),
1117 CurDAG->getIntPtrConstant(0, dl));
1118 SDValue Res = CurDAG->getBitcast(VT, Extract);
1120 --I;
1121 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1122 CurDAG->ReplaceAllUsesWith(N, To);
1123 ++I;
1124 MadeChange = true;
1125 continue;
1127 break;
1129 case ISD::VSELECT: {
1130 // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1131 EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1132 if (EleVT == MVT::i1)
1133 break;
1135 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1136 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1137 "We can't replace VSELECT with BLENDV in vXi16!");
1138 SDValue R;
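// (Note: the VPTERNLOG immediate 0xCA below is the truth table for
// "A ? B : C", i.e. a bitwise select of the two data operands under the
// all-sign-bits condition.)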
1139 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1140 EleVT.getSizeInBits()) {
1141 R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1142 N->getOperand(0), N->getOperand(1), N->getOperand(2),
1143 CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1144 } else {
1145 R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1146 N->getOperand(0), N->getOperand(1),
1147 N->getOperand(2));
1149 --I;
1150 CurDAG->ReplaceAllUsesWith(N, R.getNode());
1151 ++I;
1152 MadeChange = true;
1153 continue;
1155 case ISD::FP_ROUND:
1156 case ISD::STRICT_FP_ROUND:
1157 case ISD::FP_TO_SINT:
1158 case ISD::FP_TO_UINT:
1159 case ISD::STRICT_FP_TO_SINT:
1160 case ISD::STRICT_FP_TO_UINT: {
1161 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1162 // don't need 2 sets of patterns.
1163 if (!N->getSimpleValueType(0).isVector())
1164 break;
1166 unsigned NewOpc;
1167 switch (N->getOpcode()) {
1168 default: llvm_unreachable("Unexpected opcode!");
1169 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1170 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1171 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1172 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1173 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1174 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1176 SDValue Res;
1177 if (N->isStrictFPOpcode())
1178 Res =
1179 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1180 {N->getOperand(0), N->getOperand(1)});
1181 else
1182 Res =
1183 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1184 N->getOperand(0));
1185 --I;
1186 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1187 ++I;
1188 MadeChange = true;
1189 continue;
1191 case ISD::SHL:
1192 case ISD::SRA:
1193 case ISD::SRL: {
1194 // Replace vector shifts with their X86 specific equivalent so we don't
1195 // need 2 sets of patterns.
1196 if (!N->getValueType(0).isVector())
1197 break;
1199 unsigned NewOpc;
1200 switch (N->getOpcode()) {
1201 default: llvm_unreachable("Unexpected opcode!");
1202 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1203 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1204 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1206 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1207 N->getOperand(0), N->getOperand(1));
1208 --I;
1209 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1210 ++I;
1211 MadeChange = true;
1212 continue;
1214 case ISD::ANY_EXTEND:
1215 case ISD::ANY_EXTEND_VECTOR_INREG: {
1216 // Replace vector any extend with the zero extend equivalents so we don't
1217 // need 2 sets of patterns. Ignore vXi1 extensions.
1218 if (!N->getValueType(0).isVector())
1219 break;
1221 unsigned NewOpc;
1222 if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1223 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1224 "Unexpected opcode for mask vector!");
1225 NewOpc = ISD::SIGN_EXTEND;
1226 } else {
1227 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1228 ? ISD::ZERO_EXTEND
1229 : ISD::ZERO_EXTEND_VECTOR_INREG;
1232 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1233 N->getOperand(0));
1234 --I;
1235 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1236 ++I;
1237 MadeChange = true;
1238 continue;
1240 case ISD::FCEIL:
1241 case ISD::STRICT_FCEIL:
1242 case ISD::FFLOOR:
1243 case ISD::STRICT_FFLOOR:
1244 case ISD::FTRUNC:
1245 case ISD::STRICT_FTRUNC:
1246 case ISD::FROUNDEVEN:
1247 case ISD::STRICT_FROUNDEVEN:
1248 case ISD::FNEARBYINT:
1249 case ISD::STRICT_FNEARBYINT:
1250 case ISD::FRINT:
1251 case ISD::STRICT_FRINT: {
1252 // Replace fp rounding with their X86 specific equivalent so we don't
1253 // need 2 sets of patterns.
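// (For reference: these immediates follow the SSE4.1 ROUND*/VRNDSCALE
// control encoding -- the low two bits pick the rounding mode, 0x4 selects
// the mode from MXCSR, and the 0x8 bit suppresses precision exceptions,
// which is why only FRINT uses an immediate without it.)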
1254 unsigned Imm;
1255 switch (N->getOpcode()) {
1256 default: llvm_unreachable("Unexpected opcode!");
1257 case ISD::STRICT_FCEIL:
1258 case ISD::FCEIL: Imm = 0xA; break;
1259 case ISD::STRICT_FFLOOR:
1260 case ISD::FFLOOR: Imm = 0x9; break;
1261 case ISD::STRICT_FTRUNC:
1262 case ISD::FTRUNC: Imm = 0xB; break;
1263 case ISD::STRICT_FROUNDEVEN:
1264 case ISD::FROUNDEVEN: Imm = 0x8; break;
1265 case ISD::STRICT_FNEARBYINT:
1266 case ISD::FNEARBYINT: Imm = 0xC; break;
1267 case ISD::STRICT_FRINT:
1268 case ISD::FRINT: Imm = 0x4; break;
1270 SDLoc dl(N);
1271 bool IsStrict = N->isStrictFPOpcode();
1272 SDValue Res;
1273 if (IsStrict)
1274 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1275 {N->getValueType(0), MVT::Other},
1276 {N->getOperand(0), N->getOperand(1),
1277 CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1278 else
1279 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1280 N->getOperand(0),
1281 CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1282 --I;
1283 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1284 ++I;
1285 MadeChange = true;
1286 continue;
1288 case X86ISD::FANDN:
1289 case X86ISD::FAND:
1290 case X86ISD::FOR:
1291 case X86ISD::FXOR: {
1292 // Widen scalar fp logic ops to vector to reduce isel patterns.
1293 // FIXME: Can we do this during lowering/combine.
1294 MVT VT = N->getSimpleValueType(0);
1295 if (VT.isVector() || VT == MVT::f128)
1296 break;
1298 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1299 : VT == MVT::f32 ? MVT::v4f32
1300 : MVT::v8f16;
1302 SDLoc dl(N);
1303 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1304 N->getOperand(0));
1305 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1306 N->getOperand(1));
1308 SDValue Res;
1309 if (Subtarget->hasSSE2()) {
1310 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1311 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1312 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1313 unsigned Opc;
1314 switch (N->getOpcode()) {
1315 default: llvm_unreachable("Unexpected opcode!");
1316 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1317 case X86ISD::FAND: Opc = ISD::AND; break;
1318 case X86ISD::FOR: Opc = ISD::OR; break;
1319 case X86ISD::FXOR: Opc = ISD::XOR; break;
1321 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1322 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1323 } else {
1324 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1326 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1327 CurDAG->getIntPtrConstant(0, dl));
1328 --I;
1329 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1330 ++I;
1331 MadeChange = true;
1332 continue;
1336 if (OptLevel != CodeGenOptLevel::None &&
1337 // Only do this when the target can fold the load into the call or
1338 // jmp.
1339 !Subtarget->useIndirectThunkCalls() &&
1340 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1341 (N->getOpcode() == X86ISD::TC_RETURN &&
1342 (Subtarget->is64Bit() ||
1343 !getTargetMachine().isPositionIndependent())))) {
1344 /// Also try moving call address load from outside callseq_start to just
1345 /// before the call to allow it to be folded.
1347 /// [Load chain]
1348 /// ^
1349 /// |
1350 /// [Load]
1351 /// ^ ^
1352 /// | |
1353 /// / \--
1354 /// / |
1355 ///[CALLSEQ_START] |
1356 /// ^ |
1357 /// | |
1358 /// [LOAD/C2Reg] |
1359 /// | |
1360 /// \ /
1361 /// \ /
1362 /// [CALL]
1363 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1364 SDValue Chain = N->getOperand(0);
1365 SDValue Load = N->getOperand(1);
1366 if (!isCalleeLoad(Load, Chain, HasCallSeq))
1367 continue;
1368 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1369 ++NumLoadMoved;
1370 MadeChange = true;
1371 continue;
1374 // Lower fpround and fpextend nodes that target the FP stack to a store and
1375 // load through the stack. This is a gross hack. We would like to simply mark
1376 // these as being illegal, but when we do that, legalize produces these when
1377 // it expands calls, then expands these in the same legalize pass. We would
1378 // like dag combine to be able to hack on these between the call expansion
1379 // and the node legalization. As such this pass basically does "really
1380 // late" legalization of these inline with the X86 isel pass.
1381 // FIXME: This should only happen when not compiled with -O0.
1382 switch (N->getOpcode()) {
1383 default: continue;
1384 case ISD::FP_ROUND:
1385 case ISD::FP_EXTEND:
1387 MVT SrcVT = N->getOperand(0).getSimpleValueType();
1388 MVT DstVT = N->getSimpleValueType(0);
1390 // If any of the sources are vectors, no fp stack involved.
1391 if (SrcVT.isVector() || DstVT.isVector())
1392 continue;
1394 // If the source and destination are SSE registers, then this is a legal
1395 // conversion that should not be lowered.
1396 const X86TargetLowering *X86Lowering =
1397 static_cast<const X86TargetLowering *>(TLI);
1398 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1399 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1400 if (SrcIsSSE && DstIsSSE)
1401 continue;
1403 if (!SrcIsSSE && !DstIsSSE) {
1404 // If this is an FPStack extension, it is a noop.
1405 if (N->getOpcode() == ISD::FP_EXTEND)
1406 continue;
1407 // If this is a value-preserving FPStack truncation, it is a noop.
1408 if (N->getConstantOperandVal(1))
1409 continue;
1412 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1413 // FPStack has extload and truncstore. SSE can fold direct loads into other
1414 // operations. Based on this, decide what we want to do.
1415 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1416 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1417 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1418 MachinePointerInfo MPI =
1419 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1420 SDLoc dl(N);
1422 // FIXME: optimize the case where the src/dest is a load or store?
1424 SDValue Store = CurDAG->getTruncStore(
1425 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1426 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1427 MemTmp, MPI, MemVT);
1429 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1430 // extload we created. This will cause general havoc on the DAG because
1431 // anything below the conversion could be folded into other existing nodes.
1432 // To avoid invalidating 'I', back it up to the convert node.
1433 --I;
1434 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1435 break;
1438 // The sequence of events for lowering STRICT_FP versions of these nodes requires
1439 // dealing with the chain differently, as there is already a preexisting chain.
1440 case ISD::STRICT_FP_ROUND:
1441 case ISD::STRICT_FP_EXTEND:
1443 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1444 MVT DstVT = N->getSimpleValueType(0);
1446 // If any of the sources are vectors, no fp stack involved.
1447 if (SrcVT.isVector() || DstVT.isVector())
1448 continue;
1450 // If the source and destination are SSE registers, then this is a legal
1451 // conversion that should not be lowered.
1452 const X86TargetLowering *X86Lowering =
1453 static_cast<const X86TargetLowering *>(TLI);
1454 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1455 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1456 if (SrcIsSSE && DstIsSSE)
1457 continue;
1459 if (!SrcIsSSE && !DstIsSSE) {
1460 // If this is an FPStack extension, it is a noop.
1461 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1462 continue;
1463 // If this is a value-preserving FPStack truncation, it is a noop.
1464 if (N->getConstantOperandVal(2))
1465 continue;
1468 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1469 // FPStack has extload and truncstore. SSE can fold direct loads into other
1470 // operations. Based on this, decide what we want to do.
1471 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1472 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1473 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1474 MachinePointerInfo MPI =
1475 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1476 SDLoc dl(N);
1478 // FIXME: optimize the case where the src/dest is a load or store?
1480 // Since the operation is StrictFP, use the preexisting chain.
1481 SDValue Store, Result;
1482 if (!SrcIsSSE) {
1483 SDVTList VTs = CurDAG->getVTList(MVT::Other);
1484 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1485 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1486 MPI, /*Align*/ std::nullopt,
1487 MachineMemOperand::MOStore);
1488 if (N->getFlags().hasNoFPExcept()) {
1489 SDNodeFlags Flags = Store->getFlags();
1490 Flags.setNoFPExcept(true);
1491 Store->setFlags(Flags);
1493 } else {
1494 assert(SrcVT == MemVT && "Unexpected VT!");
1495 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1496 MPI);
1499 if (!DstIsSSE) {
1500 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1501 SDValue Ops[] = {Store, MemTmp};
1502 Result = CurDAG->getMemIntrinsicNode(
1503 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1504 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1505 if (N->getFlags().hasNoFPExcept()) {
1506 SDNodeFlags Flags = Result->getFlags();
1507 Flags.setNoFPExcept(true);
1508 Result->setFlags(Flags);
1510 } else {
1511 assert(DstVT == MemVT && "Unexpected VT!");
1512 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1515 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1517 // extload we created. This will cause general havoc on the DAG because
1517 // anything below the conversion could be folded into other existing nodes.
1518 // To avoid invalidating 'I', back it up to the convert node.
1519 --I;
1520 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1521 break;
1526 // Now that we did that, the node is dead. Increment the iterator to the
1527 // next node to process, then delete N.
1528 ++I;
1529 MadeChange = true;
1532 // Remove any dead nodes that may have been left behind.
1533 if (MadeChange)
1534 CurDAG->RemoveDeadNodes();
1537 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
1538 bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1539 unsigned Opc = N->getMachineOpcode();
1540 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1541 Opc != X86::MOVSX64rr8)
1542 return false;
1544 SDValue N0 = N->getOperand(0);
1546 // We need to be extracting the low byte (sub_8bit) of an extend.
1547 if (!N0.isMachineOpcode() ||
1548 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1549 N0.getConstantOperandVal(1) != X86::sub_8bit)
1550 return false;
1552 // We're looking for either a movsx or movzx to match the original opcode.
1553 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1554 : X86::MOVSX32rr8_NOREX;
1555 SDValue N00 = N0.getOperand(0);
1556 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1557 return false;
1559 if (Opc == X86::MOVSX64rr8) {
1560 // If we had a sign extend from 8 to 64 bits, we still need to go from 32
1561 // to 64.
1562 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1563 MVT::i64, N00);
1564 ReplaceUses(N, Extend);
1565 } else {
1566 // Ok we can drop this extend and just use the original extend.
1567 ReplaceUses(N, N00.getNode());
1570 return true;
1573 void X86DAGToDAGISel::PostprocessISelDAG() {
1574 // Skip peepholes at -O0.
1575 if (TM.getOptLevel() == CodeGenOptLevel::None)
1576 return;
1578 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1580 bool MadeChange = false;
1581 while (Position != CurDAG->allnodes_begin()) {
1582 SDNode *N = &*--Position;
1583 // Skip dead nodes and any non-machine opcodes.
1584 if (N->use_empty() || !N->isMachineOpcode())
1585 continue;
1587 if (tryOptimizeRem8Extend(N)) {
1588 MadeChange = true;
1589 continue;
1592 unsigned Opc = N->getMachineOpcode();
1593 switch (Opc) {
1594 default:
1595 continue;
1596 // ANDrr/rm + TESTrr -> TESTrr/TESTmr
1597 case X86::TEST8rr:
1598 case X86::TEST16rr:
1599 case X86::TEST32rr:
1600 case X86::TEST64rr:
1601 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1602 case X86::CTEST8rr:
1603 case X86::CTEST16rr:
1604 case X86::CTEST32rr:
1605 case X86::CTEST64rr: {
1606 auto &Op0 = N->getOperand(0);
1607 if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
1608 !Op0.isMachineOpcode())
1609 continue;
1610 SDValue And = N->getOperand(0);
1611 #define CASE_ND(OP) \
1612 case X86::OP: \
1613 case X86::OP##_ND:
1614 switch (And.getMachineOpcode()) {
1615 default:
1616 continue;
1617 CASE_ND(AND8rr)
1618 CASE_ND(AND16rr)
1619 CASE_ND(AND32rr)
1620 CASE_ND(AND64rr) {
1621 if (And->hasAnyUseOfValue(1))
1622 continue;
1623 SmallVector<SDValue> Ops(N->op_values());
1624 Ops[0] = And.getOperand(0);
1625 Ops[1] = And.getOperand(1);
1626 MachineSDNode *Test =
1627 CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);
1628 ReplaceUses(N, Test);
1629 MadeChange = true;
1630 continue;
1632 CASE_ND(AND8rm)
1633 CASE_ND(AND16rm)
1634 CASE_ND(AND32rm)
1635 CASE_ND(AND64rm) {
1636 if (And->hasAnyUseOfValue(1))
1637 continue;
1638 unsigned NewOpc;
1639 bool IsCTESTCC = X86::isCTESTCC(Opc);
1640 #define FROM_TO(A, B) \
1641 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1642 break;
1643 switch (And.getMachineOpcode()) {
1644 FROM_TO(AND8rm, TEST8mr);
1645 FROM_TO(AND16rm, TEST16mr);
1646 FROM_TO(AND32rm, TEST32mr);
1647 FROM_TO(AND64rm, TEST64mr);
1649 #undef FROM_TO
1650 #undef CASE_ND
1651 // Need to swap the memory and register operands.
1652 SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),
1653 And.getOperand(3), And.getOperand(4),
1654 And.getOperand(5), And.getOperand(0)};
1655 // CC, Cflags.
1656 if (IsCTESTCC) {
1657 Ops.push_back(N->getOperand(2));
1658 Ops.push_back(N->getOperand(3));
1660 // Chain of memory load
1661 Ops.push_back(And.getOperand(6));
1662 // Glue
1663 if (IsCTESTCC)
1664 Ops.push_back(N->getOperand(4));
1666 MachineSDNode *Test = CurDAG->getMachineNode(
1667 NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
1668 CurDAG->setNodeMemRefs(
1669 Test, cast<MachineSDNode>(And.getNode())->memoperands());
1670 ReplaceUses(And.getValue(2), SDValue(Test, 1));
1671 ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1672 MadeChange = true;
1673 continue;
1677 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1678 // used. We're doing this late so we can prefer to fold the AND into masked
1679 // comparisons. Doing that can be better for the live range of the mask
1680 // register.
1681 case X86::KORTESTBkk:
1682 case X86::KORTESTWkk:
1683 case X86::KORTESTDkk:
1684 case X86::KORTESTQkk: {
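// Sketch of the rewrite (hypothetical mask registers): when only ZF is read,
//   %k0 = KAND %k1, %k2; KORTEST %k0, %k0
// becomes
//   KTEST %k1, %k2
// since KTEST sets ZF exactly when the AND of its operands is zero.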
1685 SDValue Op0 = N->getOperand(0);
1686 if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
1687 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))
1688 continue;
1689 #define CASE(A) \
1690 case X86::A: \
1691 break;
1692 switch (Op0.getMachineOpcode()) {
1693 default:
1694 continue;
1695 CASE(KANDBkk)
1696 CASE(KANDWkk)
1697 CASE(KANDDkk)
1698 CASE(KANDQkk)
1700 unsigned NewOpc;
1701 #define FROM_TO(A, B) \
1702 case X86::A: \
1703 NewOpc = X86::B; \
1704 break;
1705 switch (Opc) {
1706 FROM_TO(KORTESTBkk, KTESTBkk)
1707 FROM_TO(KORTESTWkk, KTESTWkk)
1708 FROM_TO(KORTESTDkk, KTESTDkk)
1709 FROM_TO(KORTESTQkk, KTESTQkk)
1711 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1712 // KAND instructions and KTEST use the same ISA feature.
1713 if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
1714 continue;
1715 #undef FROM_TO
1716 MachineSDNode *KTest = CurDAG->getMachineNode(
1717 NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
1718 ReplaceUses(N, KTest);
1719 MadeChange = true;
1720 continue;
1722 // Attempt to remove vector moves that were inserted to zero the upper bits.
1723 case TargetOpcode::SUBREG_TO_REG: {
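// Sketch of the pattern being cleaned up (hypothetical values):
//   %x = <VEX/EVEX/XOP-encoded vector op> ...
//   %m = VMOV*rr %x
//   SUBREG_TO_REG 0, %m, sub_xmm
// The VEX/EVEX producer already zeroed the upper bits, so SUBREG_TO_REG can
// use %x directly and the move can be dropped.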
1724 unsigned SubRegIdx = N->getConstantOperandVal(2);
1725 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1726 continue;
1728 SDValue Move = N->getOperand(1);
1729 if (!Move.isMachineOpcode())
1730 continue;
1732 // Make sure it's one of the move opcodes we recognize.
1733 switch (Move.getMachineOpcode()) {
1734 default:
1735 continue;
1736 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1737 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1738 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1739 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1740 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1741 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1742 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1743 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1744 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1745 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1746 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1747 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1748 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1749 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1751 #undef CASE
1753 SDValue In = Move.getOperand(0);
1754 if (!In.isMachineOpcode() ||
1755 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1756 continue;
1758 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
1759 // legacy-encoded instructions such as SHA, which do not zero the upper bits.
1760 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1761 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1762 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1763 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1764 continue;
1766 // The producing instruction is another vector instruction, so we can drop
1767 // the move.
1768 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1769 MadeChange = true;
1774 if (MadeChange)
1775 CurDAG->RemoveDeadNodes();
1779 /// Emit any code that needs to be executed only in the main function.
1780 void X86DAGToDAGISel::emitSpecialCodeForMain() {
1781 if (Subtarget->isTargetCygMing()) {
1782 TargetLowering::ArgListTy Args;
1783 auto &DL = CurDAG->getDataLayout();
1785 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1786 CLI.setChain(CurDAG->getRoot())
1787 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1788 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1789 std::move(Args));
1790 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1791 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1792 CurDAG->setRoot(Result.second);
1796 void X86DAGToDAGISel::emitFunctionEntryCode() {
1797 // If this is main, emit special code for main.
1798 const Function &F = MF->getFunction();
1799 if (F.hasExternalLinkage() && F.getName() == "main")
1800 emitSpecialCodeForMain();
1803 static bool isDispSafeForFrameIndex(int64_t Val) {
1804 // On 64-bit platforms, we can run into an issue where a frame index
1805 // includes a displacement that, when added to the explicit displacement,
1806 // will overflow the displacement field. Assuming that the frame index
1807 // displacement fits into a 31-bit integer (which is only slightly more
1808 // aggressive than the current fundamental assumption that it fits into
1809 // a 32-bit integer), a 31-bit disp should always be safe.
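// For example, a combined displacement of (1 << 30) - 1 is accepted here,
// while (1 << 30) itself is rejected, leaving headroom for whatever the
// frame index later contributes.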
1810 return isInt<31>(Val);
1813 bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1814 X86ISelAddressMode &AM) {
1815 // We may have already matched a displacement and the caller just added the
1816 // symbolic displacement. So we still need to do the checks even if Offset
1817 // is zero.
1819 int64_t Val = AM.Disp + Offset;
1821 // Cannot combine ExternalSymbol displacements with integer offsets.
1822 if (Val != 0 && (AM.ES || AM.MCSym))
1823 return true;
1825 CodeModel::Model M = TM.getCodeModel();
1826 if (Subtarget->is64Bit()) {
1827 if (Val != 0 &&
1828 !X86::isOffsetSuitableForCodeModel(Val, M,
1829 AM.hasSymbolicDisplacement()))
1830 return true;
1831 // In addition to the checks required for a register base, check that
1832 // we do not try to use an unsafe Disp with a frame index.
1833 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1834 !isDispSafeForFrameIndex(Val))
1835 return true;
1836 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1837 // 64 bits. Instructions with 32-bit register addresses perform this zero
1838 // extension for us and we can safely ignore the high bits of Offset.
1839 // Instructions with only a 32-bit immediate address do not, though: they
1840 // sign extend instead. This means only the low 2GB of the address space is
1841 // directly addressable; we need indirect addressing for the high 2GB of
1842 // address space.
1843 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1844 // implicit zero extension of instructions would cover up any problem.
1845 // However, we have asserts elsewhere that get triggered if we do, so keep
1846 // the checks for now.
1847 // TODO: We would actually be able to accept these, as well as the same
1848 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1849 // to get an address size override to be emitted. However, this
1850 // pseudo-register is not part of any register class and therefore causes
1851 // MIR verification to fail.
1852 if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) &&
1853 !AM.hasBaseOrIndexReg())
1854 return true;
1856 AM.Disp = Val;
1857 return false;
1860 bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1861 bool AllowSegmentRegForX32) {
1862 SDValue Address = N->getOperand(1);
1864 // load gs:0 -> GS segment register.
1865 // load fs:0 -> FS segment register.
1867 // This optimization is generally valid because the GNU TLS model defines that
1868 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1869 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1870 // zero-extended to 64 bits and then added to the base address, which gives
1871 // unwanted results when the register holds a negative value.
1872 // For more information see http://people.redhat.com/drepper/tls.pdf
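// Illustrative local-exec sequence on x86-64 ELF (exact relocations may
// differ): the address of a TLS variable x is commonly formed as
//   movq %fs:0, %rax
//   leaq x@tpoff(%rax), %rax
// Folding the first load into a segment register lets the access collapse
// into a single %fs-relative memory operand.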
1873 if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1874 !IndirectTlsSegRefs &&
1875 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1876 Subtarget->isTargetFuchsia())) {
1877 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1878 return true;
1879 switch (N->getPointerInfo().getAddrSpace()) {
1880 case X86AS::GS:
1881 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1882 return false;
1883 case X86AS::FS:
1884 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1885 return false;
1886 // Address space X86AS::SS is not handled here, because it is not used to
1887 // address TLS areas.
1891 return true;
1894 /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1895 /// mode. These wrap things that will resolve down into a symbol reference.
1896 /// If no match is possible, this returns true, otherwise it returns false.
1897 bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1898 // If the addressing mode already has a symbol as the displacement, we can
1899 // never match another symbol.
1900 if (AM.hasSymbolicDisplacement())
1901 return true;
1903 bool IsRIPRelTLS = false;
1904 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1905 if (IsRIPRel) {
1906 SDValue Val = N.getOperand(0);
1907 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1908 IsRIPRelTLS = true;
1911 // We can't use an addressing mode in the 64-bit large code model.
1912 // Global TLS addressing is an exception. In the medium code model,
1913 // we can use a mode when RIP wrappers are present.
1914 // That signifies access to globals that are known to be "near",
1915 // such as the GOT itself.
1916 CodeModel::Model M = TM.getCodeModel();
1917 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1918 return true;
1920 // Base and index reg must be 0 in order to use %rip as base.
1921 if (IsRIPRel && AM.hasBaseOrIndexReg())
1922 return true;
1924 // Make a local copy in case we can't do this fold.
1925 X86ISelAddressMode Backup = AM;
1927 int64_t Offset = 0;
1928 SDValue N0 = N.getOperand(0);
1929 if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1930 AM.GV = G->getGlobal();
1931 AM.SymbolFlags = G->getTargetFlags();
1932 Offset = G->getOffset();
1933 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1934 AM.CP = CP->getConstVal();
1935 AM.Alignment = CP->getAlign();
1936 AM.SymbolFlags = CP->getTargetFlags();
1937 Offset = CP->getOffset();
1938 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1939 AM.ES = S->getSymbol();
1940 AM.SymbolFlags = S->getTargetFlags();
1941 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1942 AM.MCSym = S->getMCSymbol();
1943 } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1944 AM.JT = J->getIndex();
1945 AM.SymbolFlags = J->getTargetFlags();
1946 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1947 AM.BlockAddr = BA->getBlockAddress();
1948 AM.SymbolFlags = BA->getTargetFlags();
1949 Offset = BA->getOffset();
1950 } else
1951 llvm_unreachable("Unhandled symbol reference node.");
1953 // Can't use an addressing mode with large globals.
1954 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1955 TM.isLargeGlobalValue(AM.GV)) {
1956 AM = Backup;
1957 return true;
1960 if (foldOffsetIntoAddress(Offset, AM)) {
1961 AM = Backup;
1962 return true;
1965 if (IsRIPRel)
1966 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1968 // Commit the changes now that we know this fold is safe.
1969 return false;
1972 /// Add the specified node to the specified addressing mode, returning true if
1973 /// it cannot be done. This just pattern matches for the addressing mode.
1974 bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1975 if (matchAddressRecursively(N, AM, 0))
1976 return true;
1978 // Post-processing: Make a second attempt to fold a load, if we now know
1979 // that there will not be any other register. This is only performed for
1980 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1981 // any foldable load the first time.
1982 if (Subtarget->isTarget64BitILP32() &&
1983 AM.BaseType == X86ISelAddressMode::RegBase &&
1984 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1985 SDValue Save_Base_Reg = AM.Base_Reg;
1986 if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
1987 AM.Base_Reg = SDValue();
1988 if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
1989 AM.Base_Reg = Save_Base_Reg;
1993 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1994 // a smaller encoding and avoids a scaled-index.
1995 if (AM.Scale == 2 &&
1996 AM.BaseType == X86ISelAddressMode::RegBase &&
1997 AM.Base_Reg.getNode() == nullptr) {
1998 AM.Base_Reg = AM.IndexReg;
1999 AM.Scale = 1;
2002 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
2003 // because it has a smaller encoding.
2004 if (TM.getCodeModel() != CodeModel::Large &&
2005 (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
2006 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
2007 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
2008 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
2009 // However, when GV is a local function symbol and in the same section as
2010 // the current instruction, and AM.Disp is negative and near INT32_MIN,
2011 // referencing GV+Disp generates a relocation referencing the section symbol
2012 // with an even smaller offset, which might underflow. We should bail out if
2013 // the negative offset is too close to INT32_MIN. Actually, we are more
2014 // conservative here, using a smaller magic number also used by
2015 // isOffsetSuitableForCodeModel.
2016 if (isa_and_nonnull<Function>(AM.GV) && AM.Disp < -16 * 1024 * 1024)
2017 return true;
2019 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
2022 return false;
2025 bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
2026 unsigned Depth) {
2027 // Add an artificial use to this node so that we can keep track of
2028 // it if it gets CSE'd with a different node.
2029 HandleSDNode Handle(N);
2031 X86ISelAddressMode Backup = AM;
2032 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
2033 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
2034 return false;
2035 AM = Backup;
2037 // Try again after commuting the operands.
2038 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
2039 Depth + 1) &&
2040 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
2041 return false;
2042 AM = Backup;
2044 // If we couldn't fold both operands into the address at the same time,
2045 // see if we can just put each operand into a register and fold at least
2046 // the add.
2047 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2048 !AM.Base_Reg.getNode() &&
2049 !AM.IndexReg.getNode()) {
2050 N = Handle.getValue();
2051 AM.Base_Reg = N.getOperand(0);
2052 AM.IndexReg = N.getOperand(1);
2053 AM.Scale = 1;
2054 return false;
2056 N = Handle.getValue();
2057 return true;
2060 // Insert a node into the DAG at least before the Pos node's position. This
2061 // will reposition the node as needed, and will assign it a node ID that is <=
2062 // the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2063 // IDs! The selection DAG must no longer depend on their uniqueness when this
2064 // is used.
2065 static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2066 if (N->getNodeId() == -1 ||
2067 (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
2068 SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
2069 DAG.RepositionNode(Pos->getIterator(), N.getNode());
2070 // Mark Node as invalid for pruning as after this it may be a successor to a
2071 // selected node but otherwise be in the same position as Pos.
2072 // Conservatively mark it with the same -abs(Id) to assure node id
2073 // invariant is preserved.
2074 N->setNodeId(Pos->getNodeId());
2075 SelectionDAGISel::InvalidateNodeId(N.getNode());
2079 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2080 // safe. This allows us to convert the shift and the AND into an h-register
2081 // extract and a scaled index. Returns false if the simplification is
2082 // performed.
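// Worked example (assuming C1 == 2): "(X >> 6) & 0x3fc" becomes
// "((X >> 8) & 0xff) << 2", i.e. the h-register extract (X >> 8) & 0xff
// lands in the index register and the "<< 2" becomes a scale of 4.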
2083 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2084 uint64_t Mask,
2085 SDValue Shift, SDValue X,
2086 X86ISelAddressMode &AM) {
2087 if (Shift.getOpcode() != ISD::SRL ||
2088 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2089 !Shift.hasOneUse())
2090 return true;
2092 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
2093 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2094 Mask != (0xffu << ScaleLog))
2095 return true;
2097 MVT XVT = X.getSimpleValueType();
2098 MVT VT = N.getSimpleValueType();
2099 SDLoc DL(N);
2100 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2101 SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2102 SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2103 SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2104 SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2105 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2106 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2108 // Insert the new nodes into the topological ordering. We must do this in
2109 // a valid topological ordering as nothing is going to go back and re-sort
2110 // these nodes. We continually insert before 'N' in sequence as this is
2111 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2112 // hierarchy left to express.
2113 insertDAGNode(DAG, N, Eight);
2114 insertDAGNode(DAG, N, NewMask);
2115 insertDAGNode(DAG, N, Srl);
2116 insertDAGNode(DAG, N, And);
2117 insertDAGNode(DAG, N, Ext);
2118 insertDAGNode(DAG, N, ShlCount);
2119 insertDAGNode(DAG, N, Shl);
2120 DAG.ReplaceAllUsesWith(N, Shl);
2121 DAG.RemoveDeadNode(N.getNode());
2122 AM.IndexReg = Ext;
2123 AM.Scale = (1 << ScaleLog);
2124 return false;
2127 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2128 // allows us to fold the shift into this addressing mode. Returns false if the
2129 // transform succeeded.
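// Worked example (assuming C1 == 3, C2 == 0xf8): "(X << 3) & 0xf8" becomes
// "(X & 0x1f) << 3", so the AND stays on the index register and the "<< 3"
// is absorbed as a scale of 8.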
2130 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2131 X86ISelAddressMode &AM) {
2132 SDValue Shift = N.getOperand(0);
2134 // Use a signed mask so that shifting right will insert sign bits. These
2135 // bits will be removed when we shift the result left so it doesn't matter
2136 // what we use. This might allow a smaller immediate encoding.
2137 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2139 // If we have an any_extend feeding the AND, look through it to see if there
2140 // is a shift behind it. But only if the AND doesn't use the extended bits.
2141 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2142 bool FoundAnyExtend = false;
2143 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2144 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2145 isUInt<32>(Mask)) {
2146 FoundAnyExtend = true;
2147 Shift = Shift.getOperand(0);
2150 if (Shift.getOpcode() != ISD::SHL ||
2151 !isa<ConstantSDNode>(Shift.getOperand(1)))
2152 return true;
2154 SDValue X = Shift.getOperand(0);
2156 // Not likely to be profitable if either the AND or SHIFT node has more
2157 // than one use (unless all uses are for address computation). Besides,
2158 // the isel mechanism requires their node IDs to be reused.
2159 if (!N.hasOneUse() || !Shift.hasOneUse())
2160 return true;
2162 // Verify that the shift amount is something we can fold.
2163 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2164 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2165 return true;
2167 MVT VT = N.getSimpleValueType();
2168 SDLoc DL(N);
2169 if (FoundAnyExtend) {
2170 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2171 insertDAGNode(DAG, N, NewX);
2172 X = NewX;
2175 SDValue NewMask = DAG.getSignedConstant(Mask >> ShiftAmt, DL, VT);
2176 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2177 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2179 // Insert the new nodes into the topological ordering. We must do this in
2180 // a valid topological ordering as nothing is going to go back and re-sort
2181 // these nodes. We continually insert before 'N' in sequence as this is
2182 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2183 // hierarchy left to express.
2184 insertDAGNode(DAG, N, NewMask);
2185 insertDAGNode(DAG, N, NewAnd);
2186 insertDAGNode(DAG, N, NewShift);
2187 DAG.ReplaceAllUsesWith(N, NewShift);
2188 DAG.RemoveDeadNode(N.getNode());
2190 AM.Scale = 1 << ShiftAmt;
2191 AM.IndexReg = NewAnd;
2192 return false;
2195 // Implement some heroics to detect shifts of masked values where the mask can
2196 // be replaced by extending the shift and undoing that in the addressing mode
2197 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2198 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2199 // the addressing mode. This results in code such as:
2201 // int f(short *y, int *lookup_table) {
2202 // ...
2203 // return *y + lookup_table[*y >> 11];
2204 // }
2206 // Turning into:
2207 // movzwl (%rdi), %eax
2208 // movl %eax, %ecx
2209 // shrl $11, %ecx
2210 // addl (%rsi,%rcx,4), %eax
2212 // Instead of:
2213 // movzwl (%rdi), %eax
2214 // movl %eax, %ecx
2215 // shrl $9, %ecx
2216 // andl $124, %ecx
2217 // addl (%rsi,%rcx), %eax
2219 // Note that this function assumes the mask is provided as a mask *after* the
2220 // value is shifted. The input chain may or may not match that, but computing
2221 // such a mask is trivial.
2222 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2223 uint64_t Mask,
2224 SDValue Shift, SDValue X,
2225 X86ISelAddressMode &AM) {
2226 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2227 !isa<ConstantSDNode>(Shift.getOperand(1)))
2228 return true;
2231 // We need to ensure that the mask is a contiguous run of bits.
2231 unsigned MaskIdx, MaskLen;
2232 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2233 return true;
2234 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2236 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2238 // The amount of shift we're trying to fit into the addressing mode is taken
2239 // from the shifted mask index (number of trailing zeros of the mask).
2240 unsigned AMShiftAmt = MaskIdx;
2242 // There is nothing we can do here unless the mask is removing some bits.
2243 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2244 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2246 // Scale the leading zero count down based on the actual size of the value.
2247 // Also scale it down based on the size of the shift.
2248 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2249 if (MaskLZ < ScaleDown)
2250 return true;
2251 MaskLZ -= ScaleDown;
2253 // The final check is to ensure that any masked out high bits of X are
2254 // already known to be zero. Otherwise, the mask has a semantic impact
2255 // other than masking out a couple of low bits. Unfortunately, because of
2256 // the mask, zero extensions will be removed from operands in some cases.
2257 // This code works extra hard to look through extensions because we can
2258 // replace them with zero extensions cheaply if necessary.
2259 bool ReplacingAnyExtend = false;
2260 if (X.getOpcode() == ISD::ANY_EXTEND) {
2261 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2262 X.getOperand(0).getSimpleValueType().getSizeInBits();
2263 // Assume that we'll replace the any-extend with a zero-extend, and
2264 // narrow the search to the extended value.
2265 X = X.getOperand(0);
2266 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2267 ReplacingAnyExtend = true;
2269 APInt MaskedHighBits =
2270 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2271 if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2272 return true;
2274 // We've identified a pattern that can be transformed into a single shift
2275 // and an addressing mode. Make it so.
2276 MVT VT = N.getSimpleValueType();
2277 if (ReplacingAnyExtend) {
2278 assert(X.getValueType() != VT);
2279 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2280 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2281 insertDAGNode(DAG, N, NewX);
2282 X = NewX;
2285 MVT XVT = X.getSimpleValueType();
2286 SDLoc DL(N);
2287 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2288 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2289 SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2290 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2291 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2293 // Insert the new nodes into the topological ordering. We must do this in
2294 // a valid topological ordering as nothing is going to go back and re-sort
2295 // these nodes. We continually insert before 'N' in sequence as this is
2296 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2297 // hierarchy left to express.
2298 insertDAGNode(DAG, N, NewSRLAmt);
2299 insertDAGNode(DAG, N, NewSRL);
2300 insertDAGNode(DAG, N, NewExt);
2301 insertDAGNode(DAG, N, NewSHLAmt);
2302 insertDAGNode(DAG, N, NewSHL);
2303 DAG.ReplaceAllUsesWith(N, NewSHL);
2304 DAG.RemoveDeadNode(N.getNode());
2306 AM.Scale = 1 << AMShiftAmt;
2307 AM.IndexReg = NewExt;
2308 return false;
2311 // Transform "(X >> SHIFT) & (MASK << C1)" to
2312 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2313 // matched to a BEXTR later. Returns false if the simplification is performed.
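// Worked example (assuming SHIFT == 5 and a mask of 0x3fc, i.e. C1 == 2):
// "(X >> 5) & 0x3fc" becomes "((X >> 7) & 0xff) << 2"; the SRL+AND ahead of
// the SHL is what can later match as BEXTR, and the "<< 2" becomes a scale
// of 4.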
2314 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2315 uint64_t Mask,
2316 SDValue Shift, SDValue X,
2317 X86ISelAddressMode &AM,
2318 const X86Subtarget &Subtarget) {
2319 if (Shift.getOpcode() != ISD::SRL ||
2320 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2321 !Shift.hasOneUse() || !N.hasOneUse())
2322 return true;
2324 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2325 if (!Subtarget.hasTBM() &&
2326 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2327 return true;
2330 // We need to ensure that the mask is a contiguous run of bits.
2330 unsigned MaskIdx, MaskLen;
2331 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2332 return true;
2334 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2336 // The amount of shift we're trying to fit into the addressing mode is taken
2337 // from the shifted mask index (number of trailing zeros of the mask).
2338 unsigned AMShiftAmt = MaskIdx;
2340 // There is nothing we can do here unless the mask is removing some bits.
2341 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2342 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2344 MVT XVT = X.getSimpleValueType();
2345 MVT VT = N.getSimpleValueType();
2346 SDLoc DL(N);
2347 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2348 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2349 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2350 SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2351 SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2352 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2353 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2355 // Insert the new nodes into the topological ordering. We must do this in
2356 // a valid topological ordering as nothing is going to go back and re-sort
2357 // these nodes. We continually insert before 'N' in sequence as this is
2358 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2359 // hierarchy left to express.
2360 insertDAGNode(DAG, N, NewSRLAmt);
2361 insertDAGNode(DAG, N, NewSRL);
2362 insertDAGNode(DAG, N, NewMask);
2363 insertDAGNode(DAG, N, NewAnd);
2364 insertDAGNode(DAG, N, NewExt);
2365 insertDAGNode(DAG, N, NewSHLAmt);
2366 insertDAGNode(DAG, N, NewSHL);
2367 DAG.ReplaceAllUsesWith(N, NewSHL);
2368 DAG.RemoveDeadNode(N.getNode());
2370 AM.Scale = 1 << AMShiftAmt;
2371 AM.IndexReg = NewExt;
2372 return false;
2375 // Attempt to peek further into a scaled index register, collecting additional
2376 // extensions / offsets / etc. Returns \p N if we can't peek any further.
2377 SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2378 X86ISelAddressMode &AM,
2379 unsigned Depth) {
2380 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2381 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2382 "Illegal index scale");
2384 // Limit recursion.
2385 if (Depth >= SelectionDAG::MaxRecursionDepth)
2386 return N;
2388 EVT VT = N.getValueType();
2389 unsigned Opc = N.getOpcode();
2391 // index: add(x,c) -> index: x, disp + c
2392 if (CurDAG->isBaseWithConstantOffset(N)) {
2393 auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2394 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2395 if (!foldOffsetIntoAddress(Offset, AM))
2396 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2399 // index: add(x,x) -> index: x, scale * 2
2400 if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2401 if (AM.Scale <= 4) {
2402 AM.Scale *= 2;
2403 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2407 // index: shl(x,i) -> index: x, scale * (1 << i)
2408 if (Opc == X86ISD::VSHLI) {
2409 uint64_t ShiftAmt = N.getConstantOperandVal(1);
2410 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2411 if ((AM.Scale * ScaleAmt) <= 8) {
2412 AM.Scale *= ScaleAmt;
2413 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2417 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2418 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2419 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2420 SDValue Src = N.getOperand(0);
2421 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2422 Src.hasOneUse()) {
2423 if (CurDAG->isBaseWithConstantOffset(Src)) {
2424 SDValue AddSrc = Src.getOperand(0);
2425 auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2426 int64_t Offset = AddVal->getSExtValue();
2427 if (!foldOffsetIntoAddress((uint64_t)Offset * AM.Scale, AM)) {
2428 SDLoc DL(N);
2429 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2430 SDValue ExtVal = CurDAG->getSignedConstant(Offset, DL, VT);
2431 SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2432 insertDAGNode(*CurDAG, N, ExtSrc);
2433 insertDAGNode(*CurDAG, N, ExtVal);
2434 insertDAGNode(*CurDAG, N, ExtAdd);
2435 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2436 CurDAG->RemoveDeadNode(N.getNode());
2437 return ExtSrc;
2443 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2444 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2445 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt the zext?
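// For example (illustrative types): a zext to i64 of (add nuw i32 %x, 20)
// used under an existing scale of 4 becomes index = (zext %x), with
// 20 * 4 == 80 folded into the displacement.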
2446 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2447 SDValue Src = N.getOperand(0);
2448 unsigned SrcOpc = Src.getOpcode();
2449 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2450 CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
2451 Src.hasOneUse()) {
2452 if (CurDAG->isBaseWithConstantOffset(Src)) {
2453 SDValue AddSrc = Src.getOperand(0);
2454 uint64_t Offset = Src.getConstantOperandVal(1);
2455 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2456 SDLoc DL(N);
2457 SDValue Res;
2458 // If we're also scaling, see if we can use that as well.
2459 if (AddSrc.getOpcode() == ISD::SHL &&
2460 isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2461 SDValue ShVal = AddSrc.getOperand(0);
2462 uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2463 APInt HiBits =
2464 APInt::getHighBitsSet(AddSrc.getScalarValueSizeInBits(), ShAmt);
2465 uint64_t ScaleAmt = 1ULL << ShAmt;
2466 if ((AM.Scale * ScaleAmt) <= 8 &&
2467 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2468 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2469 AM.Scale *= ScaleAmt;
2470 SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2471 SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2472 AddSrc.getOperand(1));
2473 insertDAGNode(*CurDAG, N, ExtShVal);
2474 insertDAGNode(*CurDAG, N, ExtShift);
2475 AddSrc = ExtShift;
2476 Res = ExtShVal;
2479 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2480 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2481 SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2482 insertDAGNode(*CurDAG, N, ExtSrc);
2483 insertDAGNode(*CurDAG, N, ExtVal);
2484 insertDAGNode(*CurDAG, N, ExtAdd);
2485 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2486 CurDAG->RemoveDeadNode(N.getNode());
2487 return Res ? Res : ExtSrc;
2493 // TODO: Handle extensions, shifted masks etc.
2494 return N;
2497 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2498 unsigned Depth) {
2499 SDLoc dl(N);
2500 LLVM_DEBUG({
2501 dbgs() << "MatchAddress: ";
2502 AM.dump(CurDAG);
2504 // Limit recursion.
2505 if (Depth >= SelectionDAG::MaxRecursionDepth)
2506 return matchAddressBase(N, AM);
2508 // If this is already a %rip relative address, we can only merge immediates
2509 // into it. Instead of handling this in every case, we handle it here.
2510 // RIP relative addressing: %rip + 32-bit displacement!
2511 if (AM.isRIPRelative()) {
2512 // FIXME: JumpTable and ExternalSymbol address currently don't like
2513 // displacements. It isn't very important, but this should be fixed for
2514 // consistency.
2515 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2516 return true;
2518 if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2519 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2520 return false;
2521 return true;
2524 switch (N.getOpcode()) {
2525 default: break;
2526 case ISD::LOCAL_RECOVER: {
2527 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2528 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2529 // Use the symbol and don't prefix it.
2530 AM.MCSym = ESNode->getMCSymbol();
2531 return false;
2533 break;
2535 case ISD::Constant: {
2536 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2537 if (!foldOffsetIntoAddress(Val, AM))
2538 return false;
2539 break;
2542 case X86ISD::Wrapper:
2543 case X86ISD::WrapperRIP:
2544 if (!matchWrapper(N, AM))
2545 return false;
2546 break;
2548 case ISD::LOAD:
2549 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2550 return false;
2551 break;
2553 case ISD::FrameIndex:
2554 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2555 AM.Base_Reg.getNode() == nullptr &&
2556 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
2557 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2558 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2559 return false;
2561 break;
2563 case ISD::SHL:
2564 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2565 break;
2567 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2568 unsigned Val = CN->getZExtValue();
2569 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2570 // that the base operand remains free for further matching. If
2571 // the base doesn't end up getting used, a post-processing step
2572 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2573 if (Val == 1 || Val == 2 || Val == 3) {
2574 SDValue ShVal = N.getOperand(0);
2575 AM.Scale = 1 << Val;
2576 AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2577 return false;
2580 break;
2582 case ISD::SRL: {
2583 // Scale must not be used already.
2584 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2586 // We only handle up to 64-bit values here as those are what matter for
2587 // addressing mode optimizations.
2588 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2589 "Unexpected value size!");
2591 SDValue And = N.getOperand(0);
2592 if (And.getOpcode() != ISD::AND) break;
2593 SDValue X = And.getOperand(0);
2595 // The mask used for the transform is expected to be post-shift, but we
2596 // found the shift first so just apply the shift to the mask before passing
2597 // it down.
2598 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2599 !isa<ConstantSDNode>(And.getOperand(1)))
2600 break;
2601 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2603 // Try to fold the mask and shift into the scale, and return false if we
2604 // succeed.
2605 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2606 return false;
2607 break;
2610 case ISD::SMUL_LOHI:
2611 case ISD::UMUL_LOHI:
2612 // A mul_lohi where we need the low part can be folded as a plain multiply.
2613 if (N.getResNo() != 0) break;
2614 [[fallthrough]];
2615 case ISD::MUL:
2616 case X86ISD::MUL_IMM:
2617 // X*[3,5,9] -> X+X*[2,4,8]
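// For example (hypothetical register), X*5 can be selected as
// lea (%rdi,%rdi,4), i.e. base = X, index = X, scale = 4.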
2618 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2619 AM.Base_Reg.getNode() == nullptr &&
2620 AM.IndexReg.getNode() == nullptr) {
2621 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2622 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2623 CN->getZExtValue() == 9) {
2624 AM.Scale = unsigned(CN->getZExtValue())-1;
2626 SDValue MulVal = N.getOperand(0);
2627 SDValue Reg;
2629 // Okay, we know that we have a scale by now. However, if the scaled
2630 // value is an add of something and a constant, we can fold the
2631 // constant into the disp field here.
2632 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2633 isa<ConstantSDNode>(MulVal.getOperand(1))) {
2634 Reg = MulVal.getOperand(0);
2635 auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2636 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2637 if (foldOffsetIntoAddress(Disp, AM))
2638 Reg = N.getOperand(0);
2639 } else {
2640 Reg = N.getOperand(0);
2643 AM.IndexReg = AM.Base_Reg = Reg;
2644 return false;
2647 break;
2649 case ISD::SUB: {
2650 // Given A-B, if A can be completely folded into the address, leaving the
2651 // index field unused, use -B as the index.
2652 // This is a win if A has multiple parts that can be folded into
2653 // the address. Also, this saves a mov if the base register has
2654 // other uses, since it avoids a two-address sub instruction, however
2655 // it costs an additional mov if the index register has other uses.
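// Sketch (hypothetical registers): for "p - i" where p folds entirely into
// base/displacement, the address can be formed as
//   negq %rcx                       ; i
//   leaq disp(%rbase,%rcx), %rax
// at the cost of the extra neg, which the accounting below weighs against
// the folds this enables.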
2657 // Add an artificial use to this node so that we can keep track of
2658 // it if it gets CSE'd with a different node.
2659 HandleSDNode Handle(N);
2661 // Test if the LHS of the sub can be folded.
2662 X86ISelAddressMode Backup = AM;
2663 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2664 N = Handle.getValue();
2665 AM = Backup;
2666 break;
2668 N = Handle.getValue();
2669 // Test if the index field is free for use.
2670 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2671 AM = Backup;
2672 break;
2675 int Cost = 0;
2676 SDValue RHS = N.getOperand(1);
2677 // If the RHS involves a register with multiple uses, this
2678 // transformation incurs an extra mov, due to the neg instruction
2679 // clobbering its operand.
2680 if (!RHS.getNode()->hasOneUse() ||
2681 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2682 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2683 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2684 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2685 RHS.getOperand(0).getValueType() == MVT::i32))
2686 ++Cost;
2687 // If the base is a register with multiple uses, this
2688 // transformation may save a mov.
2689 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2690 !AM.Base_Reg.getNode()->hasOneUse()) ||
2691 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2692 --Cost;
2693 // If the folded LHS was interesting, this transformation saves
2694 // address arithmetic.
2695 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2696 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2697 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2698 --Cost;
2699 // If it doesn't look like it may be an overall win, don't do it.
2700 if (Cost >= 0) {
2701 AM = Backup;
2702 break;
2705 // Ok, the transformation is legal and appears profitable. Go for it.
2706 // Negation will be emitted later to avoid creating dangling nodes if this
2707 // was an unprofitable LEA.
2708 AM.IndexReg = RHS;
2709 AM.NegateIndex = true;
2710 AM.Scale = 1;
2711 return false;
2714 case ISD::OR:
2715 case ISD::XOR:
2716 // See if we can treat the OR/XOR node as an ADD node.
2717 if (!CurDAG->isADDLike(N))
2718 break;
2719 [[fallthrough]];
2720 case ISD::ADD:
2721 if (!matchAdd(N, AM, Depth))
2722 return false;
2723 break;
2725 case ISD::AND: {
2726 // Perform some heroic transforms on an and of a constant-count shift
2727 // with a constant to enable use of the scaled offset field.
2729 // Scale must not be used already.
2730 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2732 // We only handle up to 64-bit values here as those are what matter for
2733 // addressing mode optimizations.
2734 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2735 "Unexpected value size!");
2737 if (!isa<ConstantSDNode>(N.getOperand(1)))
2738 break;
2740 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2741 SDValue Shift = N.getOperand(0);
2742 SDValue X = Shift.getOperand(0);
2744 uint64_t Mask = N.getConstantOperandVal(1);
2746 // Try to fold the mask and shift into an extract and scale.
2747 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2748 return false;
2750 // Try to fold the mask and shift directly into the scale.
2751 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2752 return false;
2754 // Try to fold the mask and shift into BEXTR and scale.
2755 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2756 return false;
2759 // Try to swap the mask and shift to place shifts which can be done as
2760 // a scale on the outside of the mask.
2761 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2762 return false;
2764 break;
2766 case ISD::ZERO_EXTEND: {
2767 // Try to widen a zexted shift left to the same size as its use, so we can
2768 // match the shift as a scale factor.
2769 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2770 break;
2772 SDValue Src = N.getOperand(0);
2774 // See if we can match a zext(addlike(x,c)).
2775 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2776 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2777 if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2778 if (Index != N) {
2779 AM.IndexReg = Index;
2780 return false;
2783 // Peek through mask: zext(and(shl(x,c1),c2))
2784 APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2785 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2786 if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2787 Mask = MaskC->getAPIntValue();
2788 Src = Src.getOperand(0);
2791 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
2792 // Give up if the shift is not a valid scale factor [1,2,3].
2793 SDValue ShlSrc = Src.getOperand(0);
2794 SDValue ShlAmt = Src.getOperand(1);
2795 auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2796 if (!ShAmtC)
2797 break;
2798 unsigned ShAmtV = ShAmtC->getZExtValue();
2799 if (ShAmtV > 3)
2800 break;
2802 // The narrow shift must only shift out zero bits (it must be 'nuw').
2803 // That makes it safe to widen to the destination type.
2804 APInt HighZeros =
2805 APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2806 if (!Src->getFlags().hasNoUnsignedWrap() &&
2807 !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2808 break;
2810 // zext (shl nuw i8 %x, C1) to i32
2811 // --> shl (zext i8 %x to i32), (zext C1)
2812 // zext (and (shl nuw i8 %x, C1), C2) to i32
2813 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2814 MVT SrcVT = ShlSrc.getSimpleValueType();
2815 MVT VT = N.getSimpleValueType();
2816 SDLoc DL(N);
2818 SDValue Res = ShlSrc;
2819 if (!Mask.isAllOnes()) {
2820 Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2821 insertDAGNode(*CurDAG, N, Res);
2822 Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2823 insertDAGNode(*CurDAG, N, Res);
2825 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2826 insertDAGNode(*CurDAG, N, Zext);
2827 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2828 insertDAGNode(*CurDAG, N, NewShl);
2829 CurDAG->ReplaceAllUsesWith(N, NewShl);
2830 CurDAG->RemoveDeadNode(N.getNode());
2832 // Convert the shift to scale factor.
2833 AM.Scale = 1 << ShAmtV;
2834 // If matchIndexRecursively were not called here, Zext could be replaced by
2835 // other nodes while the stale value is later passed to a builder method, so
2836 // re-match it to pick up the current index.
2837 AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2838 return false;
2841 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2842 // Try to fold the mask and shift into an extract and scale.
2843 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2844 Src.getOperand(0), AM))
2845 return false;
2847 // Try to fold the mask and shift directly into the scale.
2848 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2849 Src.getOperand(0), AM))
2850 return false;
2852 // Try to fold the mask and shift into BEXTR and scale.
2853 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2854 Src.getOperand(0), AM, *Subtarget))
2855 return false;
2858 break;
2862 return matchAddressBase(N, AM);
2865 /// Helper for MatchAddress. Add the specified node to the
2866 /// specified addressing mode without any further recursion.
2867 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2868 // Is the base register already occupied?
2869 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2870 // If so, check to see if the scale index register is set.
2871 if (!AM.IndexReg.getNode()) {
2872 AM.IndexReg = N;
2873 AM.Scale = 1;
2874 return false;
2877 // Otherwise, we cannot select it.
2878 return true;
2881 // Default, generate it as a register.
2882 AM.BaseType = X86ISelAddressMode::RegBase;
2883 AM.Base_Reg = N;
2884 return false;
2887 bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2888 X86ISelAddressMode &AM,
2889 unsigned Depth) {
2890 SDLoc dl(N);
2891 LLVM_DEBUG({
2892 dbgs() << "MatchVectorAddress: ";
2893 AM.dump(CurDAG);
2895 // Limit recursion.
2896 if (Depth >= SelectionDAG::MaxRecursionDepth)
2897 return matchAddressBase(N, AM);
2899 // TODO: Support other operations.
2900 switch (N.getOpcode()) {
2901 case ISD::Constant: {
2902 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2903 if (!foldOffsetIntoAddress(Val, AM))
2904 return false;
2905 break;
2907 case X86ISD::Wrapper:
2908 if (!matchWrapper(N, AM))
2909 return false;
2910 break;
2911 case ISD::ADD: {
2912 // Add an artificial use to this node so that we can keep track of
2913 // it if it gets CSE'd with a different node.
2914 HandleSDNode Handle(N);
2916 X86ISelAddressMode Backup = AM;
2917 if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2918 !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2919 Depth + 1))
2920 return false;
2921 AM = Backup;
2923 // Try again after commuting the operands.
2924 if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2925 Depth + 1) &&
2926 !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2927 Depth + 1))
2928 return false;
2929 AM = Backup;
2931 N = Handle.getValue();
2932 break;
2936 return matchAddressBase(N, AM);
2939 /// Helper for selectVectorAddr. Handles things that can be folded into a
2940 /// gather/scatter address. The index register and scale should have already
2941 /// been handled.
2942 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2943 return matchVectorAddressRecursively(N, AM, 0);
2946 bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2947 SDValue IndexOp, SDValue ScaleOp,
2948 SDValue &Base, SDValue &Scale,
2949 SDValue &Index, SDValue &Disp,
2950 SDValue &Segment) {
2951 X86ISelAddressMode AM;
2952 AM.Scale = ScaleOp->getAsZExtVal();
2954 // Attempt to match index patterns, as long as we're not relying on implicit
2955 // sign-extension, which is performed BEFORE scale.
2956 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2957 AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2958 else
2959 AM.IndexReg = IndexOp;
2961 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2962 if (AddrSpace == X86AS::GS)
2963 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2964 if (AddrSpace == X86AS::FS)
2965 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2966 if (AddrSpace == X86AS::SS)
2967 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2969 SDLoc DL(BasePtr);
2970 MVT VT = BasePtr.getSimpleValueType();
2972 // Try to match into the base and displacement fields.
2973 if (matchVectorAddress(BasePtr, AM))
2974 return false;
2976 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2977 return true;
2980 /// Returns true if it is able to pattern match an addressing mode.
2981 /// It returns the operands which make up the maximal addressing mode it can
2982 /// match by reference.
2984 /// Parent is the parent node of the addr operand that is being matched. It
2985 /// is always a load, store, atomic node, or null. It is only null when
2986 /// checking memory operands for inline asm nodes.
2987 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2988 SDValue &Scale, SDValue &Index,
2989 SDValue &Disp, SDValue &Segment) {
2990 X86ISelAddressMode AM;
2992 if (Parent &&
2993 // This list of opcodes are all the nodes that have an "addr:$ptr" operand
2994 // that are not a MemSDNode, and thus don't have proper addrspace info.
2995 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2996 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2997 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2998 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2999 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
3000 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
3001 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
3002 unsigned AddrSpace =
3003 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
3004 if (AddrSpace == X86AS::GS)
3005 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
3006 if (AddrSpace == X86AS::FS)
3007 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
3008 if (AddrSpace == X86AS::SS)
3009 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
3012 // Save the DL and VT before calling matchAddress, it can invalidate N.
3013 SDLoc DL(N);
3014 MVT VT = N.getSimpleValueType();
3016 if (matchAddress(N, AM))
3017 return false;
3019 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3020 return true;
3023 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3024 // Cannot use 32-bit constants to reference objects in kernel/large code
3025 // model.
3026 if (TM.getCodeModel() == CodeModel::Kernel ||
3027 TM.getCodeModel() == CodeModel::Large)
3028 return false;
3030 // In static codegen with small code model, we can get the address of a label
3031 // into a register with 'movl'
3032 if (N->getOpcode() != X86ISD::Wrapper)
3033 return false;
3035 N = N.getOperand(0);
3037 // At least GNU as does not accept 'movl' for TPOFF relocations.
3038 // FIXME: We could use 'movl' when we know we are targeting MC.
3039 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3040 return false;
3042 Imm = N;
3043 // Small/medium code model can reference non-TargetGlobalAddress objects with
3044 // 32-bit constants.
3045 if (N->getOpcode() != ISD::TargetGlobalAddress) {
3046 return TM.getCodeModel() == CodeModel::Small ||
3047 TM.getCodeModel() == CodeModel::Medium;
3050 const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
3051 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3052 return CR->getUnsignedMax().ult(1ull << 32);
3054 return !TM.isLargeGlobalValue(GV);
3057 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
3058 SDValue &Scale, SDValue &Index,
3059 SDValue &Disp, SDValue &Segment) {
3060 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3061 SDLoc DL(N);
3063 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3064 return false;
3066 auto *RN = dyn_cast<RegisterSDNode>(Base);
3067 if (RN && RN->getReg() == 0)
3068 Base = CurDAG->getRegister(0, MVT::i64);
3069 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
3070 // Base could already be %rip, particularly in the x32 ABI.
3071 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3072 MVT::i64), 0);
3073 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3074 Base);
3077 RN = dyn_cast<RegisterSDNode>(Index);
3078 if (RN && RN->getReg() == 0)
3079 Index = CurDAG->getRegister(0, MVT::i64);
3080 else {
3081 assert(Index.getValueType() == MVT::i32 &&
3082 "Expect to be extending 32-bit registers for use in LEA");
3083 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3084 MVT::i64), 0);
3085 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3086 Index);
3089 return true;
3092 /// Calls SelectAddr and determines if the maximal addressing
3093 /// mode it matches can be cost effectively emitted as an LEA instruction.
3094 bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3095 SDValue &Base, SDValue &Scale,
3096 SDValue &Index, SDValue &Disp,
3097 SDValue &Segment) {
3098 X86ISelAddressMode AM;
3100 // Save the DL and VT before calling matchAddress, it can invalidate N.
3101 SDLoc DL(N);
3102 MVT VT = N.getSimpleValueType();
3104 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3105 // segments.
3106 SDValue Copy = AM.Segment;
3107 SDValue T = CurDAG->getRegister(0, MVT::i32);
3108 AM.Segment = T;
3109 if (matchAddress(N, AM))
3110 return false;
3111 assert (T == AM.Segment);
3112 AM.Segment = Copy;
3114 unsigned Complexity = 0;
3115 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3116 Complexity = 1;
3117 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3118 Complexity = 4;
3120 if (AM.IndexReg.getNode())
3121 Complexity++;
3123 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or to
3124 // use a simple shift.
3125 if (AM.Scale > 1)
3126 Complexity++;
3128 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3129 // to a LEA. This is determined with some experimentation but is by no means
3130 // optimal (especially for code size consideration). LEA is nice because of
3131 // its three-address nature. Tweak the cost function again when we can run
3132 // convertToThreeAddress() at register allocation time.
3133 if (AM.hasSymbolicDisplacement()) {
3134 // For X86-64, always use LEA to materialize RIP-relative addresses.
3135 if (Subtarget->is64Bit())
3136 Complexity = 4;
3137 else
3138 Complexity += 2;
3141 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3142 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3143 // duplicating flag-producing instructions later in the pipeline.
3144 if (N.getOpcode() == ISD::ADD) {
3145 auto isMathWithFlags = [](SDValue V) {
3146 switch (V.getOpcode()) {
3147 case X86ISD::ADD:
3148 case X86ISD::SUB:
3149 case X86ISD::ADC:
3150 case X86ISD::SBB:
3151 case X86ISD::SMUL:
3152 case X86ISD::UMUL:
3153 /* TODO: These opcodes can be added safely, but we may want to justify
3154 their inclusion for different reasons (better for reg-alloc).
3155 case X86ISD::OR:
3156 case X86ISD::XOR:
3157 case X86ISD::AND:
3159 // Value 1 is the flag output of the node - verify it's not dead.
3160 return !SDValue(V.getNode(), 1).use_empty();
3161 default:
3162 return false;
3165 // TODO: We might want to factor in whether there's a load folding
3166 // opportunity for the math op that disappears with LEA.
3167 if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3168 Complexity++;
3171 if (AM.Disp)
3172 Complexity++;
3174 // If it isn't worth using an LEA, reject it.
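// For example, a plain [base] or [base + constant disp] address scores at
// most 2 and is rejected, while [base + index*scale + disp] scores at least 3
// and a frame-index base alone already scores 4, so those are kept as LEAs.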
3175 if (Complexity <= 2)
3176 return false;
3178 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3179 return true;
3182 /// This is only run on TargetGlobalTLSAddress nodes.
3183 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3184 SDValue &Scale, SDValue &Index,
3185 SDValue &Disp, SDValue &Segment) {
3186 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3187 N.getOpcode() == ISD::TargetExternalSymbol);
3189 X86ISelAddressMode AM;
3190 if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3191 AM.GV = GA->getGlobal();
3192 AM.Disp += GA->getOffset();
3193 AM.SymbolFlags = GA->getTargetFlags();
3194 } else {
3195 auto *SA = cast<ExternalSymbolSDNode>(N);
3196 AM.ES = SA->getSymbol();
3197 AM.SymbolFlags = SA->getTargetFlags();
3200 if (Subtarget->is32Bit()) {
3201 AM.Scale = 1;
3202 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3205 MVT VT = N.getSimpleValueType();
3206 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3207 return true;
3210 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3211 // Keep track of the original value type and whether this value was
3212 // truncated. If we see a truncation from pointer type to VT that truncates
3213 // bits that are known to be zero, we can use a narrow reference.
3214 EVT VT = N.getValueType();
3215 bool WasTruncated = false;
3216 if (N.getOpcode() == ISD::TRUNCATE) {
3217 WasTruncated = true;
3218 N = N.getOperand(0);
3221 if (N.getOpcode() != X86ISD::Wrapper)
3222 return false;
3224 // We can only use non-GlobalValues as immediates if they were not truncated,
3225 // as we do not have any range information. If we have a GlobalValue and the
3226 // address was not truncated, we can select it as an operand directly.
3227 unsigned Opc = N.getOperand(0)->getOpcode();
3228 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3229 Op = N.getOperand(0);
3230 // We can only select the operand directly if we didn't have to look past a
3231 // truncate.
3232 return !WasTruncated;
3235 // Check that the global's range fits into VT.
3236 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3237 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3238 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3239 return false;
3241 // Okay, we can use a narrow reference.
3242 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3243 GA->getOffset(), GA->getTargetFlags());
3244 return true;
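// Example (sketch): a global annotated with absolute_symbol metadata covering
// [0, 256) has an unsigned max of 255, so a truncate of its address to i8
// passes the range check above and is re-emitted as a narrow target global
// address operand.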
3247 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3248 SDValue &Base, SDValue &Scale,
3249 SDValue &Index, SDValue &Disp,
3250 SDValue &Segment) {
3251 assert(Root && P && "Unknown root/parent nodes");
3252 if (!ISD::isNON_EXTLoad(N.getNode()) ||
3253 !IsProfitableToFold(N, P, Root) ||
3254 !IsLegalToFold(N, P, Root, OptLevel))
3255 return false;
3257 return selectAddr(N.getNode(),
3258 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3261 bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3262 SDValue &Base, SDValue &Scale,
3263 SDValue &Index, SDValue &Disp,
3264 SDValue &Segment) {
3265 assert(Root && P && "Unknown root/parent nodes");
3266 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3267 !IsProfitableToFold(N, P, Root) ||
3268 !IsLegalToFold(N, P, Root, OptLevel))
3269 return false;
3271 return selectAddr(N.getNode(),
3272 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3275 /// Return an SDNode that returns the value of the global base register.
3276 /// Output instructions required to initialize the global base register,
3277 /// if necessary.
3278 SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3279 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3280 auto &DL = MF->getDataLayout();
3281 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3284 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3285 if (N->getOpcode() == ISD::TRUNCATE)
3286 N = N->getOperand(0).getNode();
3287 if (N->getOpcode() != X86ISD::Wrapper)
3288 return false;
3290 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3291 if (!GA)
3292 return false;
3294 auto *GV = GA->getGlobal();
3295 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3296 if (CR)
3297 return CR->getSignedMin().sge(-1ull << Width) &&
3298 CR->getSignedMax().slt(1ull << Width);
3299 // In the kernel code model, globals are in the negative 2GB of the address
3300 // space, so globals can be a sign extended 32-bit immediate.
3301 // In other code models, small globals are in the low 2GB of the address
3302 // space, so sign extending them is equivalent to zero extending them.
3303 return Width == 32 && !TM.isLargeGlobalValue(GV);
3306 X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3307 assert(N->isMachineOpcode() && "Unexpected node");
3308 unsigned Opc = N->getMachineOpcode();
3309 const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3310 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3311 if (CondNo < 0)
3312 return X86::COND_INVALID;
3314 return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3317 /// Test whether the given X86ISD::CMP node has any users that use a flag
3318 /// other than ZF; returns true if there are no such users.
3319 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3320 // Examine each user of the node.
3321 for (SDUse &Use : Flags->uses()) {
3322 // Only check things that use the flags.
3323 if (Use.getResNo() != Flags.getResNo())
3324 continue;
3325 SDNode *User = Use.getUser();
3326 // Only examine CopyToReg uses that copy to EFLAGS.
3327 if (User->getOpcode() != ISD::CopyToReg ||
3328 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3329 return false;
3330 // Examine each user of the CopyToReg use.
3331 for (SDUse &FlagUse : User->uses()) {
3332 // Only examine the Flag result.
3333 if (FlagUse.getResNo() != 1)
3334 continue;
3335 // Anything unusual: assume conservatively.
3336 if (!FlagUse.getUser()->isMachineOpcode())
3337 return false;
3338 // Examine the condition code of the user.
3339 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3341 switch (CC) {
3342 // Comparisons which only use the zero flag.
3343 case X86::COND_E: case X86::COND_NE:
3344 continue;
3345 // Anything else: assume conservatively.
3346 default:
3347 return false;
3351 return true;
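// For example, users reached through a CopyToReg to EFLAGS that test COND_E
// or COND_NE read only ZF and are accepted above, while a user testing
// COND_L (SF != OF) hits the conservative default and makes this return
// false.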
3354 /// Test whether the given X86ISD::CMP node has any uses which require the SF
3355 /// flag to be accurate.
3356 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3357 // Examine each user of the node.
3358 for (SDUse &Use : Flags->uses()) {
3359 // Only check things that use the flags.
3360 if (Use.getResNo() != Flags.getResNo())
3361 continue;
3362 SDNode *User = Use.getUser();
3363 // Only examine CopyToReg uses that copy to EFLAGS.
3364 if (User->getOpcode() != ISD::CopyToReg ||
3365 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3366 return false;
3367 // Examine each user of the CopyToReg use.
3368 for (SDUse &FlagUse : User->uses()) {
3369 // Only examine the Flag result.
3370 if (FlagUse.getResNo() != 1)
3371 continue;
3372 // Anything unusual: assume conservatively.
3373 if (!FlagUse.getUser()->isMachineOpcode())
3374 return false;
3375 // Examine the condition code of the user.
3376 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3378 switch (CC) {
3379 // Comparisons which don't examine the SF flag.
3380 case X86::COND_A: case X86::COND_AE:
3381 case X86::COND_B: case X86::COND_BE:
3382 case X86::COND_E: case X86::COND_NE:
3383 case X86::COND_O: case X86::COND_NO:
3384 case X86::COND_P: case X86::COND_NP:
3385 continue;
3386 // Anything else: assume conservatively.
3387 default:
3388 return false;
3392 return true;
3395 static bool mayUseCarryFlag(X86::CondCode CC) {
3396 switch (CC) {
3397 // Comparisons which don't examine the CF flag.
3398 case X86::COND_O: case X86::COND_NO:
3399 case X86::COND_E: case X86::COND_NE:
3400 case X86::COND_S: case X86::COND_NS:
3401 case X86::COND_P: case X86::COND_NP:
3402 case X86::COND_L: case X86::COND_GE:
3403 case X86::COND_G: case X86::COND_LE:
3404 return false;
3405 // Anything else: assume conservatively.
3406 default:
3407 return true;
3411 /// Test whether the given node which sets flags has any uses which require the
3412 /// CF flag to be accurate.
3413 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3414 // Examine each user of the node.
3415 for (SDUse &Use : Flags->uses()) {
3416 // Only check things that use the flags.
3417 if (Use.getResNo() != Flags.getResNo())
3418 continue;
3420 SDNode *User = Use.getUser();
3421 unsigned UserOpc = User->getOpcode();
3423 if (UserOpc == ISD::CopyToReg) {
3424 // Only examine CopyToReg uses that copy to EFLAGS.
3425 if (cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3426 return false;
3427 // Examine each user of the CopyToReg use.
3428 for (SDUse &FlagUse : User->uses()) {
3429 // Only examine the Flag result.
3430 if (FlagUse.getResNo() != 1)
3431 continue;
3432 // Anything unusual: assume conservatively.
3433 if (!FlagUse.getUser()->isMachineOpcode())
3434 return false;
3435 // Examine the condition code of the user.
3436 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3438 if (mayUseCarryFlag(CC))
3439 return false;
3442 // This CopyToReg is ok. Move on to the next user.
3443 continue;
3446 // This might be an unselected node. So look for the pre-isel opcodes that
3447 // use flags.
3448 unsigned CCOpNo;
3449 switch (UserOpc) {
3450 default:
3451 // Something unusual. Be conservative.
3452 return false;
3453 case X86ISD::SETCC: CCOpNo = 0; break;
3454 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3455 case X86ISD::CMOV: CCOpNo = 2; break;
3456 case X86ISD::BRCOND: CCOpNo = 2; break;
3459 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
3460 if (mayUseCarryFlag(CC))
3461 return false;
3463 return true;
3466 /// Check whether or not the chain ending in StoreNode is suitable for doing
3467 /// the {load; op; store} to memory-operand (read-modify-write) transformation.
3468 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3469 SDValue StoredVal, SelectionDAG *CurDAG,
3470 unsigned LoadOpNo,
3471 LoadSDNode *&LoadNode,
3472 SDValue &InputChain) {
3473 // Is the stored value result 0 of the operation?
3474 if (StoredVal.getResNo() != 0) return false;
3476 // Are there other uses of the operation other than the store?
3477 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3479 // Is the store non-extending and non-indexed?
3480 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3481 return false;
3483 SDValue Load = StoredVal->getOperand(LoadOpNo);
3484 // Is the stored value a non-extending and non-indexed load?
3485 if (!ISD::isNormalLoad(Load.getNode())) return false;
3487 // Return LoadNode by reference.
3488 LoadNode = cast<LoadSDNode>(Load);
3490 // Is the store the only read of the loaded value?
3491 if (!Load.hasOneUse())
3492 return false;
3494 // Is the address of the store the same as the load?
3495 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3496 LoadNode->getOffset() != StoreNode->getOffset())
3497 return false;
3499 bool FoundLoad = false;
3500 SmallVector<SDValue, 4> ChainOps;
3501 SmallVector<const SDNode *, 4> LoopWorklist;
3502 SmallPtrSet<const SDNode *, 16> Visited;
3503 const unsigned int Max = 1024;
3505 // Visualization of Load-Op-Store fusion:
3506 // -------------------------
3507 // Legend:
3508 // *-lines = Chain operand dependencies.
3509 // |-lines = Normal operand dependencies.
3510 // Dependencies flow down and right. n-suffix references multiple nodes.
3512 // C Xn C
3513 // * * *
3514 // * * *
3515 // Xn A-LD Yn TF Yn
3516 // * * \ | * |
3517 // * * \ | * |
3518 // * * \ | => A--LD_OP_ST
3519 // * * \| \
3520 // TF OP \
3521 // * | \ Zn
3522 // * | \
3523 // A-ST Zn
3526 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
3527 // #2: Yn -> LD
3528 // #3: ST -> Zn
3530 // Ensure the transform is safe by checking for the dual
3531 // dependencies to make sure we do not induce a loop.
3533 // As LD is a predecessor to both OP and ST we can do this by checking:
3534 // a). if LD is a predecessor to a member of Xn or Yn.
3535 // b). if a Zn is a predecessor to ST.
3537 // However, (b) can only occur through being a chain predecessor to
3538 // ST, which is the same as Zn being a member or predecessor of Xn,
3539 // which is a subset of LD being a predecessor of Xn. So it's
3540 // subsumed by check (a).
3542 SDValue Chain = StoreNode->getChain();
3544 // Gather X elements in ChainOps.
3545 if (Chain == Load.getValue(1)) {
3546 FoundLoad = true;
3547 ChainOps.push_back(Load.getOperand(0));
3548 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3549 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3550 SDValue Op = Chain.getOperand(i);
3551 if (Op == Load.getValue(1)) {
3552 FoundLoad = true;
3553 // Drop Load, but keep its chain. No cycle check necessary.
3554 ChainOps.push_back(Load.getOperand(0));
3555 continue;
3557 LoopWorklist.push_back(Op.getNode());
3558 ChainOps.push_back(Op);
3562 if (!FoundLoad)
3563 return false;
3565 // Worklist is currently Xn. Add Yn to worklist.
3566 for (SDValue Op : StoredVal->ops())
3567 if (Op.getNode() != LoadNode)
3568 LoopWorklist.push_back(Op.getNode());
3570 // Check (a) if Load is a predecessor to Xn + Yn
3571 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3572 true))
3573 return false;
3575 InputChain =
3576 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3577 return true;
3580 // Change a chain of {load; op; store} of the same value into a simple op
3581 // through memory of that value, if the uses of the modified value and its
3582 // address are suitable.
3584 // The tablegen memory-operand pattern is currently not able to match the case
3585 // where the EFLAGS produced by the original operation are used.
3587 // To move this to tablegen, we'll need to improve tablegen to allow flags to
3588 // be transferred from a node in the pattern to the result node, probably with
3589 // a new keyword. For example, we have this
3590 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3591 // [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
3592 // but maybe need something like this
3593 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3594 // [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
3595 // (transferrable EFLAGS)]>;
3597 // Until then, we manually fold these and instruction select the operation
3598 // here.
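// As a simplified illustration, a DAG of the form
//   t = load [addr]; u = X86ISD::ADD t, c; store u, [addr]
// is turned into a single memory-operand instruction such as ADD32mi (or
// INC32m/DEC32m for c == 1/-1 with an unused carry flag), provided the chain
// and flag checks below succeed.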
3599 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3600 auto *StoreNode = cast<StoreSDNode>(Node);
3601 SDValue StoredVal = StoreNode->getOperand(1);
3602 unsigned Opc = StoredVal->getOpcode();
3604 // Before we try to select anything, make sure this is a memory operand size
3605 // and opcode we can handle. Note that this must match the code below that
3606 // actually lowers the opcodes.
3607 EVT MemVT = StoreNode->getMemoryVT();
3608 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3609 MemVT != MVT::i8)
3610 return false;
3612 bool IsCommutable = false;
3613 bool IsNegate = false;
3614 switch (Opc) {
3615 default:
3616 return false;
3617 case X86ISD::SUB:
3618 IsNegate = isNullConstant(StoredVal.getOperand(0));
3619 break;
3620 case X86ISD::SBB:
3621 break;
3622 case X86ISD::ADD:
3623 case X86ISD::ADC:
3624 case X86ISD::AND:
3625 case X86ISD::OR:
3626 case X86ISD::XOR:
3627 IsCommutable = true;
3628 break;
3631 unsigned LoadOpNo = IsNegate ? 1 : 0;
3632 LoadSDNode *LoadNode = nullptr;
3633 SDValue InputChain;
3634 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3635 LoadNode, InputChain)) {
3636 if (!IsCommutable)
3637 return false;
3639 // This operation is commutable, try the other operand.
3640 LoadOpNo = 1;
3641 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3642 LoadNode, InputChain))
3643 return false;
3646 SDValue Base, Scale, Index, Disp, Segment;
3647 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3648 Segment))
3649 return false;
3651 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3652 unsigned Opc8) {
3653 switch (MemVT.getSimpleVT().SimpleTy) {
3654 case MVT::i64:
3655 return Opc64;
3656 case MVT::i32:
3657 return Opc32;
3658 case MVT::i16:
3659 return Opc16;
3660 case MVT::i8:
3661 return Opc8;
3662 default:
3663 llvm_unreachable("Invalid size!");
3667 MachineSDNode *Result;
3668 switch (Opc) {
3669 case X86ISD::SUB:
3670 // Handle negate.
3671 if (IsNegate) {
3672 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3673 X86::NEG8m);
3674 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3675 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3676 MVT::Other, Ops);
3677 break;
3679 [[fallthrough]];
3680 case X86ISD::ADD:
3681 // Try to match inc/dec.
3682 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3683 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3684 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3685 // An ADD/SUB with 1/-1 whose carry flag is unused can be selected as INC/DEC.
3686 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3687 unsigned NewOpc =
3688 ((Opc == X86ISD::ADD) == IsOne)
3689 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3690 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3691 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3692 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3693 MVT::Other, Ops);
3694 break;
3697 [[fallthrough]];
3698 case X86ISD::ADC:
3699 case X86ISD::SBB:
3700 case X86ISD::AND:
3701 case X86ISD::OR:
3702 case X86ISD::XOR: {
3703 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3704 switch (Opc) {
3705 case X86ISD::ADD:
3706 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3707 X86::ADD8mr);
3708 case X86ISD::ADC:
3709 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3710 X86::ADC8mr);
3711 case X86ISD::SUB:
3712 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3713 X86::SUB8mr);
3714 case X86ISD::SBB:
3715 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3716 X86::SBB8mr);
3717 case X86ISD::AND:
3718 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3719 X86::AND8mr);
3720 case X86ISD::OR:
3721 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3722 case X86ISD::XOR:
3723 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3724 X86::XOR8mr);
3725 default:
3726 llvm_unreachable("Invalid opcode!");
3729 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3730 switch (Opc) {
3731 case X86ISD::ADD:
3732 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3733 X86::ADD8mi);
3734 case X86ISD::ADC:
3735 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3736 X86::ADC8mi);
3737 case X86ISD::SUB:
3738 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3739 X86::SUB8mi);
3740 case X86ISD::SBB:
3741 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3742 X86::SBB8mi);
3743 case X86ISD::AND:
3744 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3745 X86::AND8mi);
3746 case X86ISD::OR:
3747 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3748 X86::OR8mi);
3749 case X86ISD::XOR:
3750 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3751 X86::XOR8mi);
3752 default:
3753 llvm_unreachable("Invalid opcode!");
3757 unsigned NewOpc = SelectRegOpcode(Opc);
3758 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3760 // See if the operand is a constant that we can fold into an immediate
3761 // operand.
3762 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3763 int64_t OperandV = OperandC->getSExtValue();
3765 // Check if we can shrink the operand enough to fit in an immediate (or
3766 // fit into a smaller immediate) by negating it and switching the
3767 // operation.
3768 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3769 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3770 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3771 isInt<32>(-OperandV))) &&
3772 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3773 OperandV = -OperandV;
3774 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3777 if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3778 Operand = CurDAG->getSignedTargetConstant(OperandV, SDLoc(Node), MemVT);
3779 NewOpc = SelectImmOpcode(Opc);
3783 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3784 SDValue CopyTo =
3785 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3786 StoredVal.getOperand(2), SDValue());
3788 const SDValue Ops[] = {Base, Scale, Index, Disp,
3789 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3790 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3791 Ops);
3792 } else {
3793 const SDValue Ops[] = {Base, Scale, Index, Disp,
3794 Segment, Operand, InputChain};
3795 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3796 Ops);
3798 break;
3800 default:
3801 llvm_unreachable("Invalid opcode!");
3804 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3805 LoadNode->getMemOperand()};
3806 CurDAG->setNodeMemRefs(Result, MemOps);
3808 // Update Load Chain uses as well.
3809 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3810 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3811 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3812 CurDAG->RemoveDeadNode(Node);
3813 return true;
3816 // See if this is an X & Mask that we can match to BEXTR/BZHI.
3817 // Where Mask is one of the following patterns:
3818 // a) x & (1 << nbits) - 1
3819 // b) x & ~(-1 << nbits)
3820 // c) x & (-1 >> (32 - y))
3821 // d) x << (32 - y) >> (32 - y)
3822 // e) (1 << nbits) - 1
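// Rough intuition for the patterns above: each form keeps only the low
// 'nbits' bits of x, which is exactly what BZHI computes (and what BEXTR
// computes with a start position of zero). E.g. with nbits == 5,
// (1 << 5) - 1 == 0x1f, so "x & 0x1f" keeps the low five bits.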
3823 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3824 assert(
3825 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3826 Node->getOpcode() == ISD::SRL) &&
3827 "Should be either an and-mask, or right-shift after clearing high bits.");
3829 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3830 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3831 return false;
3833 MVT NVT = Node->getSimpleValueType(0);
3835 // Only supported for 32 and 64 bits.
3836 if (NVT != MVT::i32 && NVT != MVT::i64)
3837 return false;
3839 SDValue NBits;
3840 bool NegateNBits;
3842 // If we have BMI2's BZHI, we are ok with multi-use patterns.
3843 // Else, if we only have BMI1's BEXTR, we require one-use.
3844 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3845 auto checkUses = [AllowExtraUsesByDefault](
3846 SDValue Op, unsigned NUses,
3847 std::optional<bool> AllowExtraUses) {
3848 return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3849 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3851 auto checkOneUse = [checkUses](SDValue Op,
3852 std::optional<bool> AllowExtraUses =
3853 std::nullopt) {
3854 return checkUses(Op, 1, AllowExtraUses);
3856 auto checkTwoUse = [checkUses](SDValue Op,
3857 std::optional<bool> AllowExtraUses =
3858 std::nullopt) {
3859 return checkUses(Op, 2, AllowExtraUses);
3862 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3863 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3864 assert(V.getSimpleValueType() == MVT::i32 &&
3865 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3866 "Expected i64 -> i32 truncation");
3867 V = V.getOperand(0);
3869 return V;
3872 // a) x & ((1 << nbits) + (-1))
3873 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3874 &NegateNBits](SDValue Mask) -> bool {
3875 // Match `add`. Must only have one use!
3876 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3877 return false;
3878 // We should be adding an all-ones constant (i.e. subtracting one).
3879 if (!isAllOnesConstant(Mask->getOperand(1)))
3880 return false;
3881 // Match `1 << nbits`. Might be truncated. Must only have one use!
3882 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3883 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3884 return false;
3885 if (!isOneConstant(M0->getOperand(0)))
3886 return false;
3887 NBits = M0->getOperand(1);
3888 NegateNBits = false;
3889 return true;
3892 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3893 V = peekThroughOneUseTruncation(V);
3894 return CurDAG->MaskedValueIsAllOnes(
3895 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3896 NVT.getSizeInBits()));
3899 // b) x & ~(-1 << nbits)
3900 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3901 &NBits, &NegateNBits](SDValue Mask) -> bool {
3902 // Match `~()`. Must only have one use!
3903 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3904 return false;
3905 // The -1 only has to be all-ones for the final Node's NVT.
3906 if (!isAllOnes(Mask->getOperand(1)))
3907 return false;
3908 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3909 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3910 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3911 return false;
3912 // The -1 only has to be all-ones for the final Node's NVT.
3913 if (!isAllOnes(M0->getOperand(0)))
3914 return false;
3915 NBits = M0->getOperand(1);
3916 NegateNBits = false;
3917 return true;
3920 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3921 // or leave the shift amount as-is, but then we'll have to negate it.
3922 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3923 unsigned Bitwidth) {
3924 NBits = ShiftAmt;
3925 NegateNBits = true;
3926 // Skip over a truncate of the shift amount, if any.
3927 if (NBits.getOpcode() == ISD::TRUNCATE)
3928 NBits = NBits.getOperand(0);
3929 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3930 // If it doesn't match, that's fine, we'll just negate it ourselves.
3931 if (NBits.getOpcode() != ISD::SUB)
3932 return;
3933 auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
3934 if (!V0 || V0->getZExtValue() != Bitwidth)
3935 return;
3936 NBits = NBits.getOperand(1);
3937 NegateNBits = false;
3940 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3941 // or
3942 // c) x & (-1 >> (32 - y))
3943 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3944 canonicalizeShiftAmt](SDValue Mask) -> bool {
3945 // The mask itself may be truncated.
3946 Mask = peekThroughOneUseTruncation(Mask);
3947 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3948 // Match `l>>`. Must only have one use!
3949 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3950 return false;
3951 // We should be shifting a truly all-ones constant.
3952 if (!isAllOnesConstant(Mask.getOperand(0)))
3953 return false;
3954 SDValue M1 = Mask.getOperand(1);
3955 // The shift amount should not be used externally.
3956 if (!checkOneUse(M1))
3957 return false;
3958 canonicalizeShiftAmt(M1, Bitwidth);
3959 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3960 // is no extra use of the mask. Clearly, there was one since we are here.
3961 // But at the same time, if we need to negate the shift amount,
3962 // then we don't want the mask to stick around, else it's unprofitable.
3963 return !NegateNBits;
3966 SDValue X;
3968 // d) x << z >> z but then we'll have to subtract z from bitwidth
3969 // or
3970 // d) x << (32 - y) >> (32 - y)
3971 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3972 AllowExtraUsesByDefault, &NegateNBits,
3973 &X](SDNode *Node) -> bool {
3974 if (Node->getOpcode() != ISD::SRL)
3975 return false;
3976 SDValue N0 = Node->getOperand(0);
3977 if (N0->getOpcode() != ISD::SHL)
3978 return false;
3979 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3980 SDValue N1 = Node->getOperand(1);
3981 SDValue N01 = N0->getOperand(1);
3982 // Both of the shifts must be by the exact same value.
3983 if (N1 != N01)
3984 return false;
3985 canonicalizeShiftAmt(N1, Bitwidth);
3986 // There should not be any external uses of the inner shift / shift amount.
3987 // Note that while we are generally okay with external uses given BMI2,
3988 // iff we need to negate the shift amount, we are not okay with extra uses.
3989 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3990 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
3991 return false;
3992 X = N0->getOperand(0);
3993 return true;
3996 auto matchLowBitMask = [matchPatternA, matchPatternB,
3997 matchPatternC](SDValue Mask) -> bool {
3998 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
4001 if (Node->getOpcode() == ISD::AND) {
4002 X = Node->getOperand(0);
4003 SDValue Mask = Node->getOperand(1);
4005 if (matchLowBitMask(Mask)) {
4006 // Great.
4007 } else {
4008 std::swap(X, Mask);
4009 if (!matchLowBitMask(Mask))
4010 return false;
4012 } else if (matchLowBitMask(SDValue(Node, 0))) {
4013 X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
4014 } else if (!matchPatternD(Node))
4015 return false;
4017 // If we need to negate the shift amount, require BMI2 BZHI support.
4018 // It's just too unprofitable for BMI1 BEXTR.
4019 if (NegateNBits && !Subtarget->hasBMI2())
4020 return false;
4022 SDLoc DL(Node);
4024 // Truncate the shift amount.
4025 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
4026 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4028 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
4029 // All the other bits are undefined, we do not care about them.
4030 SDValue ImplDef = SDValue(
4031 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
4032 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
4034 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
4035 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
4036 NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
4037 MVT::i32, ImplDef, NBits, SRIdxVal),
4039 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4041 // We might have matched the number of high bits to be cleared,
4042 // but we want the number of low bits to be kept, so negate it in that case.
4043 if (NegateNBits) {
4044 SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
4045 insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
4047 NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
4048 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4051 if (Subtarget->hasBMI2()) {
4052 // Great, just emit the BZHI..
4053 if (NVT != MVT::i32) {
4054 // But have to place the bit count into the wide-enough register first.
4055 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
4056 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4059 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
4060 ReplaceNode(Node, Extract.getNode());
4061 SelectCode(Extract.getNode());
4062 return true;
4065 // Else, if we do *NOT* have BMI2, let's find out if 'X' is *logically*
4066 // shifted (potentially with a one-use trunc in between), and whether the
4067 // truncation was the only use of the shift; if so, look past the one-use
4068 // truncation.
4070 SDValue RealX = peekThroughOneUseTruncation(X);
4071 // FIXME: only if the shift is one-use?
4072 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4073 X = RealX;
4076 MVT XVT = X.getSimpleValueType();
4078 // Else, emitting BEXTR requires one more step.
4079 // The 'control' of BEXTR has the pattern of:
4080 // [15...8 bit][ 7...0 bit] location
4081 // [ bit count][ shift] name
4082 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4084 // Shift NBits left by 8 bits, thus producing 'control'.
4085 // This leaves the low 8 bits zero.
4086 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
4087 insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
4088 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
4089 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4091 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4092 // FIXME: only if the shift is one-use?
4093 if (X.getOpcode() == ISD::SRL) {
4094 SDValue ShiftAmt = X.getOperand(1);
4095 X = X.getOperand(0);
4097 assert(ShiftAmt.getValueType() == MVT::i8 &&
4098 "Expected shift amount to be i8");
4100 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4101 // We could zext to i16 in some form, but we intentionally don't do that.
4102 SDValue OrigShiftAmt = ShiftAmt;
4103 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4104 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4106 // And now 'or' these low 8 bits of shift amount into the 'control'.
4107 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4108 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4111 // But have to place the 'control' into the wide-enough register first.
4112 if (XVT != MVT::i32) {
4113 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4114 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4117 // And finally, form the BEXTR itself.
4118 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4120 // The 'X' was originally truncated. Do that now.
4121 if (XVT != NVT) {
4122 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4123 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4126 ReplaceNode(Node, Extract.getNode());
4127 SelectCode(Extract.getNode());
4129 return true;
4132 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
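// For instance, (x >> 4) & 0xfff has Shift == 4 and MaskSize == 12, giving a
// BEXTR control value of (12 << 8) | 4 == 0xc04: start at bit 4 and extract
// 12 bits.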
4133 MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4134 MVT NVT = Node->getSimpleValueType(0);
4135 SDLoc dl(Node);
4137 SDValue N0 = Node->getOperand(0);
4138 SDValue N1 = Node->getOperand(1);
4140 // If we have TBM we can use an immediate for the control. If we have BMI
4141 // we should only do this if the BEXTR instruction is implemented well.
4142 // Otherwise moving the control into a register makes this more costly.
4143 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4144 // hoisting the move immediate would make it worthwhile with a less optimal
4145 // BEXTR?
4146 bool PreferBEXTR =
4147 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4148 if (!PreferBEXTR && !Subtarget->hasBMI2())
4149 return nullptr;
4151 // Must have a shift right.
4152 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4153 return nullptr;
4155 // Shift can't have additional users.
4156 if (!N0->hasOneUse())
4157 return nullptr;
4159 // Only supported for 32 and 64 bits.
4160 if (NVT != MVT::i32 && NVT != MVT::i64)
4161 return nullptr;
4163 // Shift amount and RHS of and must be constant.
4164 auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4165 auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4166 if (!MaskCst || !ShiftCst)
4167 return nullptr;
4169 // And RHS must be a mask.
4170 uint64_t Mask = MaskCst->getZExtValue();
4171 if (!isMask_64(Mask))
4172 return nullptr;
4174 uint64_t Shift = ShiftCst->getZExtValue();
4175 uint64_t MaskSize = llvm::popcount(Mask);
4177 // Don't interfere with something that can be handled by extracting AH.
4178 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4179 if (Shift == 8 && MaskSize == 8)
4180 return nullptr;
4182 // Make sure we are only using bits that were in the original value, not
4183 // shifted in.
4184 if (Shift + MaskSize > NVT.getSizeInBits())
4185 return nullptr;
4187 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4188 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4189 // does not fit into 32 bits. Load folding is not a sufficient reason.
4190 if (!PreferBEXTR && MaskSize <= 32)
4191 return nullptr;
4193 SDValue Control;
4194 unsigned ROpc, MOpc;
4196 #define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4197 if (!PreferBEXTR) {
4198 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4199 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4200 // Let's perform the mask first and apply the shift later. Note that we need
4201 // to widen the mask to account for the fact that we'll shift afterwards!
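// E.g. for (x >> 4) & 0xfff with only BZHI available: keep the low
// Shift + MaskSize == 16 bits first, then perform the >> 4 with the plain
// shift emitted further below.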
4202 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4203 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4204 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4205 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4206 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4207 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4208 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4209 } else {
4210 // The 'control' of BEXTR has the pattern of:
4211 // [15...8 bit][ 7...0 bit] location
4212 // [ bit count][ shift] name
4213 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4214 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
4215 if (Subtarget->hasTBM()) {
4216 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4217 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4218 } else {
4219 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4220 // BMI requires the immediate to be placed in a register.
4221 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4222 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4223 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4224 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4225 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4226 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4230 MachineSDNode *NewNode;
4231 SDValue Input = N0->getOperand(0);
4232 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4233 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4234 SDValue Ops[] = {
4235 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4236 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4237 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4238 // Update the chain.
4239 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4240 // Record the mem-refs
4241 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4242 } else {
4243 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4246 if (!PreferBEXTR) {
4247 // We still need to apply the shift.
4248 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4249 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4250 : GET_ND_IF_ENABLED(X86::SHR32ri);
4251 NewNode =
4252 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4255 return NewNode;
4258 // Emit a PCMPISTR(I/M) instruction.
4259 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4260 bool MayFoldLoad, const SDLoc &dl,
4261 MVT VT, SDNode *Node) {
4262 SDValue N0 = Node->getOperand(0);
4263 SDValue N1 = Node->getOperand(1);
4264 SDValue Imm = Node->getOperand(2);
4265 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4266 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4268 // Try to fold a load. No need to check alignment.
4269 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4270 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4271 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4272 N1.getOperand(0) };
4273 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4274 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4275 // Update the chain.
4276 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4277 // Record the mem-refs
4278 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4279 return CNode;
4282 SDValue Ops[] = { N0, N1, Imm };
4283 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4284 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4285 return CNode;
4288 // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4289 // to emit a second instruction after this one. This is needed since we have two
4290 // copyToReg nodes glued before this and we need to continue that glue through.
4291 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4292 bool MayFoldLoad, const SDLoc &dl,
4293 MVT VT, SDNode *Node,
4294 SDValue &InGlue) {
4295 SDValue N0 = Node->getOperand(0);
4296 SDValue N2 = Node->getOperand(2);
4297 SDValue Imm = Node->getOperand(4);
4298 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4299 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4301 // Try to fold a load. No need to check alignment.
4302 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4303 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4304 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4305 N2.getOperand(0), InGlue };
4306 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4307 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4308 InGlue = SDValue(CNode, 3);
4309 // Update the chain.
4310 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4311 // Record the mem-refs
4312 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4313 return CNode;
4316 SDValue Ops[] = { N0, N2, Imm, InGlue };
4317 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4318 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4319 InGlue = SDValue(CNode, 2);
4320 return CNode;
4323 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4324 EVT VT = N->getValueType(0);
4326 // Only handle scalar shifts.
4327 if (VT.isVector())
4328 return false;
4330 // Narrower shifts only mask to 5 bits in hardware.
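// Illustrative: hardware shifts use the amount modulo Size, e.g. a 32-bit
// shift reads only the low 5 bits of CL, so shifting by (x + 32) is the same
// as shifting by x. The rewrites below rely on this to drop the ADD/SUB/XOR
// or turn it into a NOT/NEG.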
4331 unsigned Size = VT == MVT::i64 ? 64 : 32;
4333 SDValue OrigShiftAmt = N->getOperand(1);
4334 SDValue ShiftAmt = OrigShiftAmt;
4335 SDLoc DL(N);
4337 // Skip over a truncate of the shift amount.
4338 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4339 ShiftAmt = ShiftAmt->getOperand(0);
4341 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4342 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4344 SDValue NewShiftAmt;
4345 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4346 ShiftAmt->getOpcode() == ISD::XOR) {
4347 SDValue Add0 = ShiftAmt->getOperand(0);
4348 SDValue Add1 = ShiftAmt->getOperand(1);
4349 auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4350 auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4351 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4352 // to avoid the ADD/SUB/XOR.
4353 if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4354 NewShiftAmt = Add0;
4356 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4357 ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4358 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4359 // If we are doing a NOT of just the lower bits via (Size*N-1) -/^ X,
4360 // we can replace the SUB/XOR with a NOT. In the XOR case it may save some
4361 // code size; in the SUB case it may also save a move.
4362 assert(Add0C == nullptr || Add1C == nullptr);
4364 // We can only do N-X, not X-N
4365 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4366 return false;
4368 EVT OpVT = ShiftAmt.getValueType();
4370 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4371 NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4372 Add0C == nullptr ? Add0 : Add1, AllOnes);
4373 insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4374 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4375 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4376 // -X to generate a NEG instead of a SUB of a constant.
4377 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4378 Add0C->getZExtValue() != 0) {
4379 EVT SubVT = ShiftAmt.getValueType();
4380 SDValue X;
4381 if (Add0C->getZExtValue() % Size == 0)
4382 X = Add1;
4383 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4384 Add0C->getZExtValue() % 32 == 0) {
4385 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4386 // This is mainly beneficial if we already compute (x+n*32).
4387 if (Add1.getOpcode() == ISD::TRUNCATE) {
4388 Add1 = Add1.getOperand(0);
4389 SubVT = Add1.getValueType();
4391 if (Add0.getValueType() != SubVT) {
4392 Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4393 insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4396 X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4397 insertDAGNode(*CurDAG, OrigShiftAmt, X);
4398 } else
4399 return false;
4400 // Insert a negate op.
4401 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4402 // that uses it that's not a shift.
4403 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4404 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4405 NewShiftAmt = Neg;
4407 // Insert these operands into a valid topological order so they can
4408 // get selected independently.
4409 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4410 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4411 } else
4412 return false;
4413 } else
4414 return false;
4416 if (NewShiftAmt.getValueType() != MVT::i8) {
4417 // Need to truncate the shift amount.
4418 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4419 // Add to a correct topological ordering.
4420 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4423 // Insert a new mask to keep the shift amount legal. This should be removed
4424 // by isel patterns.
4425 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4426 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4427 // Place in a correct topological ordering.
4428 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4430 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4431 NewShiftAmt);
4432 if (UpdatedNode != N) {
4433 // If we found an existing node, we should replace ourselves with that node
4434 // and wait for it to be selected after its other users.
4435 ReplaceNode(N, UpdatedNode);
4436 return true;
4439 // If the original shift amount is now dead, delete it so that we don't run
4440 // it through isel.
4441 if (OrigShiftAmt.getNode()->use_empty())
4442 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4444 // Now that we've optimized the shift amount, defer to normal isel to get
4445 // load folding and legacy vs BMI2 selection without repeating it here.
4446 SelectCode(N);
4447 return true;
4450 bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4451 MVT NVT = N->getSimpleValueType(0);
4452 unsigned Opcode = N->getOpcode();
4453 SDLoc dl(N);
4455 // For operations of the form (x << C1) op C2, check if we can use a smaller
4456 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
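// E.g. (x << 8) & 0x7f00 can be rewritten as (x & 0x7f) << 8: 0x7f fits in a
// sign-extended 8-bit immediate while 0x7f00 does not, so the reordered form
// encodes smaller.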
4457 SDValue Shift = N->getOperand(0);
4458 SDValue N1 = N->getOperand(1);
4460 auto *Cst = dyn_cast<ConstantSDNode>(N1);
4461 if (!Cst)
4462 return false;
4464 int64_t Val = Cst->getSExtValue();
4466 // If we have an any_extend feeding the AND, look through it to see if there
4467 // is a shift behind it. But only if the AND doesn't use the extended bits.
4468 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4469 bool FoundAnyExtend = false;
4470 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4471 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4472 isUInt<32>(Val)) {
4473 FoundAnyExtend = true;
4474 Shift = Shift.getOperand(0);
4477 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4478 return false;
4480 // i8 is unshrinkable, i16 should be promoted to i32.
4481 if (NVT != MVT::i32 && NVT != MVT::i64)
4482 return false;
4484 auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4485 if (!ShlCst)
4486 return false;
4488 uint64_t ShAmt = ShlCst->getZExtValue();
4490 // Make sure that we don't change the operation by removing bits.
4491 // This only matters for OR and XOR, AND is unaffected.
4492 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4493 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4494 return false;
4496 // Check the minimum bitwidth for the new constant.
4497 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4498 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4499 if (Opcode == ISD::AND) {
4500 // AND32ri is the same as AND64ri32 with zext imm.
4501 // Try this before sign extended immediates below.
4502 ShiftedVal = (uint64_t)Val >> ShAmt;
4503 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4504 return true;
4505 // Also swap order when the AND can become MOVZX.
4506 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4507 return true;
4509 ShiftedVal = Val >> ShAmt;
4510 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4511 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4512 return true;
4513 if (Opcode != ISD::AND) {
4514 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4515 ShiftedVal = (uint64_t)Val >> ShAmt;
4516 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4517 return true;
4519 return false;
4522 int64_t ShiftedVal;
4523 if (!CanShrinkImmediate(ShiftedVal))
4524 return false;
4526 // Ok, we can reorder to get a smaller immediate.
4528 // But it's possible the original immediate allowed an AND to become MOVZX.
4529 // Do this check late so that the MaskedValueIsZero call is delayed as long
4530 // as possible.
4531 if (Opcode == ISD::AND) {
4532 // Find the smallest zext this could possibly be.
4533 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4534 ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4536 // Figure out which bits need to be zero to achieve that mask.
4537 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4538 ZExtWidth);
4539 NeededMask &= ~Cst->getAPIntValue();
4541 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4542 return false;
4545 SDValue X = Shift.getOperand(0);
4546 if (FoundAnyExtend) {
4547 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4548 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4549 X = NewX;
4552 SDValue NewCst = CurDAG->getSignedConstant(ShiftedVal, dl, NVT);
4553 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4554 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4555 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4556 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4557 Shift.getOperand(1));
4558 ReplaceNode(N, NewSHL.getNode());
4559 SelectCode(NewSHL.getNode());
4560 return true;
4563 bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4564 SDNode *ParentB, SDNode *ParentC,
4565 SDValue A, SDValue B, SDValue C,
4566 uint8_t Imm) {
4567 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4568 C.isOperandOf(ParentC) && "Incorrect parent node");
4570 auto tryFoldLoadOrBCast =
4571 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4572 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4573 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4574 return true;
4576 // Not a load, check for broadcast which may be behind a bitcast.
4577 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4578 P = L.getNode();
4579 L = L.getOperand(0);
4582 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4583 return false;
4585 // Only 32 and 64 bit broadcasts are supported.
4586 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4587 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4588 if (Size != 32 && Size != 64)
4589 return false;
4591 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4594 bool FoldedLoad = false;
4595 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4596 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4597 FoldedLoad = true;
4598 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4599 Tmp4)) {
4600 FoldedLoad = true;
4601 std::swap(A, C);
4602 // Swap bits 1/4 and 3/6.
4603 uint8_t OldImm = Imm;
4604 Imm = OldImm & 0xa5;
4605 if (OldImm & 0x02) Imm |= 0x10;
4606 if (OldImm & 0x10) Imm |= 0x02;
4607 if (OldImm & 0x08) Imm |= 0x40;
4608 if (OldImm & 0x40) Imm |= 0x08;
4609 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4610 Tmp4)) {
4611 FoldedLoad = true;
4612 std::swap(B, C);
4613 // Swap bits 1/2 and 5/6.
4614 uint8_t OldImm = Imm;
4615 Imm = OldImm & 0x99;
4616 if (OldImm & 0x02) Imm |= 0x04;
4617 if (OldImm & 0x04) Imm |= 0x02;
4618 if (OldImm & 0x20) Imm |= 0x40;
4619 if (OldImm & 0x40) Imm |= 0x20;
4622 SDLoc DL(Root);
4624 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4626 MVT NVT = Root->getSimpleValueType(0);
4628 MachineSDNode *MNode;
4629 if (FoldedLoad) {
4630 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4632 unsigned Opc;
4633 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4634 auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4635 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4636 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4638 bool UseD = EltSize == 32;
4639 if (NVT.is128BitVector())
4640 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4641 else if (NVT.is256BitVector())
4642 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4643 else if (NVT.is512BitVector())
4644 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4645 else
4646 llvm_unreachable("Unexpected vector size!");
4647 } else {
4648 bool UseD = NVT.getVectorElementType() == MVT::i32;
4649 if (NVT.is128BitVector())
4650 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4651 else if (NVT.is256BitVector())
4652 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4653 else if (NVT.is512BitVector())
4654 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4655 else
4656 llvm_unreachable("Unexpected vector size!");
4659 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4660 MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4662 // Update the chain.
4663 ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4664 // Record the mem-refs
4665 CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4666 } else {
4667 bool UseD = NVT.getVectorElementType() == MVT::i32;
4668 unsigned Opc;
4669 if (NVT.is128BitVector())
4670 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4671 else if (NVT.is256BitVector())
4672 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4673 else if (NVT.is512BitVector())
4674 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4675 else
4676 llvm_unreachable("Unexpected vector size!");
4678 MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4681 ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4682 CurDAG->RemoveDeadNode(Root);
4683 return true;
4686 // Try to match two logic ops to a VPTERNLOG.
4687 // FIXME: Handle more complex patterns that use an operand more than once?
4688 bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4689 MVT NVT = N->getSimpleValueType(0);
4691 // Make sure we support VPTERNLOG.
4692 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4693 NVT.getVectorElementType() == MVT::i1)
4694 return false;
4696 // We need VLX for 128/256-bit.
4697 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4698 return false;
4700 SDValue N0 = N->getOperand(0);
4701 SDValue N1 = N->getOperand(1);
4703 auto getFoldableLogicOp = [](SDValue Op) {
4704 // Peek through single use bitcast.
4705 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4706 Op = Op.getOperand(0);
4708 if (!Op.hasOneUse())
4709 return SDValue();
4711 unsigned Opc = Op.getOpcode();
4712 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4713 Opc == X86ISD::ANDNP)
4714 return Op;
4716 return SDValue();
4719 SDValue A, FoldableOp;
4720 if ((FoldableOp = getFoldableLogicOp(N1))) {
4721 A = N0;
4722 } else if ((FoldableOp = getFoldableLogicOp(N0))) {
4723 A = N1;
4724 } else
4725 return false;
4727 SDValue B = FoldableOp.getOperand(0);
4728 SDValue C = FoldableOp.getOperand(1);
4729 SDNode *ParentA = N;
4730 SDNode *ParentB = FoldableOp.getNode();
4731 SDNode *ParentC = FoldableOp.getNode();
4733 // We can build the appropriate control immediate by performing the logic
4734 // operation we're matching using these constants for A, B, and C.
4735 uint8_t TernlogMagicA = 0xf0;
4736 uint8_t TernlogMagicB = 0xcc;
4737 uint8_t TernlogMagicC = 0xaa;
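  // For illustration (worked example, not part of the matching logic): matching
  // (or A, (and B, C)) evaluates that same expression on the magic constants,
  // giving Imm = (0xcc & 0xaa) | 0xf0 = 0x88 | 0xf0 = 0xf8, which is the
  // VPTERNLOG truth table for A | (B & C).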
4739 // Some of the inputs may be inverted, peek through them and invert the
4740 // magic values accordingly.
4741 // TODO: There may be a bitcast before the xor that we should peek through.
4742 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4743 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4744 ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4745 Magic = ~Magic;
4746 Parent = Op.getNode();
4747 Op = Op.getOperand(0);
4751 PeekThroughNot(A, ParentA, TernlogMagicA);
4752 PeekThroughNot(B, ParentB, TernlogMagicB);
4753 PeekThroughNot(C, ParentC, TernlogMagicC);
4755 uint8_t Imm;
4756 switch (FoldableOp.getOpcode()) {
4757 default: llvm_unreachable("Unexpected opcode!");
4758 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4759 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4760 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4761 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4764 switch (N->getOpcode()) {
4765 default: llvm_unreachable("Unexpected opcode!");
4766 case X86ISD::ANDNP:
4767 if (A == N0)
4768 Imm &= ~TernlogMagicA;
4769 else
4770 Imm = ~(Imm) & TernlogMagicA;
4771 break;
4772 case ISD::AND: Imm &= TernlogMagicA; break;
4773 case ISD::OR: Imm |= TernlogMagicA; break;
4774 case ISD::XOR: Imm ^= TernlogMagicA; break;
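  // For illustration: for (andnp A, (or B, C)) the inner OR gives
  // Imm = 0xcc | 0xaa = 0xee, and since A is operand 0 of the ANDNP the final
  // immediate becomes 0xee & ~0xf0 = 0x0e, the table for ~A & (B | C).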
4777 return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
4780 /// If the high bits of an 'and' operand are known zero, try setting the
4781 /// high bits of an 'and' constant operand to produce a smaller encoding by
4782 /// creating a small, sign-extended negative immediate rather than a large
4783 /// positive one. This reverses a transform in SimplifyDemandedBits that
4784 /// shrinks mask constants by clearing bits. There is also a possibility that
4785 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4786 /// case, just replace the 'and'. Return 'true' if the node is replaced.
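/// For example (illustrative): if the mask is 0x1fffff8 widened from a shrunken
/// 0x1ffffff8 and the top three bits of the other operand are known zero, the
/// mask can be widened to 0xfffffff8 (-8), which fits in a sign-extended 8-bit
/// immediate instead of a 32-bit one.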
4787 bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4788 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4789 // have immediate operands.
4790 MVT VT = And->getSimpleValueType(0);
4791 if (VT != MVT::i32 && VT != MVT::i64)
4792 return false;
4794 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4795 if (!And1C)
4796 return false;
4799 // Bail out if the mask constant is already negative. It can't shrink any more.
4799 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4800 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4801 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4802 // are negative too.
4803 APInt MaskVal = And1C->getAPIntValue();
4804 unsigned MaskLZ = MaskVal.countl_zero();
4805 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4806 return false;
4808 // Don't extend into the upper 32 bits of a 64 bit mask.
4809 if (VT == MVT::i64 && MaskLZ >= 32) {
4810 MaskLZ -= 32;
4811 MaskVal = MaskVal.trunc(32);
4814 SDValue And0 = And->getOperand(0);
4815 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4816 APInt NegMaskVal = MaskVal | HighZeros;
4818 // If a negative constant would not allow a smaller encoding, there's no need
4819 // to continue. Only change the constant when we know it's a win.
4820 unsigned MinWidth = NegMaskVal.getSignificantBits();
4821 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4822 return false;
4824 // Extend masks if we truncated above.
4825 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4826 NegMaskVal = NegMaskVal.zext(64);
4827 HighZeros = HighZeros.zext(64);
4830 // The variable operand must be all zeros in the top bits to allow using the
4831 // new, negative constant as the mask.
4832 // TODO: Handle constant folding?
4833 KnownBits Known0 = CurDAG->computeKnownBits(And0);
4834 if (Known0.isConstant() || !HighZeros.isSubsetOf(Known0.Zero))
4835 return false;
4837 // Check if the mask is -1. In that case, this is an unnecessary instruction
4838 // that escaped earlier analysis.
4839 if (NegMaskVal.isAllOnes()) {
4840 ReplaceNode(And, And0.getNode());
4841 return true;
4844 // A negative mask allows a smaller encoding. Create a new 'and' node.
4845 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4846 insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
4847 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4848 ReplaceNode(And, NewAnd.getNode());
4849 SelectCode(NewAnd.getNode());
4850 return true;
4853 static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4854 bool FoldedBCast, bool Masked) {
4855 #define VPTESTM_CASE(VT, SUFFIX) \
4856 case MVT::VT: \
4857 if (Masked) \
4858 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4859 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4862 #define VPTESTM_BROADCAST_CASES(SUFFIX) \
4863 default: llvm_unreachable("Unexpected VT!"); \
4864 VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4865 VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4866 VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4867 VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4868 VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4869 VPTESTM_CASE(v8i64, QZ##SUFFIX)
4871 #define VPTESTM_FULL_CASES(SUFFIX) \
4872 VPTESTM_BROADCAST_CASES(SUFFIX) \
4873 VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4874 VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4875 VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4876 VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4877 VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4878 VPTESTM_CASE(v32i16, WZ##SUFFIX)
4880 if (FoldedBCast) {
4881 switch (TestVT.SimpleTy) {
4882 VPTESTM_BROADCAST_CASES(rmb)
4886 if (FoldedLoad) {
4887 switch (TestVT.SimpleTy) {
4888 VPTESTM_FULL_CASES(rm)
4892 switch (TestVT.SimpleTy) {
4893 VPTESTM_FULL_CASES(rr)
4896 #undef VPTESTM_FULL_CASES
4897 #undef VPTESTM_BROADCAST_CASES
4898 #undef VPTESTM_CASE
4901 // Try to create VPTESTM instruction. If InMask is not null, it will be used
4902 // to form a masked operation.
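// For illustration: (setcc (and X, Y), 0, ne) on v16i32 inputs should select to
// VPTESTMDZrr, producing a k-mask with a bit set for each element where
// (X & Y) is nonzero; a SETEQ condition picks the VPTESTNM form instead.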
4903 bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4904 SDValue InMask) {
4905 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4906 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4907 "Unexpected VT!");
4909 // Look for equal and not equal compares.
4910 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4911 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4912 return false;
4914 SDValue SetccOp0 = Setcc.getOperand(0);
4915 SDValue SetccOp1 = Setcc.getOperand(1);
4917 // Canonicalize the all zero vector to the RHS.
4918 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4919 std::swap(SetccOp0, SetccOp1);
4921 // See if we're comparing against zero.
4922 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4923 return false;
4925 SDValue N0 = SetccOp0;
4927 MVT CmpVT = N0.getSimpleValueType();
4928 MVT CmpSVT = CmpVT.getVectorElementType();
4930 // Start with both operands the same. We'll try to refine this.
4931 SDValue Src0 = N0;
4932 SDValue Src1 = N0;
4935 // Look through single use bitcasts.
4936 SDValue N0Temp = N0;
4937 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4938 N0Temp = N0.getOperand(0);
4940 // Look for single use AND.
4941 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4942 Src0 = N0Temp.getOperand(0);
4943 Src1 = N0Temp.getOperand(1);
4947 // Without VLX we need to widen the operation.
4948 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
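  // For illustration: without VLX a v4i32 compare is widened here to v16i32
  // (Scale = 4) by inserting the operands into an IMPLICIT_DEF via sub_xmm, the
  // test is issued as a 512-bit instruction, and the resulting v16i1 mask is
  // shrunk back to v4i1 with COPY_TO_REGCLASS further down.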
4950 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4951 SDValue &Base, SDValue &Scale, SDValue &Index,
4952 SDValue &Disp, SDValue &Segment) {
4953 // If we need to widen, we can't fold the load.
4954 if (!Widen)
4955 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4956 return true;
4958 // If we didn't fold a load, try to match broadcast. No widening limitation
4959 // for this. But only 32 and 64 bit types are supported.
4960 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
4961 return false;
4963 // Look through single use bitcasts.
4964 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4965 P = L.getNode();
4966 L = L.getOperand(0);
4969 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4970 return false;
4972 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4973 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
4974 return false;
4976 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4979 // We can only fold loads if the sources are unique.
4980 bool CanFoldLoads = Src0 != Src1;
4982 bool FoldedLoad = false;
4983 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4984 if (CanFoldLoads) {
4985 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
4986 Tmp3, Tmp4);
4987 if (!FoldedLoad) {
4988 // And is commutative.
4989 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
4990 Tmp2, Tmp3, Tmp4);
4991 if (FoldedLoad)
4992 std::swap(Src0, Src1);
4996 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
4998 bool IsMasked = InMask.getNode() != nullptr;
5000 SDLoc dl(Root);
5002 MVT ResVT = Setcc.getSimpleValueType();
5003 MVT MaskVT = ResVT;
5004 if (Widen) {
5005 // Widen the inputs using insert_subreg or copy_to_regclass.
5006 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
5007 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
5008 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
5009 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
5010 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
5011 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
5012 CmpVT), 0);
5013 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
5015 if (!FoldedBCast)
5016 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
5018 if (IsMasked) {
5019 // Widen the mask.
5020 unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
5021 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5022 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5023 dl, MaskVT, InMask, RC), 0);
5027 bool IsTestN = CC == ISD::SETEQ;
5028 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
5029 IsMasked);
5031 MachineSDNode *CNode;
5032 if (FoldedLoad) {
5033 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
5035 if (IsMasked) {
5036 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5037 Src1.getOperand(0) };
5038 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5039 } else {
5040 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5041 Src1.getOperand(0) };
5042 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5045 // Update the chain.
5046 ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
5047 // Record the mem-refs
5048 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
5049 } else {
5050 if (IsMasked)
5051 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
5052 else
5053 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
5056 // If we widened, we need to shrink the mask VT.
5057 if (Widen) {
5058 unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
5059 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5060 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5061 dl, ResVT, SDValue(CNode, 0), RC);
5064 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
5065 CurDAG->RemoveDeadNode(Root);
5066 return true;
5069 // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5070 // into vpternlog.
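// For illustration: evaluating the bitselect on the ternlog magic constants
// gives (0xf0 & 0xcc) | (~0xf0 & 0xaa) = 0xc0 | 0x0a = 0xca, which is where the
// 0xCA immediate used below comes from.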
5071 bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5072 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5074 MVT NVT = N->getSimpleValueType(0);
5076 // Make sure we support VPTERNLOG.
5077 if (!NVT.isVector() || !Subtarget->hasAVX512())
5078 return false;
5080 // We need VLX for 128/256-bit.
5081 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5082 return false;
5084 SDValue N0 = N->getOperand(0);
5085 SDValue N1 = N->getOperand(1);
5087 // Canonicalize AND to LHS.
5088 if (N1.getOpcode() == ISD::AND)
5089 std::swap(N0, N1);
5091 if (N0.getOpcode() != ISD::AND ||
5092 N1.getOpcode() != X86ISD::ANDNP ||
5093 !N0.hasOneUse() || !N1.hasOneUse())
5094 return false;
5096 // ANDN is not commutable, so use it to pin down A and C.
5097 SDValue A = N1.getOperand(0);
5098 SDValue C = N1.getOperand(1);
5100 // AND is commutable, if one operand matches A, the other operand is B.
5101 // Otherwise this isn't a match.
5102 SDValue B;
5103 if (N0.getOperand(0) == A)
5104 B = N0.getOperand(1);
5105 else if (N0.getOperand(1) == A)
5106 B = N0.getOperand(0);
5107 else
5108 return false;
5110 SDLoc dl(N);
5111 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5112 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5113 ReplaceNode(N, Ternlog.getNode());
5115 return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5116 Ternlog.getNode(), A, B, C, 0xCA);
5119 void X86DAGToDAGISel::Select(SDNode *Node) {
5120 MVT NVT = Node->getSimpleValueType(0);
5121 unsigned Opcode = Node->getOpcode();
5122 SDLoc dl(Node);
5124 if (Node->isMachineOpcode()) {
5125 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5126 Node->setNodeId(-1);
5127 return; // Already selected.
5130 switch (Opcode) {
5131 default: break;
5132 case ISD::INTRINSIC_W_CHAIN: {
5133 unsigned IntNo = Node->getConstantOperandVal(1);
5134 switch (IntNo) {
5135 default: break;
5136 case Intrinsic::x86_encodekey128:
5137 case Intrinsic::x86_encodekey256: {
5138 if (!Subtarget->hasKL())
5139 break;
5141 unsigned Opcode;
5142 switch (IntNo) {
5143 default: llvm_unreachable("Impossible intrinsic");
5144 case Intrinsic::x86_encodekey128:
5145 Opcode = X86::ENCODEKEY128;
5146 break;
5147 case Intrinsic::x86_encodekey256:
5148 Opcode = X86::ENCODEKEY256;
5149 break;
5152 SDValue Chain = Node->getOperand(0);
5153 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5154 SDValue());
5155 if (Opcode == X86::ENCODEKEY256)
5156 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5157 Chain.getValue(1));
5159 MachineSDNode *Res = CurDAG->getMachineNode(
5160 Opcode, dl, Node->getVTList(),
5161 {Node->getOperand(2), Chain, Chain.getValue(1)});
5162 ReplaceNode(Node, Res);
5163 return;
5165 case Intrinsic::x86_tileloaddrs64_internal:
5166 case Intrinsic::x86_tileloaddrst164_internal:
5167 if (!Subtarget->hasAMXMOVRS())
5168 break;
5169 [[fallthrough]];
5170 case Intrinsic::x86_tileloadd64_internal:
5171 case Intrinsic::x86_tileloaddt164_internal: {
5172 if (!Subtarget->hasAMXTILE())
5173 break;
5174 auto *MFI =
5175 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5176 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5177 unsigned Opc;
5178 switch (IntNo) {
5179 default:
5180 llvm_unreachable("Unexpected intrinsic!");
5181 case Intrinsic::x86_tileloaddrs64_internal:
5182 Opc = X86::PTILELOADDRSV;
5183 break;
5184 case Intrinsic::x86_tileloaddrst164_internal:
5185 Opc = X86::PTILELOADDRST1V;
5186 break;
5187 case Intrinsic::x86_tileloadd64_internal:
5188 Opc = X86::PTILELOADDV;
5189 break;
5190 case Intrinsic::x86_tileloaddt164_internal:
5191 Opc = X86::PTILELOADDT1V;
5192 break;
5194 // _tile_loadd_internal(row, col, buf, STRIDE)
5195 SDValue Base = Node->getOperand(4);
5196 SDValue Scale = getI8Imm(1, dl);
5197 SDValue Index = Node->getOperand(5);
5198 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5199 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5200 SDValue Chain = Node->getOperand(0);
5201 MachineSDNode *CNode;
5202 SDValue Ops[] = {Node->getOperand(2),
5203 Node->getOperand(3),
5204 Base,
5205 Scale,
5206 Index,
5207 Disp,
5208 Segment,
5209 Chain};
5210 CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5211 ReplaceNode(Node, CNode);
5212 return;
5215 break;
5217 case ISD::INTRINSIC_VOID: {
5218 unsigned IntNo = Node->getConstantOperandVal(1);
5219 switch (IntNo) {
5220 default: break;
5221 case Intrinsic::x86_sse3_monitor:
5222 case Intrinsic::x86_monitorx:
5223 case Intrinsic::x86_clzero: {
5224 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5226 unsigned Opc = 0;
5227 switch (IntNo) {
5228 default: llvm_unreachable("Unexpected intrinsic!");
5229 case Intrinsic::x86_sse3_monitor:
5230 if (!Subtarget->hasSSE3())
5231 break;
5232 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5233 break;
5234 case Intrinsic::x86_monitorx:
5235 if (!Subtarget->hasMWAITX())
5236 break;
5237 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5238 break;
5239 case Intrinsic::x86_clzero:
5240 if (!Subtarget->hasCLZERO())
5241 break;
5242 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5243 break;
5246 if (Opc) {
5247 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5248 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5249 Node->getOperand(2), SDValue());
5250 SDValue InGlue = Chain.getValue(1);
5252 if (IntNo == Intrinsic::x86_sse3_monitor ||
5253 IntNo == Intrinsic::x86_monitorx) {
5254 // Copy the other two operands to ECX and EDX.
5255 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5256 InGlue);
5257 InGlue = Chain.getValue(1);
5258 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5259 InGlue);
5260 InGlue = Chain.getValue(1);
5263 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5264 { Chain, InGlue});
5265 ReplaceNode(Node, CNode);
5266 return;
5269 break;
5271 case Intrinsic::x86_tilestored64_internal: {
5272 auto *MFI =
5273 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5274 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5275 unsigned Opc = X86::PTILESTOREDV;
5276 // _tile_stored_internal(row, col, buf, STRIDE, c)
5277 SDValue Base = Node->getOperand(4);
5278 SDValue Scale = getI8Imm(1, dl);
5279 SDValue Index = Node->getOperand(5);
5280 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5281 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5282 SDValue Chain = Node->getOperand(0);
5283 MachineSDNode *CNode;
5284 SDValue Ops[] = {Node->getOperand(2),
5285 Node->getOperand(3),
5286 Base,
5287 Scale,
5288 Index,
5289 Disp,
5290 Segment,
5291 Node->getOperand(6),
5292 Chain};
5293 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5294 ReplaceNode(Node, CNode);
5295 return;
5297 case Intrinsic::x86_tileloaddrs64:
5298 case Intrinsic::x86_tileloaddrst164:
5299 if (!Subtarget->hasAMXMOVRS())
5300 break;
5301 [[fallthrough]];
5302 case Intrinsic::x86_tileloadd64:
5303 case Intrinsic::x86_tileloaddt164:
5304 case Intrinsic::x86_tilestored64: {
5305 if (!Subtarget->hasAMXTILE())
5306 break;
5307 auto *MFI =
5308 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5309 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5310 unsigned Opc;
5311 switch (IntNo) {
5312 default: llvm_unreachable("Unexpected intrinsic!");
5313 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5314 case Intrinsic::x86_tileloaddrs64:
5315 Opc = X86::PTILELOADDRS;
5316 break;
5317 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5318 case Intrinsic::x86_tileloaddrst164:
5319 Opc = X86::PTILELOADDRST1;
5320 break;
5321 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5323 // FIXME: Match displacement and scale.
5324 unsigned TIndex = Node->getConstantOperandVal(2);
5325 SDValue TReg = getI8Imm(TIndex, dl);
5326 SDValue Base = Node->getOperand(3);
5327 SDValue Scale = getI8Imm(1, dl);
5328 SDValue Index = Node->getOperand(4);
5329 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5330 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5331 SDValue Chain = Node->getOperand(0);
5332 MachineSDNode *CNode;
5333 if (Opc == X86::PTILESTORED) {
5334 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5335 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5336 } else {
5337 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5338 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5340 ReplaceNode(Node, CNode);
5341 return;
5343 case Intrinsic::x86_t2rpntlvwz0rs:
5344 case Intrinsic::x86_t2rpntlvwz0rst1:
5345 case Intrinsic::x86_t2rpntlvwz1rs:
5346 case Intrinsic::x86_t2rpntlvwz1rst1:
5347 if (!Subtarget->hasAMXMOVRS())
5348 break;
5349 [[fallthrough]];
5350 case Intrinsic::x86_t2rpntlvwz0:
5351 case Intrinsic::x86_t2rpntlvwz0t1:
5352 case Intrinsic::x86_t2rpntlvwz1:
5353 case Intrinsic::x86_t2rpntlvwz1t1: {
5354 if (!Subtarget->hasAMXTRANSPOSE())
5355 break;
5356 auto *MFI =
5357 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5358 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5359 unsigned Opc;
5360 switch (IntNo) {
5361 default:
5362 llvm_unreachable("Unexpected intrinsic!");
5363 case Intrinsic::x86_t2rpntlvwz0:
5364 Opc = X86::PT2RPNTLVWZ0;
5365 break;
5366 case Intrinsic::x86_t2rpntlvwz0t1:
5367 Opc = X86::PT2RPNTLVWZ0T1;
5368 break;
5369 case Intrinsic::x86_t2rpntlvwz1:
5370 Opc = X86::PT2RPNTLVWZ1;
5371 break;
5372 case Intrinsic::x86_t2rpntlvwz1t1:
5373 Opc = X86::PT2RPNTLVWZ1T1;
5374 break;
5375 case Intrinsic::x86_t2rpntlvwz0rs:
5376 Opc = X86::PT2RPNTLVWZ0RS;
5377 break;
5378 case Intrinsic::x86_t2rpntlvwz0rst1:
5379 Opc = X86::PT2RPNTLVWZ0RST1;
5380 break;
5381 case Intrinsic::x86_t2rpntlvwz1rs:
5382 Opc = X86::PT2RPNTLVWZ1RS;
5383 break;
5384 case Intrinsic::x86_t2rpntlvwz1rst1:
5385 Opc = X86::PT2RPNTLVWZ1RST1;
5386 break;
5388 // FIXME: Match displacement and scale.
5389 unsigned TIndex = Node->getConstantOperandVal(2);
5390 SDValue TReg = getI8Imm(TIndex, dl);
5391 SDValue Base = Node->getOperand(3);
5392 SDValue Scale = getI8Imm(1, dl);
5393 SDValue Index = Node->getOperand(4);
5394 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5395 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5396 SDValue Chain = Node->getOperand(0);
5397 SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
5398 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5399 ReplaceNode(Node, CNode);
5400 return;
5403 break;
5405 case ISD::BRIND:
5406 case X86ISD::NT_BRIND: {
5407 if (Subtarget->isTargetNaCl())
5408 // NaCl has its own pass where jmp %r32 is converted to jmp %r64. We
5409 // leave the instruction alone.
5410 break;
5411 if (Subtarget->isTarget64BitILP32()) {
5412 // Converts a 32-bit register to a 64-bit, zero-extended version of
5413 // it. This is needed because x86-64 can do many things, but jmp %r32
5414 // ain't one of them.
5415 SDValue Target = Node->getOperand(1);
5416 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5417 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5418 SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5419 Node->getOperand(0), ZextTarget);
5420 ReplaceNode(Node, Brind.getNode());
5421 SelectCode(ZextTarget.getNode());
5422 SelectCode(Brind.getNode());
5423 return;
5425 break;
5427 case X86ISD::GlobalBaseReg:
5428 ReplaceNode(Node, getGlobalBaseReg());
5429 return;
5431 case ISD::BITCAST:
5432 // Just drop all 128/256/512-bit bitcasts.
5433 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5434 NVT == MVT::f128) {
5435 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5436 CurDAG->RemoveDeadNode(Node);
5437 return;
5439 break;
5441 case ISD::SRL:
5442 if (matchBitExtract(Node))
5443 return;
5444 [[fallthrough]];
5445 case ISD::SRA:
5446 case ISD::SHL:
5447 if (tryShiftAmountMod(Node))
5448 return;
5449 break;
5451 case X86ISD::VPTERNLOG: {
5452 uint8_t Imm = Node->getConstantOperandVal(3);
5453 if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5454 Node->getOperand(1), Node->getOperand(2), Imm))
5455 return;
5456 break;
5459 case X86ISD::ANDNP:
5460 if (tryVPTERNLOG(Node))
5461 return;
5462 break;
5464 case ISD::AND:
5465 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5466 // Try to form a masked VPTESTM. Operands can be in either order.
5467 SDValue N0 = Node->getOperand(0);
5468 SDValue N1 = Node->getOperand(1);
5469 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5470 tryVPTESTM(Node, N0, N1))
5471 return;
5472 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5473 tryVPTESTM(Node, N1, N0))
5474 return;
5477 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5478 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5479 CurDAG->RemoveDeadNode(Node);
5480 return;
5482 if (matchBitExtract(Node))
5483 return;
5484 if (AndImmShrink && shrinkAndImmediate(Node))
5485 return;
5487 [[fallthrough]];
5488 case ISD::OR:
5489 case ISD::XOR:
5490 if (tryShrinkShlLogicImm(Node))
5491 return;
5492 if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5493 return;
5494 if (tryVPTERNLOG(Node))
5495 return;
5497 [[fallthrough]];
5498 case ISD::ADD:
5499 if (Opcode == ISD::ADD && matchBitExtract(Node))
5500 return;
5501 [[fallthrough]];
5502 case ISD::SUB: {
5503 // Try to avoid folding immediates with multiple uses for optsize.
5504 // This code tries to select to register form directly to avoid going
5505 // through the isel table which might fold the immediate. We can't change
5506 // the add/sub/and/or/xor with immediate patterns in the
5507 // tablegen files to check immediate use count without making the patterns
5508 // unavailable to the fast-isel table.
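  // For example (illustrative): if the same imm32 feeds several adds, it is
  // smaller to materialize it once in a register and use ADD32rr than to
  // re-encode the 32-bit immediate in every ADD32ri.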
5509 if (!CurDAG->shouldOptForSize())
5510 break;
5512 // Only handle i8/i16/i32/i64.
5513 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5514 break;
5516 SDValue N0 = Node->getOperand(0);
5517 SDValue N1 = Node->getOperand(1);
5519 auto *Cst = dyn_cast<ConstantSDNode>(N1);
5520 if (!Cst)
5521 break;
5523 int64_t Val = Cst->getSExtValue();
5525 // Make sure it's an immediate that is considered foldable.
5526 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5527 if (!isInt<8>(Val) && !isInt<32>(Val))
5528 break;
5530 // If this can match to INC/DEC, let it go.
5531 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5532 break;
5534 // Check if we should avoid folding this immediate.
5535 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5536 break;
5538 // We should not fold the immediate. So we need a register form instead.
5539 unsigned ROpc, MOpc;
5540 switch (NVT.SimpleTy) {
5541 default: llvm_unreachable("Unexpected VT!");
5542 case MVT::i8:
5543 switch (Opcode) {
5544 default: llvm_unreachable("Unexpected opcode!");
5545 case ISD::ADD:
5546 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5547 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5548 break;
5549 case ISD::SUB:
5550 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5551 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5552 break;
5553 case ISD::AND:
5554 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5555 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5556 break;
5557 case ISD::OR:
5558 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5559 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5560 break;
5561 case ISD::XOR:
5562 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5563 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5564 break;
5566 break;
5567 case MVT::i16:
5568 switch (Opcode) {
5569 default: llvm_unreachable("Unexpected opcode!");
5570 case ISD::ADD:
5571 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5572 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5573 break;
5574 case ISD::SUB:
5575 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5576 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5577 break;
5578 case ISD::AND:
5579 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5580 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5581 break;
5582 case ISD::OR:
5583 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5584 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5585 break;
5586 case ISD::XOR:
5587 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5588 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5589 break;
5591 break;
5592 case MVT::i32:
5593 switch (Opcode) {
5594 default: llvm_unreachable("Unexpected opcode!");
5595 case ISD::ADD:
5596 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5597 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5598 break;
5599 case ISD::SUB:
5600 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5601 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5602 break;
5603 case ISD::AND:
5604 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5605 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5606 break;
5607 case ISD::OR:
5608 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5609 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5610 break;
5611 case ISD::XOR:
5612 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5613 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5614 break;
5616 break;
5617 case MVT::i64:
5618 switch (Opcode) {
5619 default: llvm_unreachable("Unexpected opcode!");
5620 case ISD::ADD:
5621 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5622 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5623 break;
5624 case ISD::SUB:
5625 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5626 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5627 break;
5628 case ISD::AND:
5629 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5630 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5631 break;
5632 case ISD::OR:
5633 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5634 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5635 break;
5636 case ISD::XOR:
5637 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5638 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5639 break;
5641 break;
5644 // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5646 // If this is not a subtract, we can still try to fold a load.
5647 if (Opcode != ISD::SUB) {
5648 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5649 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5650 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5651 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5652 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5653 // Update the chain.
5654 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5655 // Record the mem-refs
5656 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5657 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5658 CurDAG->RemoveDeadNode(Node);
5659 return;
5663 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5664 return;
5667 case X86ISD::SMUL:
5668 // i16/i32/i64 are handled with isel patterns.
5669 if (NVT != MVT::i8)
5670 break;
5671 [[fallthrough]];
5672 case X86ISD::UMUL: {
5673 SDValue N0 = Node->getOperand(0);
5674 SDValue N1 = Node->getOperand(1);
5676 unsigned LoReg, ROpc, MOpc;
5677 switch (NVT.SimpleTy) {
5678 default: llvm_unreachable("Unsupported VT!");
5679 case MVT::i8:
5680 LoReg = X86::AL;
5681 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5682 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5683 break;
5684 case MVT::i16:
5685 LoReg = X86::AX;
5686 ROpc = X86::MUL16r;
5687 MOpc = X86::MUL16m;
5688 break;
5689 case MVT::i32:
5690 LoReg = X86::EAX;
5691 ROpc = X86::MUL32r;
5692 MOpc = X86::MUL32m;
5693 break;
5694 case MVT::i64:
5695 LoReg = X86::RAX;
5696 ROpc = X86::MUL64r;
5697 MOpc = X86::MUL64m;
5698 break;
5701 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5702 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5703 // Multiply is commutative.
5704 if (!FoldedLoad) {
5705 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5706 if (FoldedLoad)
5707 std::swap(N0, N1);
5710 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5711 N0, SDValue()).getValue(1);
5713 MachineSDNode *CNode;
5714 if (FoldedLoad) {
5715 // i16/i32/i64 use an instruction that produces a low and high result even
5716 // though only the low result is used.
5717 SDVTList VTs;
5718 if (NVT == MVT::i8)
5719 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5720 else
5721 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5723 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5724 InGlue };
5725 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5727 // Update the chain.
5728 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5729 // Record the mem-refs
5730 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5731 } else {
5732 // i16/i32/i64 use an instruction that produces a low and high result even
5733 // though only the low result is used.
5734 SDVTList VTs;
5735 if (NVT == MVT::i8)
5736 VTs = CurDAG->getVTList(NVT, MVT::i32);
5737 else
5738 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5740 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5743 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5744 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5745 CurDAG->RemoveDeadNode(Node);
5746 return;
5749 case ISD::SMUL_LOHI:
5750 case ISD::UMUL_LOHI: {
5751 SDValue N0 = Node->getOperand(0);
5752 SDValue N1 = Node->getOperand(1);
5754 unsigned Opc, MOpc;
5755 unsigned LoReg, HiReg;
5756 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5757 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5758 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
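    // Illustrative note: MULX reads one source implicitly from EDX/RDX and
    // writes both halves to explicit destinations, so when UseMULX is set the
    // first operand is copied into EDX/RDX below (LoReg) rather than EAX/RAX,
    // and the MULX*H forms are chosen when only the high half has uses.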
5759 switch (NVT.SimpleTy) {
5760 default: llvm_unreachable("Unsupported VT!");
5761 case MVT::i32:
5762 Opc = UseMULXHi ? X86::MULX32Hrr
5763 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5764 : IsSigned ? X86::IMUL32r
5765 : X86::MUL32r;
5766 MOpc = UseMULXHi ? X86::MULX32Hrm
5767 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5768 : IsSigned ? X86::IMUL32m
5769 : X86::MUL32m;
5770 LoReg = UseMULX ? X86::EDX : X86::EAX;
5771 HiReg = X86::EDX;
5772 break;
5773 case MVT::i64:
5774 Opc = UseMULXHi ? X86::MULX64Hrr
5775 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5776 : IsSigned ? X86::IMUL64r
5777 : X86::MUL64r;
5778 MOpc = UseMULXHi ? X86::MULX64Hrm
5779 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5780 : IsSigned ? X86::IMUL64m
5781 : X86::MUL64m;
5782 LoReg = UseMULX ? X86::RDX : X86::RAX;
5783 HiReg = X86::RDX;
5784 break;
5787 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5788 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5789 // Multiply is commutative.
5790 if (!foldedLoad) {
5791 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5792 if (foldedLoad)
5793 std::swap(N0, N1);
5796 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5797 N0, SDValue()).getValue(1);
5798 SDValue ResHi, ResLo;
5799 if (foldedLoad) {
5800 SDValue Chain;
5801 MachineSDNode *CNode = nullptr;
5802 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5803 InGlue };
5804 if (UseMULXHi) {
5805 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5806 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5807 ResHi = SDValue(CNode, 0);
5808 Chain = SDValue(CNode, 1);
5809 } else if (UseMULX) {
5810 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5811 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5812 ResHi = SDValue(CNode, 0);
5813 ResLo = SDValue(CNode, 1);
5814 Chain = SDValue(CNode, 2);
5815 } else {
5816 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5817 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5818 Chain = SDValue(CNode, 0);
5819 InGlue = SDValue(CNode, 1);
5822 // Update the chain.
5823 ReplaceUses(N1.getValue(1), Chain);
5824 // Record the mem-refs
5825 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5826 } else {
5827 SDValue Ops[] = { N1, InGlue };
5828 if (UseMULXHi) {
5829 SDVTList VTs = CurDAG->getVTList(NVT);
5830 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5831 ResHi = SDValue(CNode, 0);
5832 } else if (UseMULX) {
5833 SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5834 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5835 ResHi = SDValue(CNode, 0);
5836 ResLo = SDValue(CNode, 1);
5837 } else {
5838 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5839 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5840 InGlue = SDValue(CNode, 0);
5844 // Copy the low half of the result, if it is needed.
5845 if (!SDValue(Node, 0).use_empty()) {
5846 if (!ResLo) {
5847 assert(LoReg && "Register for low half is not defined!");
5848 ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5849 NVT, InGlue);
5850 InGlue = ResLo.getValue(2);
5852 ReplaceUses(SDValue(Node, 0), ResLo);
5853 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5854 dbgs() << '\n');
5856 // Copy the high half of the result, if it is needed.
5857 if (!SDValue(Node, 1).use_empty()) {
5858 if (!ResHi) {
5859 assert(HiReg && "Register for high half is not defined!");
5860 ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5861 NVT, InGlue);
5862 InGlue = ResHi.getValue(2);
5864 ReplaceUses(SDValue(Node, 1), ResHi);
5865 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5866 dbgs() << '\n');
5869 CurDAG->RemoveDeadNode(Node);
5870 return;
5873 case ISD::SDIVREM:
5874 case ISD::UDIVREM: {
5875 SDValue N0 = Node->getOperand(0);
5876 SDValue N1 = Node->getOperand(1);
5878 unsigned ROpc, MOpc;
5879 bool isSigned = Opcode == ISD::SDIVREM;
5880 if (!isSigned) {
5881 switch (NVT.SimpleTy) {
5882 default: llvm_unreachable("Unsupported VT!");
5883 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5884 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5885 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5886 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5888 } else {
5889 switch (NVT.SimpleTy) {
5890 default: llvm_unreachable("Unsupported VT!");
5891 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5892 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5893 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5894 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5898 unsigned LoReg, HiReg, ClrReg;
5899 unsigned SExtOpcode;
5900 switch (NVT.SimpleTy) {
5901 default: llvm_unreachable("Unsupported VT!");
5902 case MVT::i8:
5903 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5904 SExtOpcode = 0; // Not used.
5905 break;
5906 case MVT::i16:
5907 LoReg = X86::AX; HiReg = X86::DX;
5908 ClrReg = X86::DX;
5909 SExtOpcode = X86::CWD;
5910 break;
5911 case MVT::i32:
5912 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5913 SExtOpcode = X86::CDQ;
5914 break;
5915 case MVT::i64:
5916 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5917 SExtOpcode = X86::CQO;
5918 break;
5921 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5922 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5923 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
5925 SDValue InGlue;
5926 if (NVT == MVT::i8) {
5927 // Special case for div8, just use a move with zero extension to AX to
5928 // clear the upper 8 bits (AH).
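      // For illustration: for an i8 udiv/urem, AX is loaded with the extended
      // dividend, DIV8r divides AX by the 8-bit divisor, and AL/AH then hold
      // the quotient and remainder respectively.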
5929 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5930 MachineSDNode *Move;
5931 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5932 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5933 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5934 : X86::MOVZX16rm8;
5935 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5936 Chain = SDValue(Move, 1);
5937 ReplaceUses(N0.getValue(1), Chain);
5938 // Record the mem-refs
5939 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
5940 } else {
5941 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5942 : X86::MOVZX16rr8;
5943 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5944 Chain = CurDAG->getEntryNode();
5946 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5947 SDValue());
5948 InGlue = Chain.getValue(1);
5949 } else {
5950 InGlue =
5951 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
5952 LoReg, N0, SDValue()).getValue(1);
5953 if (isSigned && !signBitIsZero) {
5954 // Sign extend the low part into the high part.
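        // (Illustrative: for i32 this is CDQ, which sign-extends EAX into
        // EDX:EAX ahead of IDIV; CWD/CQO play the same role for i16/i64.)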
5955 InGlue =
5956 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5957 } else {
5958 // Zero out the high part, effectively zero extending the input.
5959 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5960 SDValue ClrNode =
5961 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
5962 switch (NVT.SimpleTy) {
5963 case MVT::i16:
5964 ClrNode =
5965 SDValue(CurDAG->getMachineNode(
5966 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5967 CurDAG->getTargetConstant(X86::sub_16bit, dl,
5968 MVT::i32)),
5970 break;
5971 case MVT::i32:
5972 break;
5973 case MVT::i64:
5974 ClrNode =
5975 SDValue(CurDAG->getMachineNode(
5976 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5977 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
5978 CurDAG->getTargetConstant(X86::sub_32bit, dl,
5979 MVT::i32)),
5981 break;
5982 default:
5983 llvm_unreachable("Unexpected division source");
5986 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
5987 ClrNode, InGlue).getValue(1);
5991 if (foldedLoad) {
5992 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5993 InGlue };
5994 MachineSDNode *CNode =
5995 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
5996 InGlue = SDValue(CNode, 1);
5997 // Update the chain.
5998 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
5999 // Record the mem-refs
6000 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
6001 } else {
6002 InGlue =
6003 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
6006 // Prevent use of AH in a REX instruction by explicitly copying it to
6007 // an ABCD_L register.
6009 // The current assumption of the register allocator is that isel
6010 // won't generate explicit references to the GR8_ABCD_H registers. If
6011 // the allocator and/or the backend get enhanced to be more robust in
6012 // that regard, this can be, and should be, removed.
6013 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
6014 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
6015 unsigned AHExtOpcode =
6016 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
6018 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
6019 MVT::Glue, AHCopy, InGlue);
6020 SDValue Result(RNode, 0);
6021 InGlue = SDValue(RNode, 1);
6023 Result =
6024 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
6026 ReplaceUses(SDValue(Node, 1), Result);
6027 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6028 dbgs() << '\n');
6030 // Copy the division (low) result, if it is needed.
6031 if (!SDValue(Node, 0).use_empty()) {
6032 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6033 LoReg, NVT, InGlue);
6034 InGlue = Result.getValue(2);
6035 ReplaceUses(SDValue(Node, 0), Result);
6036 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6037 dbgs() << '\n');
6039 // Copy the remainder (high) result, if it is needed.
6040 if (!SDValue(Node, 1).use_empty()) {
6041 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6042 HiReg, NVT, InGlue);
6043 InGlue = Result.getValue(2);
6044 ReplaceUses(SDValue(Node, 1), Result);
6045 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6046 dbgs() << '\n');
6048 CurDAG->RemoveDeadNode(Node);
6049 return;
6052 case X86ISD::FCMP:
6053 case X86ISD::STRICT_FCMP:
6054 case X86ISD::STRICT_FCMPS: {
6055 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
6056 Node->getOpcode() == X86ISD::STRICT_FCMPS;
6057 SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
6058 SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
6060 // Save the original VT of the compare.
6061 MVT CmpVT = N0.getSimpleValueType();
6063 // Floating point needs special handling if we don't have FCOMI.
6064 if (Subtarget->canUseCMOV())
6065 break;
6067 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
6069 unsigned Opc;
6070 switch (CmpVT.SimpleTy) {
6071 default: llvm_unreachable("Unexpected type!");
6072 case MVT::f32:
6073 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
6074 break;
6075 case MVT::f64:
6076 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
6077 break;
6078 case MVT::f80:
6079 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
6080 break;
6083 SDValue Chain =
6084 IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
6085 SDValue Glue;
6086 if (IsStrictCmp) {
6087 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
6088 Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
6089 Glue = Chain.getValue(1);
6090 } else {
6091 Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
6094 // Move FPSW to AX.
6095 SDValue FNSTSW =
6096 SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
6098 // Extract upper 8-bits of AX.
6099 SDValue Extract =
6100 CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
6102 // Move AH into flags.
6103 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
6104 assert(Subtarget->canUseLAHFSAHF() &&
6105 "Target doesn't support SAHF or FCOMI?");
6106 SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
6107 Chain = AH;
6108 SDValue SAHF = SDValue(
6109 CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
6111 if (IsStrictCmp)
6112 ReplaceUses(SDValue(Node, 1), Chain);
6114 ReplaceUses(SDValue(Node, 0), SAHF);
6115 CurDAG->RemoveDeadNode(Node);
6116 return;
6119 case X86ISD::CMP: {
6120 SDValue N0 = Node->getOperand(0);
6121 SDValue N1 = Node->getOperand(1);
6123 // Optimizations for TEST compares.
6124 if (!isNullConstant(N1))
6125 break;
6127 // Save the original VT of the compare.
6128 MVT CmpVT = N0.getSimpleValueType();
6130 // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
6131 // by a test instruction. The test should be removed later by
6132 // analyzeCompare if we are using only the zero flag.
6133 // TODO: Should we check the users and use the BEXTR flags directly?
6134 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
6135 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
6136 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
6137 : X86::TEST32rr;
6138 SDValue BEXTR = SDValue(NewNode, 0);
6139 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
6140 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6141 CurDAG->RemoveDeadNode(Node);
6142 return;
6146 // We can peek through truncates, but we need to be careful below.
6147 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
6148 N0 = N0.getOperand(0);
6150 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
6151 // use a smaller encoding.
6152 // Look past the truncate if CMP is the only use of it.
6153 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
6154 N0.getValueType() != MVT::i8) {
6155 auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6156 if (!MaskC)
6157 break;
6159 // We may have looked through a truncate so mask off any bits that
6160 // shouldn't be part of the compare.
6161 uint64_t Mask = MaskC->getZExtValue();
6162 Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
6164 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
6165 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
6166 // zero flag.
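      // For illustration: with a 64-bit mask of 0xFF000000 (an 8-bit-wide
      // shifted mask), the code below should emit SHR64ri by 24 followed by
      // TEST8rr on the low sub-register, avoiding a movabsq of the immediate.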
6167 if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
6168 onlyUsesZeroFlag(SDValue(Node, 0))) {
6169 unsigned ShiftOpcode = ISD::DELETED_NODE;
6170 unsigned ShiftAmt;
6171 unsigned SubRegIdx;
6172 MVT SubRegVT;
6173 unsigned TestOpcode;
6174 unsigned LeadingZeros = llvm::countl_zero(Mask);
6175 unsigned TrailingZeros = llvm::countr_zero(Mask);
6177 // With leading/trailing zeros, the transform is profitable if we can
6178 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6179 // incurring any extra register moves.
6180 bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
6181 if (LeadingZeros == 0 && SavesBytes) {
6182 // If the mask covers the most significant bit, then we can replace
6183 // TEST+AND with a SHR and check eflags.
6184 // This emits a redundant TEST which is subsequently eliminated.
6185 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6186 ShiftAmt = TrailingZeros;
6187 SubRegIdx = 0;
6188 TestOpcode = X86::TEST64rr;
6189 } else if (TrailingZeros == 0 && SavesBytes) {
6190 // If the mask covers the least significant bit, then we can replace
6191 // TEST+AND with a SHL and check eflags.
6192 // This emits a redundant TEST which is subsequently eliminated.
6193 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6194 ShiftAmt = LeadingZeros;
6195 SubRegIdx = 0;
6196 TestOpcode = X86::TEST64rr;
6197 } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
6198 // If the shifted mask extends into the high half and is 8/16/32 bits
6199 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6200 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6201 if (PopCount == 8) {
6202 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6203 ShiftAmt = TrailingZeros;
6204 SubRegIdx = X86::sub_8bit;
6205 SubRegVT = MVT::i8;
6206 TestOpcode = X86::TEST8rr;
6207 } else if (PopCount == 16) {
6208 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6209 ShiftAmt = TrailingZeros;
6210 SubRegIdx = X86::sub_16bit;
6211 SubRegVT = MVT::i16;
6212 TestOpcode = X86::TEST16rr;
6213 } else if (PopCount == 32) {
6214 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6215 ShiftAmt = TrailingZeros;
6216 SubRegIdx = X86::sub_32bit;
6217 SubRegVT = MVT::i32;
6218 TestOpcode = X86::TEST32rr;
6221 if (ShiftOpcode != ISD::DELETED_NODE) {
6222 SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
6223 SDValue Shift = SDValue(
6224 CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
6225 N0.getOperand(0), ShiftC),
6227 if (SubRegIdx != 0) {
6228 Shift =
6229 CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
6231 MachineSDNode *Test =
6232 CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
6233 ReplaceNode(Node, Test);
6234 return;
6238 MVT VT;
6239 int SubRegOp;
6240 unsigned ROpc, MOpc;
6242 // For each of these checks we need to be careful if the sign flag is
6243 // being used. It is only safe to use the sign flag under two conditions:
6244 // either the sign bit in the shrunken mask is zero or the final test
6245 // size is equal to the original compare size.
6247 if (isUInt<8>(Mask) &&
6248 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6249 hasNoSignFlagUses(SDValue(Node, 0)))) {
6250 // For example, convert "testl %eax, $8" to "testb %al, $8"
6251 VT = MVT::i8;
6252 SubRegOp = X86::sub_8bit;
6253 ROpc = X86::TEST8ri;
6254 MOpc = X86::TEST8mi;
6255 } else if (OptForMinSize && isUInt<16>(Mask) &&
6256 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6257 hasNoSignFlagUses(SDValue(Node, 0)))) {
6258 // For example, "testl %eax, $32776" to "testw %ax, $32776".
6259 // NOTE: We only want to form TESTW instructions if optimizing for
6260 // min size. Otherwise we only save one byte and possibly get a length
6261 // changing prefix penalty in the decoders.
6262 VT = MVT::i16;
6263 SubRegOp = X86::sub_16bit;
6264 ROpc = X86::TEST16ri;
6265 MOpc = X86::TEST16mi;
6266 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
6267 ((!(Mask & 0x80000000) &&
6268 // Without minsize 16-bit Cmps can get here so we need to
6269 // be sure we calculate the correct sign flag if needed.
6270 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6271 CmpVT == MVT::i32 ||
6272 hasNoSignFlagUses(SDValue(Node, 0)))) {
6273 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6274 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6275 // Otherwise, we find ourselves in a position where we have to do
6276 // promotion. If previous passes did not promote the and, we assume
6277 // they had a good reason not to and do not promote here.
6278 VT = MVT::i32;
6279 SubRegOp = X86::sub_32bit;
6280 ROpc = X86::TEST32ri;
6281 MOpc = X86::TEST32mi;
6282 } else {
6283 // No eligible transformation was found.
6284 break;
6287 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
6288 SDValue Reg = N0.getOperand(0);
6290 // Emit a testl or testw.
6291 MachineSDNode *NewNode;
6292 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6293 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
6294 if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
6295 if (!LoadN->isSimple()) {
6296 unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
6297 if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6298 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6299 (MOpc == X86::TEST32mi && NumVolBits != 32))
6300 break;
6303 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6304 Reg.getOperand(0) };
6305 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
6306 // Update the chain.
6307 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
6308 // Record the mem-refs
6309 CurDAG->setNodeMemRefs(NewNode,
6310 {cast<LoadSDNode>(Reg)->getMemOperand()});
6311 } else {
6312 // Extract the subregister if necessary.
6313 if (N0.getValueType() != VT)
6314 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
6316 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
6318 // Replace CMP with TEST.
6319 ReplaceNode(Node, NewNode);
6320 return;
6322 break;
6324 case X86ISD::PCMPISTR: {
6325 if (!Subtarget->hasSSE42())
6326 break;
6328 bool NeedIndex = !SDValue(Node, 0).use_empty();
6329 bool NeedMask = !SDValue(Node, 1).use_empty();
6330 // We can't fold a load if we are going to make two instructions.
6331 bool MayFoldLoad = !NeedIndex || !NeedMask;
6333 MachineSDNode *CNode;
6334 if (NeedMask) {
6335 unsigned ROpc =
6336 Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6337 unsigned MOpc =
6338 Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6339 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
6340 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6342 if (NeedIndex || !NeedMask) {
6343 unsigned ROpc =
6344 Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6345 unsigned MOpc =
6346 Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6347 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
6348 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6351 // Connect the flag usage to the last instruction created.
6352 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6353 CurDAG->RemoveDeadNode(Node);
6354 return;
6356 case X86ISD::PCMPESTR: {
6357 if (!Subtarget->hasSSE42())
6358 break;
6360 // Copy the two implicit register inputs.
6361 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
6362 Node->getOperand(1),
6363 SDValue()).getValue(1);
6364 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
6365 Node->getOperand(3), InGlue).getValue(1);
6367 bool NeedIndex = !SDValue(Node, 0).use_empty();
6368 bool NeedMask = !SDValue(Node, 1).use_empty();
6369 // We can't fold a load if we are going to make two instructions.
6370 bool MayFoldLoad = !NeedIndex || !NeedMask;
6372 MachineSDNode *CNode;
6373 if (NeedMask) {
6374 unsigned ROpc =
6375 Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6376 unsigned MOpc =
6377 Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6378 CNode =
6379 emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
6380 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6382 if (NeedIndex || !NeedMask) {
6383 unsigned ROpc =
6384 Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6385 unsigned MOpc =
6386 Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6387 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
6388 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6390 // Connect the flag usage to the last instruction created.
6391 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6392 CurDAG->RemoveDeadNode(Node);
6393 return;
6396 case ISD::SETCC: {
6397 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
6398 return;
6400 break;
6403 case ISD::STORE:
6404 if (foldLoadStoreIntoMemOperand(Node))
6405 return;
6406 break;
6408 case X86ISD::SETCC_CARRY: {
6409 MVT VT = Node->getSimpleValueType(0);
6410 SDValue Result;
6411 if (Subtarget->hasSBBDepBreaking()) {
6412 // We have to do this manually because tblgen will put the eflags copy in
6413 // the wrong place if we use an extract_subreg in the pattern.
6414 // Copy flags to the EFLAGS register and glue it to next node.
6415 SDValue EFLAGS =
6416 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
6417 Node->getOperand(1), SDValue());
6419 // Create a 64-bit instruction if the result is 64-bits otherwise use the
6420 // 32-bit version.
6421 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6422 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6423 Result = SDValue(
6424 CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
6425 0);
6426 } else {
6427 // The target does not recognize sbb with the same reg operand as a
6428 // no-source idiom, so we explicitly zero the input values.
6429 Result = getSBBZero(Node);
6430 }
6433 // For results narrower than 32 bits we need to extract from the 32-bit node.
6433 if (VT == MVT::i8 || VT == MVT::i16) {
6434 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6435 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6436 }
6438 ReplaceUses(SDValue(Node, 0), Result);
6439 CurDAG->RemoveDeadNode(Node);
6440 return;
6441 }
6442 case X86ISD::SBB: {
6443 if (isNullConstant(Node->getOperand(0)) &&
6444 isNullConstant(Node->getOperand(1))) {
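// An SBB whose explicit inputs are both zero computes 0 - 0 - CF, i.e. 0 or
// -1, so it can be emitted with the zero-idiom SBB helper; the flag and
// result uses are then rewired to the new node.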
6445 SDValue Result = getSBBZero(Node);
6447 // Replace the flag use.
6448 ReplaceUses(SDValue(Node, 1), Result.getValue(1));
6450 // Replace the result use.
6451 if (!SDValue(Node, 0).use_empty()) {
6452 // For results narrower than 32 bits we need to extract from the 32-bit node.
6453 MVT VT = Node->getSimpleValueType(0);
6454 if (VT == MVT::i8 || VT == MVT::i16) {
6455 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6456 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6457 }
6458 ReplaceUses(SDValue(Node, 0), Result);
6459 }
6461 CurDAG->RemoveDeadNode(Node);
6462 return;
6463 }
6464 break;
6465 }
6466 case X86ISD::MGATHER: {
6467 auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
6468 SDValue IndexOp = Mgt->getIndex();
6469 SDValue Mask = Mgt->getMask();
6470 MVT IndexVT = IndexOp.getSimpleValueType();
6471 MVT ValueVT = Node->getSimpleValueType(0);
6472 MVT MaskVT = Mask.getSimpleValueType();
6474 // This is just to prevent crashes if the nodes are malformed somehow. We
6475 // otherwise only do loose type checking here, based on what a type
6476 // constraint would say, just like table-based isel.
6477 if (!ValueVT.isVector() || !MaskVT.isVector())
6478 break;
6480 unsigned NumElts = ValueVT.getVectorNumElements();
6481 MVT ValueSVT = ValueVT.getVectorElementType();
6483 bool IsFP = ValueSVT.isFloatingPoint();
6484 unsigned EltSize = ValueSVT.getSizeInBits();
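// Pick the gather opcode from the index element width (D = 32-bit indices,
// Q = 64-bit indices), the data element type (PS/PD for floating point,
// DD/DQ/QD/QQ for integer), and the vector width. The Z/Z128/Z256 forms are
// the AVX-512 gathers, selected when the mask is a vXi1 type; otherwise the
// AVX2 forms, whose mask has the same layout as the data, are used.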
6486 unsigned Opc = 0;
6487 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
6488 if (AVX512Gather) {
6489 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6490 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6491 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6492 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6493 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6494 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6495 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6496 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6497 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6498 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6499 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6500 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6501 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6502 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6503 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6504 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6505 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6506 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6507 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6508 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6509 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6510 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6511 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6512 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6513 } else {
6514 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6515 "Unexpected mask VT!");
6516 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6517 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6518 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6519 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6520 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6521 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6522 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6523 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6524 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6525 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6526 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6527 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6528 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6529 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6530 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6531 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6532 }
6534 if (!Opc)
6535 break;
6537 SDValue Base, Scale, Index, Disp, Segment;
6538 if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
6539 Base, Scale, Index, Disp, Segment))
6540 break;
6542 SDValue PassThru = Mgt->getPassThru();
6543 SDValue Chain = Mgt->getChain();
6544 // Gather instructions have a mask output that is not present in the ISD node.
6545 SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
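// The AVX-512 gathers take the mask right after the pass-through operand,
// while the AVX2 gathers take it after the memory operands, so the operand
// lists are built separately.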
6547 MachineSDNode *NewNode;
6548 if (AVX512Gather) {
6549 SDValue Ops[] = {PassThru, Mask, Base, Scale,
6550 Index, Disp, Segment, Chain};
6551 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6552 } else {
6553 SDValue Ops[] = {PassThru, Base, Scale, Index,
6554 Disp, Segment, Mask, Chain};
6555 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6556 }
6557 CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
6558 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6559 ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
6560 CurDAG->RemoveDeadNode(Node);
6561 return;
6562 }
6563 case X86ISD::MSCATTER: {
6564 auto *Sc = cast<X86MaskedScatterSDNode>(Node);
6565 SDValue Value = Sc->getValue();
6566 SDValue IndexOp = Sc->getIndex();
6567 MVT IndexVT = IndexOp.getSimpleValueType();
6568 MVT ValueVT = Value.getSimpleValueType();
6570 // This is just to prevent crashes if the nodes are malformed somehow. We
6571 // otherwise only do loose type checking here, based on what a type
6572 // constraint would say, just like table-based isel.
6573 if (!ValueVT.isVector())
6574 break;
6576 unsigned NumElts = ValueVT.getVectorNumElements();
6577 MVT ValueSVT = ValueVT.getVectorElementType();
6579 bool IsFP = ValueSVT.isFloatingPoint();
6580 unsigned EltSize = ValueSVT.getSizeInBits();
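// Scatters exist only in AVX-512 form, so unlike the gather case above there
// is no AVX2 fallback; the opcode is chosen purely from the index type,
// element size, and element count.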
6582 unsigned Opc;
6583 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6584 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6585 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6586 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6587 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6588 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6589 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6590 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6591 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6592 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6593 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6594 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6595 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6596 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6597 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6598 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6599 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6600 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6601 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6602 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6603 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6604 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6605 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6606 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6607 else
6608 break;
6610 SDValue Base, Scale, Index, Disp, Segment;
6611 if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
6612 Base, Scale, Index, Disp, Segment))
6613 break;
6615 SDValue Mask = Sc->getMask();
6616 SDValue Chain = Sc->getChain();
6617 // Scatter instructions have a mask output that is not present in the ISD node.
6618 SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
6619 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6621 MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6622 CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
6623 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
6624 CurDAG->RemoveDeadNode(Node);
6625 return;
6626 }
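// The PREALLOCATED_SETUP and PREALLOCATED_ARG nodes identify their call site
// by an IR value; look up the numeric id X86MachineFunctionInfo assigned to
// that call site and emit the corresponding target opcode with the id as an
// immediate operand.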
6627 case ISD::PREALLOCATED_SETUP: {
6628 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6629 auto CallId = MFI->getPreallocatedIdForCallSite(
6630 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6631 SDValue Chain = Node->getOperand(0);
6632 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6633 MachineSDNode *New = CurDAG->getMachineNode(
6634 TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
6635 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
6636 CurDAG->RemoveDeadNode(Node);
6637 return;
6638 }
6639 case ISD::PREALLOCATED_ARG: {
6640 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6641 auto CallId = MFI->getPreallocatedIdForCallSite(
6642 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6643 SDValue Chain = Node->getOperand(0);
6644 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6645 SDValue ArgIndex = Node->getOperand(2);
6646 SDValue Ops[3];
6647 Ops[0] = CallIdValue;
6648 Ops[1] = ArgIndex;
6649 Ops[2] = Chain;
6650 MachineSDNode *New = CurDAG->getMachineNode(
6651 TargetOpcode::PREALLOCATED_ARG, dl,
6652 CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
6653 MVT::Other),
6654 Ops);
6655 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
6656 ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
6657 CurDAG->RemoveDeadNode(Node);
6658 return;
6659 }
6660 case X86ISD::AESENCWIDE128KL:
6661 case X86ISD::AESDECWIDE128KL:
6662 case X86ISD::AESENCWIDE256KL:
6663 case X86ISD::AESDECWIDE256KL: {
6664 if (!Subtarget->hasWIDEKL())
6665 break;
6667 unsigned Opcode;
6668 switch (Node->getOpcode()) {
6669 default:
6670 llvm_unreachable("Unexpected opcode!");
6671 case X86ISD::AESENCWIDE128KL:
6672 Opcode = X86::AESENCWIDE128KL;
6673 break;
6674 case X86ISD::AESDECWIDE128KL:
6675 Opcode = X86::AESDECWIDE128KL;
6676 break;
6677 case X86ISD::AESENCWIDE256KL:
6678 Opcode = X86::AESENCWIDE256KL;
6679 break;
6680 case X86ISD::AESDECWIDE256KL:
6681 Opcode = X86::AESDECWIDE256KL;
6682 break;
6683 }
6685 SDValue Chain = Node->getOperand(0);
6686 SDValue Addr = Node->getOperand(1);
6688 SDValue Base, Scale, Index, Disp, Segment;
6689 if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
6690 break;
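// The wide Key Locker instructions implicitly operate on XMM0-XMM7, so the
// eight data operands are copied into those physical registers and glued
// together before the memory-form instruction is emitted.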
6692 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
6693 SDValue());
6694 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
6695 Chain.getValue(1));
6696 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
6697 Chain.getValue(1));
6698 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
6699 Chain.getValue(1));
6700 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
6701 Chain.getValue(1));
6702 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
6703 Chain.getValue(1));
6704 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
6705 Chain.getValue(1));
6706 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
6707 Chain.getValue(1));
6709 MachineSDNode *Res = CurDAG->getMachineNode(
6710 Opcode, dl, Node->getVTList(),
6711 {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
6712 CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
6713 ReplaceNode(Node, Res);
6714 return;
6715 }
6716 }
6718 SelectCode(Node);
6719 }
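// Lower an inline-asm memory operand: each supported memory constraint is
// run through the normal address matcher and expanded into the usual five
// address operands. Returning true reports that the operand could not be
// selected.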
6721 bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6722 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6723 std::vector<SDValue> &OutOps) {
6724 SDValue Op0, Op1, Op2, Op3, Op4;
6725 switch (ConstraintID) {
6726 default:
6727 llvm_unreachable("Unexpected asm memory constraint");
6728 case InlineAsm::ConstraintCode::o: // offsetable ??
6729 case InlineAsm::ConstraintCode::v: // not offsetable ??
6730 case InlineAsm::ConstraintCode::m: // memory
6731 case InlineAsm::ConstraintCode::X:
6732 case InlineAsm::ConstraintCode::p: // address
6733 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
6734 return true;
6735 break;
6736 }
6738 OutOps.push_back(Op0);
6739 OutOps.push_back(Op1);
6740 OutOps.push_back(Op2);
6741 OutOps.push_back(Op3);
6742 OutOps.push_back(Op4);
6743 return false;
6744 }
6746 X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
6747 : SelectionDAGISelPass(
6748 std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}
6750 /// This pass converts a legalized DAG into an X86-specific DAG,
6751 /// ready for instruction scheduling.
6752 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6753 CodeGenOptLevel OptLevel) {
6754 return new X86DAGToDAGISelLegacy(TM, OptLevel);
6755 }