lib/Target/X86/X86ISelDAGToDAG.cpp

   1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file defines a DAG pattern matching instruction selector for X86,
  10 // converting from a legalized dag to a X86 dag.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "X86.h"
  15 #include "X86MachineFunctionInfo.h"
  16 #include "X86RegisterInfo.h"
  17 #include "X86Subtarget.h"
  18 #include "X86TargetMachine.h"
  19 #include "llvm/ADT/Statistic.h"
  20 #include "llvm/CodeGen/MachineFrameInfo.h"
  21 #include "llvm/CodeGen/MachineFunction.h"
  22 #include "llvm/CodeGen/SelectionDAGISel.h"
  23 #include "llvm/Config/llvm-config.h"
  24 #include "llvm/IR/ConstantRange.h"
  25 #include "llvm/IR/Function.h"
  26 #include "llvm/IR/Instructions.h"
  27 #include "llvm/IR/Intrinsics.h"
  28 #include "llvm/IR/Type.h"
  29 #include "llvm/Support/Debug.h"
  30 #include "llvm/Support/ErrorHandling.h"
  31 #include "llvm/Support/KnownBits.h"
  32 #include "llvm/Support/MathExtras.h"
  33 #include "llvm/Support/raw_ostream.h"
  34 #include "llvm/Target/TargetMachine.h"
  35 #include "llvm/Target/TargetOptions.h"
  36 #include <stdint.h>
  37 using namespace llvm;
  38
  39 #define DEBUG_TYPE "x86-isel"
  40
  41 STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
  42
  43 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
  44     cl::desc("Enable setting constant bits to reduce size of mask immediates"),
  45     cl::Hidden);
  46
  47 //===----------------------------------------------------------------------===//
  48 //                      Pattern Matcher Implementation
  49 //===----------------------------------------------------------------------===//
  50
  51 namespace {
  52   /// This corresponds to X86AddressMode, but uses SDValue's instead of register
  53   /// numbers for the leaves of the matched tree.
  54   struct X86ISelAddressMode {
  55     enum {
  56       RegBase,
  57       FrameIndexBase
  58     } BaseType;
  59
  60     // This is really a union, discriminated by BaseType!
  61     SDValue Base_Reg;
  62     int Base_FrameIndex;
  63
  64     unsigned Scale;
  65     SDValue IndexReg;
  66     int32_t Disp;
  67     SDValue Segment;
  68     const GlobalValue *GV;
  69     const Constant *CP;
  70     const BlockAddress *BlockAddr;
  71     const char *ES;
  72     MCSymbol *MCSym;
  73     int JT;
  74     unsigned Align;    // CP alignment.
  75     unsigned char SymbolFlags;  // X86II::MO_*
  76
  77     X86ISelAddressMode()
  78         : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
  79           Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
  80           MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
  81
  82     bool hasSymbolicDisplacement() const {
  83       return GV != nullptr || CP != nullptr || ES != nullptr ||
  84              MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
  85     }
  86
  87     bool hasBaseOrIndexReg() const {
  88       return BaseType == FrameIndexBase ||
  89              IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
  90     }
  91
  92     /// Return true if this addressing mode is already RIP-relative.
  93     bool isRIPRelative() const {
  94       if (BaseType != RegBase) return false;
  95       if (RegisterSDNode *RegNode =
  96             dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
  97         return RegNode->getReg() == X86::RIP;
  98       return false;
  99     }
 100
 101     void setBaseReg(SDValue Reg) {
 102       BaseType = RegBase;
 103       Base_Reg = Reg;
 104     }
 105
 106 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 107     void dump(SelectionDAG *DAG = nullptr) {
 108       dbgs() << "X86ISelAddressMode " << this << '\n';
 109       dbgs() << "Base_Reg ";
 110       if (Base_Reg.getNode())
 111         Base_Reg.getNode()->dump(DAG);
 112       else
 113         dbgs() << "nul\n";
 114       if (BaseType == FrameIndexBase)
 115         dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
 116       dbgs() << " Scale " << Scale << '\n'
 117              << "IndexReg ";
 118       if (IndexReg.getNode())
 119         IndexReg.getNode()->dump(DAG);
 120       else
 121         dbgs() << "nul\n";
 122       dbgs() << " Disp " << Disp << '\n'
 123              << "GV ";
 124       if (GV)
 125         GV->dump();
 126       else
 127         dbgs() << "nul";
 128       dbgs() << " CP ";
 129       if (CP)
 130         CP->dump();
 131       else
 132         dbgs() << "nul";
 133       dbgs() << '\n'
 134              << "ES ";
 135       if (ES)
 136         dbgs() << ES;
 137       else
 138         dbgs() << "nul";
 139       dbgs() << " MCSym ";
 140       if (MCSym)
 141         dbgs() << MCSym;
 142       else
 143         dbgs() << "nul";
 144       dbgs() << " JT" << JT << " Align" << Align << '\n';
 145     }
 146 #endif
 147   };
 148 }
 149
 150 namespace {
 151   //===--------------------------------------------------------------------===//
 152   /// ISel - X86-specific code to select X86 machine instructions for
 153   /// SelectionDAG operations.
 154   ///
 155   class X86DAGToDAGISel final : public SelectionDAGISel {
 156     /// Keep a pointer to the X86Subtarget around so that we can
 157     /// make the right decision when generating code for different targets.
 158     const X86Subtarget *Subtarget;
 159
 160     /// If true, selector should try to optimize for code size instead of
 161     /// performance.
 162     bool OptForSize;
 163
 164     /// If true, selector should try to optimize for minimum code size.
 165     bool OptForMinSize;
 166
 167     /// Disable direct TLS access through segment registers.
 168     bool IndirectTlsSegRefs;
 169
 170   public:
 171     explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
 172         : SelectionDAGISel(tm, OptLevel), OptForSize(false),
 173           OptForMinSize(false) {}
 174
 175     StringRef getPassName() const override {
 176       return "X86 DAG->DAG Instruction Selection";
 177     }
 178
 179     bool runOnMachineFunction(MachineFunction &MF) override {
 180       // Reset the subtarget each time through.
 181       Subtarget = &MF.getSubtarget<X86Subtarget>();
 182       IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
 183                              "indirect-tls-seg-refs");
 184       SelectionDAGISel::runOnMachineFunction(MF);
 185       return true;
 186     }
 187
 188     void EmitFunctionEntryCode() override;
 189
 190     bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
 191
 192     void PreprocessISelDAG() override;
 193     void PostprocessISelDAG() override;
 194
 195 // Include the pieces autogenerated from the target description.
 196 #include "X86GenDAGISel.inc"
 197
 198   private:
 199     void Select(SDNode *N) override;
 200
 201     bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
 202     bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
 203     bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
 204     bool matchAddress(SDValue N, X86ISelAddressMode &AM);
 205     bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
 206     bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth);
 207     bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
 208                                  unsigned Depth);
 209     bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
 210     bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
 211                     SDValue &Scale, SDValue &Index, SDValue &Disp,
 212                     SDValue &Segment);
 213     bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
 214                           SDValue &Scale, SDValue &Index, SDValue &Disp,
 215                           SDValue &Segment);
 216     bool selectMOV64Imm32(SDValue N, SDValue &Imm);
 217     bool selectLEAAddr(SDValue N, SDValue &Base,
 218                        SDValue &Scale, SDValue &Index, SDValue &Disp,
 219                        SDValue &Segment);
 220     bool selectLEA64_32Addr(SDValue N, SDValue &Base,
 221                             SDValue &Scale, SDValue &Index, SDValue &Disp,
 222                             SDValue &Segment);
 223     bool selectTLSADDRAddr(SDValue N, SDValue &Base,
 224                            SDValue &Scale, SDValue &Index, SDValue &Disp,
 225                            SDValue &Segment);
 226     bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
 227                              SDValue &Base, SDValue &Scale,
 228                              SDValue &Index, SDValue &Disp,
 229                              SDValue &Segment,
 230                              SDValue &NodeWithChain);
 231     bool selectRelocImm(SDValue N, SDValue &Op);
 232
 233     bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
 234                      SDValue &Base, SDValue &Scale,
 235                      SDValue &Index, SDValue &Disp,
 236                      SDValue &Segment);
 237
 238     // Convenience method where P is also root.
 239     bool tryFoldLoad(SDNode *P, SDValue N,
 240                      SDValue &Base, SDValue &Scale,
 241                      SDValue &Index, SDValue &Disp,
 242                      SDValue &Segment) {
 243       return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
 244     }
 245
 246     /// Implement addressing mode selection for inline asm expressions.
 247     bool SelectInlineAsmMemoryOperand(const SDValue &Op,
 248                                       unsigned ConstraintID,
 249                                       std::vector<SDValue> &OutOps) override;
 250
 251     void emitSpecialCodeForMain();
 252
 253     inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
 254                                    SDValue &Base, SDValue &Scale,
 255                                    SDValue &Index, SDValue &Disp,
 256                                    SDValue &Segment) {
 257       Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
 258                  ? CurDAG->getTargetFrameIndex(
 259                        AM.Base_FrameIndex,
 260                        TLI->getPointerTy(CurDAG->getDataLayout()))
 261                  : AM.Base_Reg;
 262       Scale = getI8Imm(AM.Scale, DL);
 263       Index = AM.IndexReg;
 264       // These are 32-bit even in 64-bit mode since RIP-relative offset
 265       // is 32-bit.
 266       if (AM.GV)
 267         Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
 268                                               MVT::i32, AM.Disp,
 269                                               AM.SymbolFlags);
 270       else if (AM.CP)
 271         Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
 272                                              AM.Align, AM.Disp, AM.SymbolFlags);
 273       else if (AM.ES) {
 274         assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
 275         Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
 276       } else if (AM.MCSym) {
 277         assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
 278         assert(AM.SymbolFlags == 0 && "oo");
 279         Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
 280       } else if (AM.JT != -1) {
 281         assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
 282         Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
 283       } else if (AM.BlockAddr)
 284         Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
 285                                              AM.SymbolFlags);
 286       else
 287         Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
 288
 289       if (AM.Segment.getNode())
 290         Segment = AM.Segment;
 291       else
 292         Segment = CurDAG->getRegister(0, MVT::i32);
 293     }
 294
 295     // Utility function to determine whether we should avoid selecting
 296     // immediate forms of instructions for better code size or not.
 297     // At a high level, we'd like to avoid such instructions when
 298     // we have similar constants used within the same basic block
 299     // that can be kept in a register.
 300     //
 301     bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
 302       uint32_t UseCount = 0;
 303
 304       // Do not want to hoist if we're not optimizing for size.
 305       // TODO: We'd like to remove this restriction.
 306       // See the comment in X86InstrInfo.td for more info.
 307       if (!OptForSize)
 308         return false;
 309
 310       // Walk all the users of the immediate.
 311       for (SDNode::use_iterator UI = N->use_begin(),
 312            UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
 313
 314         SDNode *User = *UI;
 315
 316         // This user is already selected. Count it as a legitimate use and
 317         // move on.
 318         if (User->isMachineOpcode()) {
 319           UseCount++;
 320           continue;
 321         }
 322
 323         // We want to count stores of immediates as real uses.
 324         if (User->getOpcode() == ISD::STORE &&
 325             User->getOperand(1).getNode() == N) {
 326           UseCount++;
 327           continue;
 328         }
 329
 330         // We don't currently match users that have > 2 operands (except
 331         // for stores, which are handled above)
 332         // Those instruction won't match in ISEL, for now, and would
 333         // be counted incorrectly.
 334         // This may change in the future as we add additional instruction
 335         // types.
 336         if (User->getNumOperands() != 2)
 337           continue;
 338
 339         // Immediates that are used for offsets as part of stack
 340         // manipulation should be left alone. These are typically
 341         // used to indicate SP offsets for argument passing and
 342         // will get pulled into stores/pushes (implicitly).
 343         if (User->getOpcode() == X86ISD::ADD ||
 344             User->getOpcode() == ISD::ADD    ||
 345             User->getOpcode() == X86ISD::SUB ||
 346             User->getOpcode() == ISD::SUB) {
 347
 348           // Find the other operand of the add/sub.
 349           SDValue OtherOp = User->getOperand(0);
 350           if (OtherOp.getNode() == N)
 351             OtherOp = User->getOperand(1);
 352
 353           // Don't count if the other operand is SP.
 354           RegisterSDNode *RegNode;
 355           if (OtherOp->getOpcode() == ISD::CopyFromReg &&
 356               (RegNode = dyn_cast_or_null<RegisterSDNode>(
 357                  OtherOp->getOperand(1).getNode())))
 358             if ((RegNode->getReg() == X86::ESP) ||
 359                 (RegNode->getReg() == X86::RSP))
 360               continue;
 361         }
 362
 363         // ... otherwise, count this and move on.
 364         UseCount++;
 365       }
 366
 367       // If we have more than 1 use, then recommend for hoisting.
 368       return (UseCount > 1);
 369     }
 370
 371     /// Return a target constant with the specified value of type i8.
 372     inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
 373       return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
 374     }
 375
 376     /// Return a target constant with the specified value, of type i32.
 377     inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
 378       return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
 379     }
 380
 381     /// Return a target constant with the specified value, of type i64.
 382     inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
 383       return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
 384     }
 385
 386     SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
 387                                         const SDLoc &DL) {
 388       assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
 389       uint64_t Index = N->getConstantOperandVal(1);
 390       MVT VecVT = N->getOperand(0).getSimpleValueType();
 391       return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
 392     }
 393
 394     SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
 395                                       const SDLoc &DL) {
 396       assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
 397       uint64_t Index = N->getConstantOperandVal(2);
 398       MVT VecVT = N->getSimpleValueType(0);
 399       return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
 400     }
 401
 402     /// Return an SDNode that returns the value of the global base register.
 403     /// Output instructions required to initialize the global base register,
 404     /// if necessary.
 405     SDNode *getGlobalBaseReg();
 406
 407     /// Return a reference to the TargetMachine, casted to the target-specific
 408     /// type.
 409     const X86TargetMachine &getTargetMachine() const {
 410       return static_cast<const X86TargetMachine &>(TM);
 411     }
 412
 413     /// Return a reference to the TargetInstrInfo, casted to the target-specific
 414     /// type.
 415     const X86InstrInfo *getInstrInfo() const {
 416       return Subtarget->getInstrInfo();
 417     }
 418
 419     /// Address-mode matching performs shift-of-and to and-of-shift
 420     /// reassociation in order to expose more scaled addressing
 421     /// opportunities.
 422     bool ComplexPatternFuncMutatesDAG() const override {
 423       return true;
 424     }
 425
 426     bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
 427
 428     /// Returns whether this is a relocatable immediate in the range
 429     /// [-2^Width .. 2^Width-1].
 430     template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
 431       if (auto *CN = dyn_cast<ConstantSDNode>(N))
 432         return isInt<Width>(CN->getSExtValue());
 433       return isSExtAbsoluteSymbolRef(Width, N);
 434     }
 435
 436     // Indicates we should prefer to use a non-temporal load for this load.
 437     bool useNonTemporalLoad(LoadSDNode *N) const {
 438       if (!N->isNonTemporal())
 439         return false;
 440
 441       unsigned StoreSize = N->getMemoryVT().getStoreSize();
 442
 443       if (N->getAlignment() < StoreSize)
 444         return false;
 445
 446       switch (StoreSize) {
 447       default: llvm_unreachable("Unsupported store size");
 448       case 4:
 449       case 8:
 450         return false;
 451       case 16:
 452         return Subtarget->hasSSE41();
 453       case 32:
 454         return Subtarget->hasAVX2();
 455       case 64:
 456         return Subtarget->hasAVX512();
 457       }
 458     }
 459
 460     bool foldLoadStoreIntoMemOperand(SDNode *Node);
 461     MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
 462     bool matchBitExtract(SDNode *Node);
 463     bool shrinkAndImmediate(SDNode *N);
 464     bool isMaskZeroExtended(SDNode *N) const;
 465     bool tryShiftAmountMod(SDNode *N);
 466
 467     MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
 468                                 const SDLoc &dl, MVT VT, SDNode *Node);
 469     MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
 470                                 const SDLoc &dl, MVT VT, SDNode *Node,
 471                                 SDValue &InFlag);
 472
 473     bool tryOptimizeRem8Extend(SDNode *N);
 474
 475     bool onlyUsesZeroFlag(SDValue Flags) const;
 476     bool hasNoSignFlagUses(SDValue Flags) const;
 477     bool hasNoCarryFlagUses(SDValue Flags) const;
 478   };
 479 }
 480
 481
 482 // Returns true if this masked compare can be implemented legally with this
 483 // type.
 484 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
 485   unsigned Opcode = N->getOpcode();
 486   if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
 487       Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) {
 488     // We can get 256-bit 8 element types here without VLX being enabled. When
 489     // this happens we will use 512-bit operations and the mask will not be
 490     // zero extended.
 491     EVT OpVT = N->getOperand(0).getValueType();
 492     if (OpVT.is256BitVector() || OpVT.is128BitVector())
 493       return Subtarget->hasVLX();
 494
 495     return true;
 496   }
 497   // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
 498   if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
 499       Opcode == X86ISD::FSETCCM_RND)
 500     return true;
 501
 502   return false;
 503 }
 504
 505 // Returns true if we can assume the writer of the mask has zero extended it
 506 // for us.
 507 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
 508   // If this is an AND, check if we have a compare on either side. As long as
 509   // one side guarantees the mask is zero extended, the AND will preserve those
 510   // zeros.
 511   if (N->getOpcode() == ISD::AND)
 512     return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
 513            isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
 514
 515   return isLegalMaskCompare(N, Subtarget);
 516 }
 517
 518 bool
 519 X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
 520   if (OptLevel == CodeGenOpt::None) return false;
 521
 522   if (!N.hasOneUse())
 523     return false;
 524
 525   if (N.getOpcode() != ISD::LOAD)
 526     return true;
 527
 528   // Don't fold non-temporal loads if we have an instruction for them.
 529   if (useNonTemporalLoad(cast<LoadSDNode>(N)))
 530     return false;
 531
 532   // If N is a load, do additional profitability checks.
 533   if (U == Root) {
 534     switch (U->getOpcode()) {
 535     default: break;
 536     case X86ISD::ADD:
 537     case X86ISD::ADC:
 538     case X86ISD::SUB:
 539     case X86ISD::SBB:
 540     case X86ISD::AND:
 541     case X86ISD::XOR:
 542     case X86ISD::OR:
 543     case ISD::ADD:
 544     case ISD::ADDCARRY:
 545     case ISD::AND:
 546     case ISD::OR:
 547     case ISD::XOR: {
 548       SDValue Op1 = U->getOperand(1);
 549
 550       // If the other operand is a 8-bit immediate we should fold the immediate
 551       // instead. This reduces code size.
 552       // e.g.
 553       // movl 4(%esp), %eax
 554       // addl $4, %eax
 555       // vs.
 556       // movl $4, %eax
 557       // addl 4(%esp), %eax
 558       // The former is 2 bytes shorter. In case where the increment is 1, then
 559       // the saving can be 4 bytes (by using incl %eax).
 560       if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
 561         if (Imm->getAPIntValue().isSignedIntN(8))
 562           return false;
 563
 564         // If this is a 64-bit AND with an immediate that fits in 32-bits,
 565         // prefer using the smaller and over folding the load. This is needed to
 566         // make sure immediates created by shrinkAndImmediate are always folded.
 567         // Ideally we would narrow the load during DAG combine and get the
 568         // best of both worlds.
 569         if (U->getOpcode() == ISD::AND &&
 570             Imm->getAPIntValue().getBitWidth() == 64 &&
 571             Imm->getAPIntValue().isIntN(32))
 572           return false;
 573       }
 574
 575       // If the other operand is a TLS address, we should fold it instead.
 576       // This produces
 577       // movl    %gs:0, %eax
 578       // leal    i@NTPOFF(%eax), %eax
 579       // instead of
 580       // movl    $i@NTPOFF, %eax
 581       // addl    %gs:0, %eax
 582       // if the block also has an access to a second TLS address this will save
 583       // a load.
 584       // FIXME: This is probably also true for non-TLS addresses.
 585       if (Op1.getOpcode() == X86ISD::Wrapper) {
 586         SDValue Val = Op1.getOperand(0);
 587         if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
 588           return false;
 589       }
 590
 591       // Don't fold load if this matches the BTS/BTR/BTC patterns.
 592       // BTS: (or X, (shl 1, n))
 593       // BTR: (and X, (rotl -2, n))
 594       // BTC: (xor X, (shl 1, n))
 595       if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
 596         if (U->getOperand(0).getOpcode() == ISD::SHL &&
 597             isOneConstant(U->getOperand(0).getOperand(0)))
 598           return false;
 599
 600         if (U->getOperand(1).getOpcode() == ISD::SHL &&
 601             isOneConstant(U->getOperand(1).getOperand(0)))
 602           return false;
 603       }
 604       if (U->getOpcode() == ISD::AND) {
 605         SDValue U0 = U->getOperand(0);
 606         SDValue U1 = U->getOperand(1);
 607         if (U0.getOpcode() == ISD::ROTL) {
 608           auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
 609           if (C && C->getSExtValue() == -2)
 610             return false;
 611         }
 612
 613         if (U1.getOpcode() == ISD::ROTL) {
 614           auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
 615           if (C && C->getSExtValue() == -2)
 616             return false;
 617         }
 618       }
 619
 620       break;
 621     }
 622     case ISD::SHL:
 623     case ISD::SRA:
 624     case ISD::SRL:
 625       // Don't fold a load into a shift by immediate. The BMI2 instructions
 626       // support folding a load, but not an immediate. The legacy instructions
 627       // support folding an immediate, but can't fold a load. Folding an
 628       // immediate is preferable to folding a load.
 629       if (isa<ConstantSDNode>(U->getOperand(1)))
 630         return false;
 631
 632       break;
 633     }
 634   }
 635
 636   // Prevent folding a load if this can implemented with an insert_subreg or
 637   // a move that implicitly zeroes.
 638   if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
 639       isNullConstant(Root->getOperand(2)) &&
 640       (Root->getOperand(0).isUndef() ||
 641        ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
 642     return false;
 643
 644   return true;
 645 }
 646
 647 /// Replace the original chain operand of the call with
 648 /// load's chain operand and move load below the call's chain operand.
 649 static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
 650                                SDValue Call, SDValue OrigChain) {
 651   SmallVector<SDValue, 8> Ops;
 652   SDValue Chain = OrigChain.getOperand(0);
 653   if (Chain.getNode() == Load.getNode())
 654     Ops.push_back(Load.getOperand(0));
 655   else {
 656     assert(Chain.getOpcode() == ISD::TokenFactor &&
 657            "Unexpected chain operand");
 658     for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
 659       if (Chain.getOperand(i).getNode() == Load.getNode())
 660         Ops.push_back(Load.getOperand(0));
 661       else
 662         Ops.push_back(Chain.getOperand(i));
 663     SDValue NewChain =
 664       CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
 665     Ops.clear();
 666     Ops.push_back(NewChain);
 667   }
 668   Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
 669   CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
 670   CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
 671                              Load.getOperand(1), Load.getOperand(2));
 672
 673   Ops.clear();
 674   Ops.push_back(SDValue(Load.getNode(), 1));
 675   Ops.append(Call->op_begin() + 1, Call->op_end());
 676   CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
 677 }
 678
 679 /// Return true if call address is a load and it can be
 680 /// moved below CALLSEQ_START and the chains leading up to the call.
 681 /// Return the CALLSEQ_START by reference as a second output.
 682 /// In the case of a tail call, there isn't a callseq node between the call
 683 /// chain and the load.
 684 static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
 685   // The transformation is somewhat dangerous if the call's chain was glued to
 686   // the call. After MoveBelowOrigChain the load is moved between the call and
 687   // the chain, this can create a cycle if the load is not folded. So it is
 688   // *really* important that we are sure the load will be folded.
 689   if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
 690     return false;
 691   LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
 692   if (!LD ||
 693       LD->isVolatile() ||
 694       LD->getAddressingMode() != ISD::UNINDEXED ||
 695       LD->getExtensionType() != ISD::NON_EXTLOAD)
 696     return false;
 697
 698   // Now let's find the callseq_start.
 699   while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
 700     if (!Chain.hasOneUse())
 701       return false;
 702     Chain = Chain.getOperand(0);
 703   }
 704
 705   if (!Chain.getNumOperands())
 706     return false;
 707   // Since we are not checking for AA here, conservatively abort if the chain
 708   // writes to memory. It's not safe to move the callee (a load) across a store.
 709   if (isa<MemSDNode>(Chain.getNode()) &&
 710       cast<MemSDNode>(Chain.getNode())->writeMem())
 711     return false;
 712   if (Chain.getOperand(0).getNode() == Callee.getNode())
 713     return true;
 714   if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
 715       Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
 716       Callee.getValue(1).hasOneUse())
 717     return true;
 718   return false;
 719 }
 720
 721 void X86DAGToDAGISel::PreprocessISelDAG() {
 722   // OptFor[Min]Size are used in pattern predicates that isel is matching.
 723   OptForSize = MF->getFunction().optForSize();
 724   OptForMinSize = MF->getFunction().optForMinSize();
 725   assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize");
 726
 727   for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
 728        E = CurDAG->allnodes_end(); I != E; ) {
 729     SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
 730
 731     // If this is a target specific AND node with no flag usages, turn it back
 732     // into ISD::AND to enable test instruction matching.
 733     if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
 734       SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
 735                                     N->getOperand(0), N->getOperand(1));
 736       --I;
 737       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
 738       ++I;
 739       CurDAG->DeleteNode(N);
 740       continue;
 741     }
 742
 743     if (OptLevel != CodeGenOpt::None &&
 744         // Only do this when the target can fold the load into the call or
 745         // jmp.
 746         !Subtarget->useRetpolineIndirectCalls() &&
 747         ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
 748          (N->getOpcode() == X86ISD::TC_RETURN &&
 749           (Subtarget->is64Bit() ||
 750            !getTargetMachine().isPositionIndependent())))) {
 751       /// Also try moving call address load from outside callseq_start to just
 752       /// before the call to allow it to be folded.
 753       ///
 754       ///     [Load chain]
 755       ///         ^
 756       ///         |
 757       ///       [Load]
 758       ///       ^    ^
 759       ///       |    |
 760       ///      /      \--
 761       ///     /          |
 762       ///[CALLSEQ_START] |
 763       ///     ^          |
 764       ///     |          |
 765       /// [LOAD/C2Reg]   |
 766       ///     |          |
 767       ///      \        /
 768       ///       \      /
 769       ///       [CALL]
 770       bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
 771       SDValue Chain = N->getOperand(0);
 772       SDValue Load  = N->getOperand(1);
 773       if (!isCalleeLoad(Load, Chain, HasCallSeq))
 774         continue;
 775       moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
 776       ++NumLoadMoved;
 777       continue;
 778     }
 779
 780     // Lower fpround and fpextend nodes that target the FP stack to be store and
 781     // load to the stack.  This is a gross hack.  We would like to simply mark
 782     // these as being illegal, but when we do that, legalize produces these when
 783     // it expands calls, then expands these in the same legalize pass.  We would
 784     // like dag combine to be able to hack on these between the call expansion
 785     // and the node legalization.  As such this pass basically does "really
 786     // late" legalization of these inline with the X86 isel pass.
 787     // FIXME: This should only happen when not compiled with -O0.
 788     if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
 789       continue;
 790
 791     MVT SrcVT = N->getOperand(0).getSimpleValueType();
 792     MVT DstVT = N->getSimpleValueType(0);
 793
 794     // If any of the sources are vectors, no fp stack involved.
 795     if (SrcVT.isVector() || DstVT.isVector())
 796       continue;
 797
 798     // If the source and destination are SSE registers, then this is a legal
 799     // conversion that should not be lowered.
 800     const X86TargetLowering *X86Lowering =
 801         static_cast<const X86TargetLowering *>(TLI);
 802     bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
 803     bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
 804     if (SrcIsSSE && DstIsSSE)
 805       continue;
 806
 807     if (!SrcIsSSE && !DstIsSSE) {
 808       // If this is an FPStack extension, it is a noop.
 809       if (N->getOpcode() == ISD::FP_EXTEND)
 810         continue;
 811       // If this is a value-preserving FPStack truncation, it is a noop.
 812       if (N->getConstantOperandVal(1))
 813         continue;
 814     }
 815
 816     // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
 817     // FPStack has extload and truncstore.  SSE can fold direct loads into other
 818     // operations.  Based on this, decide what we want to do.
 819     MVT MemVT;
 820     if (N->getOpcode() == ISD::FP_ROUND)
 821       MemVT = DstVT;  // FP_ROUND must use DstVT, we can't do a 'trunc load'.
 822     else
 823       MemVT = SrcIsSSE ? SrcVT : DstVT;
 824
 825     SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
 826     SDLoc dl(N);
 827
 828     // FIXME: optimize the case where the src/dest is a load or store?
 829     SDValue Store =
 830         CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
 831                               MemTmp, MachinePointerInfo(), MemVT);
 832     SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
 833                                         MachinePointerInfo(), MemVT);
 834
 835     // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
 836     // extload we created.  This will cause general havok on the dag because
 837     // anything below the conversion could be folded into other existing nodes.
 838     // To avoid invalidating 'I', back it up to the convert node.
 839     --I;
 840     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
 841
 842     // Now that we did that, the node is dead.  Increment the iterator to the
 843     // next node to process, then delete N.
 844     ++I;
 845     CurDAG->DeleteNode(N);
 846   }
 847 }
 848
 849 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
 850 bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
 851   unsigned Opc = N->getMachineOpcode();
 852   if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
 853       Opc != X86::MOVSX64rr8)
 854     return false;
 855
 856   SDValue N0 = N->getOperand(0);
 857
 858   // We need to be extracting the lower bit of an extend.
 859   if (!N0.isMachineOpcode() ||
 860       N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
 861       N0.getConstantOperandVal(1) != X86::sub_8bit)
 862     return false;
 863
 864   // We're looking for either a movsx or movzx to match the original opcode.
 865   unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
 866                                                 : X86::MOVSX32rr8_NOREX;
 867   SDValue N00 = N0.getOperand(0);
 868   if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
 869     return false;
 870
 871   if (Opc == X86::MOVSX64rr8) {
 872     // If we had a sign extend from 8 to 64 bits. We still need to go from 32
 873     // to 64.
 874     MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
 875                                                    MVT::i64, N00);
 876     ReplaceUses(N, Extend);
 877   } else {
 878     // Ok we can drop this extend and just use the original extend.
 879     ReplaceUses(N, N00.getNode());
 880   }
 881
 882   return true;
 883 }
 884
 885 void X86DAGToDAGISel::PostprocessISelDAG() {
 886   // Skip peepholes at -O0.
 887   if (TM.getOptLevel() == CodeGenOpt::None)
 888     return;
 889
 890   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
 891
 892   bool MadeChange = false;
 893   while (Position != CurDAG->allnodes_begin()) {
 894     SDNode *N = &*--Position;
 895     // Skip dead nodes and any non-machine opcodes.
 896     if (N->use_empty() || !N->isMachineOpcode())
 897       continue;
 898
 899     if (tryOptimizeRem8Extend(N)) {
 900       MadeChange = true;
 901       continue;
 902     }
 903
 904     // Look for a TESTrr+ANDrr pattern where both operands of the test are
 905     // the same. Rewrite to remove the AND.
 906     unsigned Opc = N->getMachineOpcode();
 907     if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
 908          Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
 909         N->getOperand(0) == N->getOperand(1) &&
 910         N->isOnlyUserOf(N->getOperand(0).getNode()) &&
 911         N->getOperand(0).isMachineOpcode()) {
 912       SDValue And = N->getOperand(0);
 913       unsigned N0Opc = And.getMachineOpcode();
 914       if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
 915           N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) {
 916         MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
 917                                                      MVT::i32,
 918                                                      And.getOperand(0),
 919                                                      And.getOperand(1));
 920         ReplaceUses(N, Test);
 921         MadeChange = true;
 922         continue;
 923       }
 924       if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
 925           N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) {
 926         unsigned NewOpc;
 927         switch (N0Opc) {
 928         case X86::AND8rm:  NewOpc = X86::TEST8mr; break;
 929         case X86::AND16rm: NewOpc = X86::TEST16mr; break;
 930         case X86::AND32rm: NewOpc = X86::TEST32mr; break;
 931         case X86::AND64rm: NewOpc = X86::TEST64mr; break;
 932         }
 933
 934         // Need to swap the memory and register operand.
 935         SDValue Ops[] = { And.getOperand(1),
 936                           And.getOperand(2),
 937                           And.getOperand(3),
 938                           And.getOperand(4),
 939                           And.getOperand(5),
 940                           And.getOperand(0),
 941                           And.getOperand(6)  /* Chain */ };
 942         MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
 943                                                      MVT::i32, MVT::Other, Ops);
 944         ReplaceUses(N, Test);
 945         MadeChange = true;
 946         continue;
 947       }
 948     }
 949
 950     // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
 951     // used. We're doing this late so we can prefer to fold the AND into masked
 952     // comparisons. Doing that can be better for the live range of the mask
 953     // register.
 954     if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
 955          Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
 956         N->getOperand(0) == N->getOperand(1) &&
 957         N->isOnlyUserOf(N->getOperand(0).getNode()) &&
 958         N->getOperand(0).isMachineOpcode() &&
 959         onlyUsesZeroFlag(SDValue(N, 0))) {
 960       SDValue And = N->getOperand(0);
 961       unsigned N0Opc = And.getMachineOpcode();
 962       // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
 963       // KAND instructions and KTEST use the same ISA feature.
 964       if (N0Opc == X86::KANDBrr ||
 965           (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
 966           N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
 967         unsigned NewOpc;
 968         switch (Opc) {
 969         default: llvm_unreachable("Unexpected opcode!");
 970         case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
 971         case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
 972         case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
 973         case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
 974         }
 975         MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
 976                                                       MVT::i32,
 977                                                       And.getOperand(0),
 978                                                       And.getOperand(1));
 979         ReplaceUses(N, KTest);
 980         MadeChange = true;
 981         continue;
 982       }
 983     }
 984
 985     // Attempt to remove vectors moves that were inserted to zero upper bits.
 986     if (Opc != TargetOpcode::SUBREG_TO_REG)
 987       continue;
 988
 989     unsigned SubRegIdx = N->getConstantOperandVal(2);
 990     if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
 991       continue;
 992
 993     SDValue Move = N->getOperand(1);
 994     if (!Move.isMachineOpcode())
 995       continue;
 996
 997     // Make sure its one of the move opcodes we recognize.
 998     switch (Move.getMachineOpcode()) {
 999     default:
1000       continue;
1001     case X86::VMOVAPDrr:       case X86::VMOVUPDrr:
1002     case X86::VMOVAPSrr:       case X86::VMOVUPSrr:
1003     case X86::VMOVDQArr:       case X86::VMOVDQUrr:
1004     case X86::VMOVAPDYrr:      case X86::VMOVUPDYrr:
1005     case X86::VMOVAPSYrr:      case X86::VMOVUPSYrr:
1006     case X86::VMOVDQAYrr:      case X86::VMOVDQUYrr:
1007     case X86::VMOVAPDZ128rr:   case X86::VMOVUPDZ128rr:
1008     case X86::VMOVAPSZ128rr:   case X86::VMOVUPSZ128rr:
1009     case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
1010     case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
1011     case X86::VMOVAPDZ256rr:   case X86::VMOVUPDZ256rr:
1012     case X86::VMOVAPSZ256rr:   case X86::VMOVUPSZ256rr:
1013     case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
1014     case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
1015       break;
1016     }
1017
1018     SDValue In = Move.getOperand(0);
1019     if (!In.isMachineOpcode() ||
1020         In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1021       continue;
1022
1023     // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1024     // the SHA instructions which use a legacy encoding.
1025     uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1026     if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1027         (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1028         (TSFlags & X86II::EncodingMask) != X86II::XOP)
1029       continue;
1030
1031     // Producing instruction is another vector instruction. We can drop the
1032     // move.
1033     CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1034     MadeChange = true;
1035   }
1036
1037   if (MadeChange)
1038     CurDAG->RemoveDeadNodes();
1039 }
1040
1041
1042 /// Emit any code that needs to be executed only in the main function.
1043 void X86DAGToDAGISel::emitSpecialCodeForMain() {
1044   if (Subtarget->isTargetCygMing()) {
1045     TargetLowering::ArgListTy Args;
1046     auto &DL = CurDAG->getDataLayout();
1047
1048     TargetLowering::CallLoweringInfo CLI(*CurDAG);
1049     CLI.setChain(CurDAG->getRoot())
1050         .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1051                    CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1052                    std::move(Args));
1053     const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1054     std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1055     CurDAG->setRoot(Result.second);
1056   }
1057 }
1058
1059 void X86DAGToDAGISel::EmitFunctionEntryCode() {
1060   // If this is main, emit special code for main.
1061   const Function &F = MF->getFunction();
1062   if (F.hasExternalLinkage() && F.getName() == "main")
1063     emitSpecialCodeForMain();
1064 }
1065
1066 static bool isDispSafeForFrameIndex(int64_t Val) {
1067   // On 64-bit platforms, we can run into an issue where a frame index
1068   // includes a displacement that, when added to the explicit displacement,
1069   // will overflow the displacement field. Assuming that the frame index
1070   // displacement fits into a 31-bit integer  (which is only slightly more
1071   // aggressive than the current fundamental assumption that it fits into
1072   // a 32-bit integer), a 31-bit disp should always be safe.
1073   return isInt<31>(Val);
1074 }
1075
1076 bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1077                                             X86ISelAddressMode &AM) {
1078   // If there's no offset to fold, we don't need to do any work.
1079   if (Offset == 0)
1080     return false;
1081
1082   // Cannot combine ExternalSymbol displacements with integer offsets.
1083   if (AM.ES || AM.MCSym)
1084     return true;
1085
1086   int64_t Val = AM.Disp + Offset;
1087   CodeModel::Model M = TM.getCodeModel();
1088   if (Subtarget->is64Bit()) {
1089     if (!X86::isOffsetSuitableForCodeModel(Val, M,
1090                                            AM.hasSymbolicDisplacement()))
1091       return true;
1092     // In addition to the checks required for a register base, check that
1093     // we do not try to use an unsafe Disp with a frame index.
1094     if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1095         !isDispSafeForFrameIndex(Val))
1096       return true;
1097   }
1098   AM.Disp = Val;
1099   return false;
1100
1101 }
1102
1103 bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
1104   SDValue Address = N->getOperand(1);
1105
1106   // load gs:0 -> GS segment register.
1107   // load fs:0 -> FS segment register.
1108   //
1109   // This optimization is valid because the GNU TLS model defines that
1110   // gs:0 (or fs:0 on X86-64) contains its own address.
1111   // For more information see http://people.redhat.com/drepper/tls.pdf
1112   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
1113     if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
1114         !IndirectTlsSegRefs &&
1115         (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1116          Subtarget->isTargetFuchsia()))
1117       switch (N->getPointerInfo().getAddrSpace()) {
1118       case 256:
1119         AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1120         return false;
1121       case 257:
1122         AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1123         return false;
1124       // Address space 258 is not handled here, because it is not used to
1125       // address TLS areas.
1126       }
1127
1128   return true;
1129 }
1130
1131 /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1132 /// mode. These wrap things that will resolve down into a symbol reference.
1133 /// If no match is possible, this returns true, otherwise it returns false.
1134 bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1135   // If the addressing mode already has a symbol as the displacement, we can
1136   // never match another symbol.
1137   if (AM.hasSymbolicDisplacement())
1138     return true;
1139
1140   bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1141
1142   // We can't use an addressing mode in the 64-bit large code model. In the
1143   // medium code model, we use can use an mode when RIP wrappers are present.
1144   // That signifies access to globals that are known to be "near", such as the
1145   // GOT itself.
1146   CodeModel::Model M = TM.getCodeModel();
1147   if (Subtarget->is64Bit() &&
1148       (M == CodeModel::Large || (M == CodeModel::Medium && !IsRIPRel)))
1149     return true;
1150
1151   // Base and index reg must be 0 in order to use %rip as base.
1152   if (IsRIPRel && AM.hasBaseOrIndexReg())
1153     return true;
1154
1155   // Make a local copy in case we can't do this fold.
1156   X86ISelAddressMode Backup = AM;
1157
1158   int64_t Offset = 0;
1159   SDValue N0 = N.getOperand(0);
1160   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1161     AM.GV = G->getGlobal();
1162     AM.SymbolFlags = G->getTargetFlags();
1163     Offset = G->getOffset();
1164   } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1165     AM.CP = CP->getConstVal();
1166     AM.Align = CP->getAlignment();
1167     AM.SymbolFlags = CP->getTargetFlags();
1168     Offset = CP->getOffset();
1169   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1170     AM.ES = S->getSymbol();
1171     AM.SymbolFlags = S->getTargetFlags();
1172   } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1173     AM.MCSym = S->getMCSymbol();
1174   } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
1175     AM.JT = J->getIndex();
1176     AM.SymbolFlags = J->getTargetFlags();
1177   } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1178     AM.BlockAddr = BA->getBlockAddress();
1179     AM.SymbolFlags = BA->getTargetFlags();
1180     Offset = BA->getOffset();
1181   } else
1182     llvm_unreachable("Unhandled symbol reference node.");
1183
1184   if (foldOffsetIntoAddress(Offset, AM)) {
1185     AM = Backup;
1186     return true;
1187   }
1188
1189   if (IsRIPRel)
1190     AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1191
1192   // Commit the changes now that we know this fold is safe.
1193   return false;
1194 }
1195
1196 /// Add the specified node to the specified addressing mode, returning true if
1197 /// it cannot be done. This just pattern matches for the addressing mode.
1198 bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1199   if (matchAddressRecursively(N, AM, 0))
1200     return true;
1201
1202   // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1203   // a smaller encoding and avoids a scaled-index.
1204   if (AM.Scale == 2 &&
1205       AM.BaseType == X86ISelAddressMode::RegBase &&
1206       AM.Base_Reg.getNode() == nullptr) {
1207     AM.Base_Reg = AM.IndexReg;
1208     AM.Scale = 1;
1209   }
1210
1211   // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1212   // because it has a smaller encoding.
1213   // TODO: Which other code models can use this?
1214   if (TM.getCodeModel() == CodeModel::Small &&
1215       Subtarget->is64Bit() &&
1216       AM.Scale == 1 &&
1217       AM.BaseType == X86ISelAddressMode::RegBase &&
1218       AM.Base_Reg.getNode() == nullptr &&
1219       AM.IndexReg.getNode() == nullptr &&
1220       AM.SymbolFlags == X86II::MO_NO_FLAG &&
1221       AM.hasSymbolicDisplacement())
1222     AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1223
1224   return false;
1225 }
1226
1227 bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM,
1228                                unsigned Depth) {
1229   // Add an artificial use to this node so that we can keep track of
1230   // it if it gets CSE'd with a different node.
1231   HandleSDNode Handle(N);
1232
1233   X86ISelAddressMode Backup = AM;
1234   if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1235       !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
1236     return false;
1237   AM = Backup;
1238
1239   // Try again after commuting the operands.
1240   if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
1241       !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
1242     return false;
1243   AM = Backup;
1244
1245   // If we couldn't fold both operands into the address at the same time,
1246   // see if we can just put each operand into a register and fold at least
1247   // the add.
1248   if (AM.BaseType == X86ISelAddressMode::RegBase &&
1249       !AM.Base_Reg.getNode() &&
1250       !AM.IndexReg.getNode()) {
1251     N = Handle.getValue();
1252     AM.Base_Reg = N.getOperand(0);
1253     AM.IndexReg = N.getOperand(1);
1254     AM.Scale = 1;
1255     return false;
1256   }
1257   N = Handle.getValue();
1258   return true;
1259 }
1260
1261 // Insert a node into the DAG at least before the Pos node's position. This
1262 // will reposition the node as needed, and will assign it a node ID that is <=
1263 // the Pos node's ID. Note that this does *not* preserve the uniqueness of node
1264 // IDs! The selection DAG must no longer depend on their uniqueness when this
1265 // is used.
1266 static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
1267   if (N->getNodeId() == -1 ||
1268       (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
1269        SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
1270     DAG.RepositionNode(Pos->getIterator(), N.getNode());
1271     // Mark Node as invalid for pruning as after this it may be a successor to a
1272     // selected node but otherwise be in the same position of Pos.
1273     // Conservatively mark it with the same -abs(Id) to assure node id
1274     // invariant is preserved.
1275     N->setNodeId(Pos->getNodeId());
1276     SelectionDAGISel::InvalidateNodeId(N.getNode());
1277   }
1278 }
1279
1280 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
1281 // safe. This allows us to convert the shift and and into an h-register
1282 // extract and a scaled index. Returns false if the simplification is
1283 // performed.
1284 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
1285                                       uint64_t Mask,
1286                                       SDValue Shift, SDValue X,
1287                                       X86ISelAddressMode &AM) {
1288   if (Shift.getOpcode() != ISD::SRL ||
1289       !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1290       !Shift.hasOneUse())
1291     return true;
1292
1293   int ScaleLog = 8 - Shift.getConstantOperandVal(1);
1294   if (ScaleLog <= 0 || ScaleLog >= 4 ||
1295       Mask != (0xffu << ScaleLog))
1296     return true;
1297
1298   MVT VT = N.getSimpleValueType();
1299   SDLoc DL(N);
1300   SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
1301   SDValue NewMask = DAG.getConstant(0xff, DL, VT);
1302   SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
1303   SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
1304   SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
1305   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
1306
1307   // Insert the new nodes into the topological ordering. We must do this in
1308   // a valid topological ordering as nothing is going to go back and re-sort
1309   // these nodes. We continually insert before 'N' in sequence as this is
1310   // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1311   // hierarchy left to express.
1312   insertDAGNode(DAG, N, Eight);
1313   insertDAGNode(DAG, N, Srl);
1314   insertDAGNode(DAG, N, NewMask);
1315   insertDAGNode(DAG, N, And);
1316   insertDAGNode(DAG, N, ShlCount);
1317   insertDAGNode(DAG, N, Shl);
1318   DAG.ReplaceAllUsesWith(N, Shl);
1319   AM.IndexReg = And;
1320   AM.Scale = (1 << ScaleLog);
1321   return false;
1322 }
1323
1324 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
1325 // allows us to fold the shift into this addressing mode. Returns false if the
1326 // transform succeeded.
1327 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
1328                                         uint64_t Mask,
1329                                         SDValue Shift, SDValue X,
1330                                         X86ISelAddressMode &AM) {
1331   if (Shift.getOpcode() != ISD::SHL ||
1332       !isa<ConstantSDNode>(Shift.getOperand(1)))
1333     return true;
1334
1335   // Not likely to be profitable if either the AND or SHIFT node has more
1336   // than one use (unless all uses are for address computation). Besides,
1337   // isel mechanism requires their node ids to be reused.
1338   if (!N.hasOneUse() || !Shift.hasOneUse())
1339     return true;
1340
1341   // Verify that the shift amount is something we can fold.
1342   unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1343   if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
1344     return true;
1345
1346   MVT VT = N.getSimpleValueType();
1347   SDLoc DL(N);
1348   SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
1349   SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
1350   SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
1351
1352   // Insert the new nodes into the topological ordering. We must do this in
1353   // a valid topological ordering as nothing is going to go back and re-sort
1354   // these nodes. We continually insert before 'N' in sequence as this is
1355   // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1356   // hierarchy left to express.
1357   insertDAGNode(DAG, N, NewMask);
1358   insertDAGNode(DAG, N, NewAnd);
1359   insertDAGNode(DAG, N, NewShift);
1360   DAG.ReplaceAllUsesWith(N, NewShift);
1361
1362   AM.Scale = 1 << ShiftAmt;
1363   AM.IndexReg = NewAnd;
1364   return false;
1365 }
1366
1367 // Implement some heroics to detect shifts of masked values where the mask can
1368 // be replaced by extending the shift and undoing that in the addressing mode
1369 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
1370 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
1371 // the addressing mode. This results in code such as:
1372 //
1373 //   int f(short *y, int *lookup_table) {
1374 //     ...
1375 //     return *y + lookup_table[*y >> 11];
1376 //   }
1377 //
1378 // Turning into:
1379 //   movzwl (%rdi), %eax
1380 //   movl %eax, %ecx
1381 //   shrl $11, %ecx
1382 //   addl (%rsi,%rcx,4), %eax
1383 //
1384 // Instead of:
1385 //   movzwl (%rdi), %eax
1386 //   movl %eax, %ecx
1387 //   shrl $9, %ecx
1388 //   andl $124, %rcx
1389 //   addl (%rsi,%rcx), %eax
1390 //
1391 // Note that this function assumes the mask is provided as a mask *after* the
1392 // value is shifted. The input chain may or may not match that, but computing
1393 // such a mask is trivial.
1394 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
1395                                     uint64_t Mask,
1396                                     SDValue Shift, SDValue X,
1397                                     X86ISelAddressMode &AM) {
1398   if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
1399       !isa<ConstantSDNode>(Shift.getOperand(1)))
1400     return true;
1401
1402   unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1403   unsigned MaskLZ = countLeadingZeros(Mask);
1404   unsigned MaskTZ = countTrailingZeros(Mask);
1405
1406   // The amount of shift we're trying to fit into the addressing mode is taken
1407   // from the trailing zeros of the mask.
1408   unsigned AMShiftAmt = MaskTZ;
1409
1410   // There is nothing we can do here unless the mask is removing some bits.
1411   // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1412   if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
1413
1414   // We also need to ensure that mask is a continuous run of bits.
1415   if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
1416
1417   // Scale the leading zero count down based on the actual size of the value.
1418   // Also scale it down based on the size of the shift.
1419   unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
1420   if (MaskLZ < ScaleDown)
1421     return true;
1422   MaskLZ -= ScaleDown;
1423
1424   // The final check is to ensure that any masked out high bits of X are
1425   // already known to be zero. Otherwise, the mask has a semantic impact
1426   // other than masking out a couple of low bits. Unfortunately, because of
1427   // the mask, zero extensions will be removed from operands in some cases.
1428   // This code works extra hard to look through extensions because we can
1429   // replace them with zero extensions cheaply if necessary.
1430   bool ReplacingAnyExtend = false;
1431   if (X.getOpcode() == ISD::ANY_EXTEND) {
1432     unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
1433                           X.getOperand(0).getSimpleValueType().getSizeInBits();
1434     // Assume that we'll replace the any-extend with a zero-extend, and
1435     // narrow the search to the extended value.
1436     X = X.getOperand(0);
1437     MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
1438     ReplacingAnyExtend = true;
1439   }
1440   APInt MaskedHighBits =
1441     APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
1442   KnownBits Known = DAG.computeKnownBits(X);
1443   if (MaskedHighBits != Known.Zero) return true;
1444
1445   // We've identified a pattern that can be transformed into a single shift
1446   // and an addressing mode. Make it so.
1447   MVT VT = N.getSimpleValueType();
1448   if (ReplacingAnyExtend) {
1449     assert(X.getValueType() != VT);
1450     // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
1451     SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
1452     insertDAGNode(DAG, N, NewX);
1453     X = NewX;
1454   }
1455   SDLoc DL(N);
1456   SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1457   SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1458   SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1459   SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
1460
1461   // Insert the new nodes into the topological ordering. We must do this in
1462   // a valid topological ordering as nothing is going to go back and re-sort
1463   // these nodes. We continually insert before 'N' in sequence as this is
1464   // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1465   // hierarchy left to express.
1466   insertDAGNode(DAG, N, NewSRLAmt);
1467   insertDAGNode(DAG, N, NewSRL);
1468   insertDAGNode(DAG, N, NewSHLAmt);
1469   insertDAGNode(DAG, N, NewSHL);
1470   DAG.ReplaceAllUsesWith(N, NewSHL);
1471
1472   AM.Scale = 1 << AMShiftAmt;
1473   AM.IndexReg = NewSRL;
1474   return false;
1475 }
1476
1477 // Transform "(X >> SHIFT) & (MASK << C1)" to
1478 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
1479 // matched to a BEXTR later. Returns false if the simplification is performed.
1480 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
1481                                    uint64_t Mask,
1482                                    SDValue Shift, SDValue X,
1483                                    X86ISelAddressMode &AM,
1484                                    const X86Subtarget &Subtarget) {
1485   if (Shift.getOpcode() != ISD::SRL ||
1486       !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1487       !Shift.hasOneUse() || !N.hasOneUse())
1488     return true;
1489
1490   // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
1491   if (!Subtarget.hasTBM() &&
1492       !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
1493     return true;
1494
1495   // We need to ensure that mask is a continuous run of bits.
1496   if (!isShiftedMask_64(Mask)) return true;
1497
1498   unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1499
1500   // The amount of shift we're trying to fit into the addressing mode is taken
1501   // from the trailing zeros of the mask.
1502   unsigned AMShiftAmt = countTrailingZeros(Mask);
1503
1504   // There is nothing we can do here unless the mask is removing some bits.
1505   // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1506   if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
1507
1508   MVT VT = N.getSimpleValueType();
1509   SDLoc DL(N);
1510   SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1511   SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1512   SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
1513   SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
1514   SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1515   SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
1516
1517   // Insert the new nodes into the topological ordering. We must do this in
1518   // a valid topological ordering as nothing is going to go back and re-sort
1519   // these nodes. We continually insert before 'N' in sequence as this is
1520   // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1521   // hierarchy left to express.
1522   insertDAGNode(DAG, N, NewSRLAmt);
1523   insertDAGNode(DAG, N, NewSRL);
1524   insertDAGNode(DAG, N, NewMask);
1525   insertDAGNode(DAG, N, NewAnd);
1526   insertDAGNode(DAG, N, NewSHLAmt);
1527   insertDAGNode(DAG, N, NewSHL);
1528   DAG.ReplaceAllUsesWith(N, NewSHL);
1529
1530   AM.Scale = 1 << AMShiftAmt;
1531   AM.IndexReg = NewAnd;
1532   return false;
1533 }
1534
1535 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
1536                                               unsigned Depth) {
1537   SDLoc dl(N);
1538   LLVM_DEBUG({
1539     dbgs() << "MatchAddress: ";
1540     AM.dump(CurDAG);
1541   });
1542   // Limit recursion.
1543   if (Depth > 5)
1544     return matchAddressBase(N, AM);
1545
1546   // If this is already a %rip relative address, we can only merge immediates
1547   // into it.  Instead of handling this in every case, we handle it here.
1548   // RIP relative addressing: %rip + 32-bit displacement!
1549   if (AM.isRIPRelative()) {
1550     // FIXME: JumpTable and ExternalSymbol address currently don't like
1551     // displacements.  It isn't very important, but this should be fixed for
1552     // consistency.
1553     if (!(AM.ES || AM.MCSym) && AM.JT != -1)
1554       return true;
1555
1556     if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
1557       if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
1558         return false;
1559     return true;
1560   }
1561
1562   switch (N.getOpcode()) {
1563   default: break;
1564   case ISD::LOCAL_RECOVER: {
1565     if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
1566       if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
1567         // Use the symbol and don't prefix it.
1568         AM.MCSym = ESNode->getMCSymbol();
1569         return false;
1570       }
1571     break;
1572   }
1573   case ISD::Constant: {
1574     uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
1575     if (!foldOffsetIntoAddress(Val, AM))
1576       return false;
1577     break;
1578   }
1579
1580   case X86ISD::Wrapper:
1581   case X86ISD::WrapperRIP:
1582     if (!matchWrapper(N, AM))
1583       return false;
1584     break;
1585
1586   case ISD::LOAD:
1587     if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
1588       return false;
1589     break;
1590
1591   case ISD::FrameIndex:
1592     if (AM.BaseType == X86ISelAddressMode::RegBase &&
1593         AM.Base_Reg.getNode() == nullptr &&
1594         (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
1595       AM.BaseType = X86ISelAddressMode::FrameIndexBase;
1596       AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
1597       return false;
1598     }
1599     break;
1600
1601   case ISD::SHL:
1602     if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
1603       break;
1604
1605     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
1606       unsigned Val = CN->getZExtValue();
1607       // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
1608       // that the base operand remains free for further matching. If
1609       // the base doesn't end up getting used, a post-processing step
1610       // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
1611       if (Val == 1 || Val == 2 || Val == 3) {
1612         AM.Scale = 1 << Val;
1613         SDValue ShVal = N.getOperand(0);
1614
1615         // Okay, we know that we have a scale by now.  However, if the scaled
1616         // value is an add of something and a constant, we can fold the
1617         // constant into the disp field here.
1618         if (CurDAG->isBaseWithConstantOffset(ShVal)) {
1619           AM.IndexReg = ShVal.getOperand(0);
1620           ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
1621           uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
1622           if (!foldOffsetIntoAddress(Disp, AM))
1623             return false;
1624         }
1625
1626         AM.IndexReg = ShVal;
1627         return false;
1628       }
1629     }
1630     break;
1631
1632   case ISD::SRL: {
1633     // Scale must not be used already.
1634     if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
1635
1636     SDValue And = N.getOperand(0);
1637     if (And.getOpcode() != ISD::AND) break;
1638     SDValue X = And.getOperand(0);
1639
1640     // We only handle up to 64-bit values here as those are what matter for
1641     // addressing mode optimizations.
1642     if (X.getSimpleValueType().getSizeInBits() > 64) break;
1643
1644     // The mask used for the transform is expected to be post-shift, but we
1645     // found the shift first so just apply the shift to the mask before passing
1646     // it down.
1647     if (!isa<ConstantSDNode>(N.getOperand(1)) ||
1648         !isa<ConstantSDNode>(And.getOperand(1)))
1649       break;
1650     uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
1651
1652     // Try to fold the mask and shift into the scale, and return false if we
1653     // succeed.
1654     if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
1655       return false;
1656     break;
1657   }
1658
1659   case ISD::SMUL_LOHI:
1660   case ISD::UMUL_LOHI:
1661     // A mul_lohi where we need the low part can be folded as a plain multiply.
1662     if (N.getResNo() != 0) break;
1663     LLVM_FALLTHROUGH;
1664   case ISD::MUL:
1665   case X86ISD::MUL_IMM:
1666     // X*[3,5,9] -> X+X*[2,4,8]
1667     if (AM.BaseType == X86ISelAddressMode::RegBase &&
1668         AM.Base_Reg.getNode() == nullptr &&
1669         AM.IndexReg.getNode() == nullptr) {
1670       if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
1671         if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
1672             CN->getZExtValue() == 9) {
1673           AM.Scale = unsigned(CN->getZExtValue())-1;
1674
1675           SDValue MulVal = N.getOperand(0);
1676           SDValue Reg;
1677
1678           // Okay, we know that we have a scale by now.  However, if the scaled
1679           // value is an add of something and a constant, we can fold the
1680           // constant into the disp field here.
1681           if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
1682               isa<ConstantSDNode>(MulVal.getOperand(1))) {
1683             Reg = MulVal.getOperand(0);
1684             ConstantSDNode *AddVal =
1685               cast<ConstantSDNode>(MulVal.getOperand(1));
1686             uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
1687             if (foldOffsetIntoAddress(Disp, AM))
1688               Reg = N.getOperand(0);
1689           } else {
1690             Reg = N.getOperand(0);
1691           }
1692
1693           AM.IndexReg = AM.Base_Reg = Reg;
1694           return false;
1695         }
1696     }
1697     break;
1698
1699   case ISD::SUB: {
1700     // Given A-B, if A can be completely folded into the address and
1701     // the index field with the index field unused, use -B as the index.
1702     // This is a win if a has multiple parts that can be folded into
1703     // the address. Also, this saves a mov if the base register has
1704     // other uses, since it avoids a two-address sub instruction, however
1705     // it costs an additional mov if the index register has other uses.
1706
1707     // Add an artificial use to this node so that we can keep track of
1708     // it if it gets CSE'd with a different node.
1709     HandleSDNode Handle(N);
1710
1711     // Test if the LHS of the sub can be folded.
1712     X86ISelAddressMode Backup = AM;
1713     if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
1714       AM = Backup;
1715       break;
1716     }
1717     // Test if the index field is free for use.
1718     if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
1719       AM = Backup;
1720       break;
1721     }
1722
1723     int Cost = 0;
1724     SDValue RHS = Handle.getValue().getOperand(1);
1725     // If the RHS involves a register with multiple uses, this
1726     // transformation incurs an extra mov, due to the neg instruction
1727     // clobbering its operand.
1728     if (!RHS.getNode()->hasOneUse() ||
1729         RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
1730         RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
1731         RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
1732         (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
1733          RHS.getOperand(0).getValueType() == MVT::i32))
1734       ++Cost;
1735     // If the base is a register with multiple uses, this
1736     // transformation may save a mov.
1737     // FIXME: Don't rely on DELETED_NODEs.
1738     if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
1739          AM.Base_Reg->getOpcode() != ISD::DELETED_NODE &&
1740          !AM.Base_Reg.getNode()->hasOneUse()) ||
1741         AM.BaseType == X86ISelAddressMode::FrameIndexBase)
1742       --Cost;
1743     // If the folded LHS was interesting, this transformation saves
1744     // address arithmetic.
1745     if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
1746         ((AM.Disp != 0) && (Backup.Disp == 0)) +
1747         (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
1748       --Cost;
1749     // If it doesn't look like it may be an overall win, don't do it.
1750     if (Cost >= 0) {
1751       AM = Backup;
1752       break;
1753     }
1754
1755     // Ok, the transformation is legal and appears profitable. Go for it.
1756     SDValue Zero = CurDAG->getConstant(0, dl, N.getValueType());
1757     SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
1758     AM.IndexReg = Neg;
1759     AM.Scale = 1;
1760
1761     // Insert the new nodes into the topological ordering.
1762     insertDAGNode(*CurDAG, Handle.getValue(), Zero);
1763     insertDAGNode(*CurDAG, Handle.getValue(), Neg);
1764     return false;
1765   }
1766
1767   case ISD::ADD:
1768     if (!matchAdd(N, AM, Depth))
1769       return false;
1770     break;
1771
1772   case ISD::OR:
1773     // We want to look through a transform in InstCombine and DAGCombiner that
1774     // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
1775     // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
1776     // An 'lea' can then be used to match the shift (multiply) and add:
1777     // and $1, %esi
1778     // lea (%rsi, %rdi, 8), %rax
1779     if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
1780         !matchAdd(N, AM, Depth))
1781       return false;
1782     break;
1783
1784   case ISD::AND: {
1785     // Perform some heroic transforms on an and of a constant-count shift
1786     // with a constant to enable use of the scaled offset field.
1787
1788     // Scale must not be used already.
1789     if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
1790
1791     SDValue Shift = N.getOperand(0);
1792     if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
1793     SDValue X = Shift.getOperand(0);
1794
1795     // We only handle up to 64-bit values here as those are what matter for
1796     // addressing mode optimizations.
1797     if (X.getSimpleValueType().getSizeInBits() > 64) break;
1798
1799     if (!isa<ConstantSDNode>(N.getOperand(1)))
1800       break;
1801     uint64_t Mask = N.getConstantOperandVal(1);
1802
1803     // Try to fold the mask and shift into an extract and scale.
1804     if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
1805       return false;
1806
1807     // Try to fold the mask and shift directly into the scale.
1808     if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
1809       return false;
1810
1811     // Try to swap the mask and shift to place shifts which can be done as
1812     // a scale on the outside of the mask.
1813     if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
1814       return false;
1815
1816     // Try to fold the mask and shift into BEXTR and scale.
1817     if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
1818       return false;
1819
1820     break;
1821   }
1822   }
1823
1824   return matchAddressBase(N, AM);
1825 }
1826
1827 /// Helper for MatchAddress. Add the specified node to the
1828 /// specified addressing mode without any further recursion.
1829 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
1830   // Is the base register already occupied?
1831   if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
1832     // If so, check to see if the scale index register is set.
1833     if (!AM.IndexReg.getNode()) {
1834       AM.IndexReg = N;
1835       AM.Scale = 1;
1836       return false;
1837     }
1838
1839     // Otherwise, we cannot select it.
1840     return true;
1841   }
1842
1843   // Default, generate it as a register.
1844   AM.BaseType = X86ISelAddressMode::RegBase;
1845   AM.Base_Reg = N;
1846   return false;
1847 }
1848
1849 /// Helper for selectVectorAddr. Handles things that can be folded into a
1850 /// gather scatter address. The index register and scale should have already
1851 /// been handled.
1852 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
1853   // TODO: Support other operations.
1854   switch (N.getOpcode()) {
1855   case ISD::Constant: {
1856     uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
1857     if (!foldOffsetIntoAddress(Val, AM))
1858       return false;
1859     break;
1860   }
1861   case X86ISD::Wrapper:
1862     if (!matchWrapper(N, AM))
1863       return false;
1864     break;
1865   }
1866
1867   return matchAddressBase(N, AM);
1868 }
1869
1870 bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
1871                                        SDValue &Scale, SDValue &Index,
1872                                        SDValue &Disp, SDValue &Segment) {
1873   X86ISelAddressMode AM;
1874   auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
1875   AM.IndexReg = Mgs->getIndex();
1876   AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
1877
1878   unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
1879   // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
1880   if (AddrSpace == 256)
1881     AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1882   if (AddrSpace == 257)
1883     AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1884   if (AddrSpace == 258)
1885     AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
1886
1887   // Try to match into the base and displacement fields.
1888   if (matchVectorAddress(N, AM))
1889     return false;
1890
1891   MVT VT = N.getSimpleValueType();
1892   if (AM.BaseType == X86ISelAddressMode::RegBase) {
1893     if (!AM.Base_Reg.getNode())
1894       AM.Base_Reg = CurDAG->getRegister(0, VT);
1895   }
1896
1897   getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
1898   return true;
1899 }
1900
1901 /// Returns true if it is able to pattern match an addressing mode.
1902 /// It returns the operands which make up the maximal addressing mode it can
1903 /// match by reference.
1904 ///
1905 /// Parent is the parent node of the addr operand that is being matched.  It
1906 /// is always a load, store, atomic node, or null.  It is only null when
1907 /// checking memory operands for inline asm nodes.
1908 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
1909                                  SDValue &Scale, SDValue &Index,
1910                                  SDValue &Disp, SDValue &Segment) {
1911   X86ISelAddressMode AM;
1912
1913   if (Parent &&
1914       // This list of opcodes are all the nodes that have an "addr:$ptr" operand
1915       // that are not a MemSDNode, and thus don't have proper addrspace info.
1916       Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
1917       Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
1918       Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
1919       Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
1920       Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
1921     unsigned AddrSpace =
1922       cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
1923     // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
1924     if (AddrSpace == 256)
1925       AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1926     if (AddrSpace == 257)
1927       AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1928     if (AddrSpace == 258)
1929       AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
1930   }
1931
1932   if (matchAddress(N, AM))
1933     return false;
1934
1935   MVT VT = N.getSimpleValueType();
1936   if (AM.BaseType == X86ISelAddressMode::RegBase) {
1937     if (!AM.Base_Reg.getNode())
1938       AM.Base_Reg = CurDAG->getRegister(0, VT);
1939   }
1940
1941   if (!AM.IndexReg.getNode())
1942     AM.IndexReg = CurDAG->getRegister(0, VT);
1943
1944   getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
1945   return true;
1946 }
1947
1948 // We can only fold a load if all nodes between it and the root node have a
1949 // single use. If there are additional uses, we could end up duplicating the
1950 // load.
1951 static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
1952   while (User != Root) {
1953     if (!User->hasOneUse())
1954       return false;
1955     User = *User->use_begin();
1956   }
1957
1958   return true;
1959 }
1960
1961 /// Match a scalar SSE load. In particular, we want to match a load whose top
1962 /// elements are either undef or zeros. The load flavor is derived from the
1963 /// type of N, which is either v4f32 or v2f64.
1964 ///
1965 /// We also return:
1966 ///   PatternChainNode: this is the matched node that has a chain input and
1967 ///   output.
1968 bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
1969                                           SDValue N, SDValue &Base,
1970                                           SDValue &Scale, SDValue &Index,
1971                                           SDValue &Disp, SDValue &Segment,
1972                                           SDValue &PatternNodeWithChain) {
1973   if (!hasSingleUsesFromRoot(Root, Parent))
1974     return false;
1975
1976   // We can allow a full vector load here since narrowing a load is ok.
1977   if (ISD::isNON_EXTLoad(N.getNode())) {
1978     PatternNodeWithChain = N;
1979     if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
1980         IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
1981       LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
1982       return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
1983                         Segment);
1984     }
1985   }
1986
1987   // We can also match the special zero extended load opcode.
1988   if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
1989     PatternNodeWithChain = N;
1990     if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
1991         IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
1992       auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
1993       return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
1994                         Segment);
1995     }
1996   }
1997
1998   // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
1999   // once. Otherwise the load might get duplicated and the chain output of the
2000   // duplicate load will not be observed by all dependencies.
2001   if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
2002     PatternNodeWithChain = N.getOperand(0);
2003     if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
2004         IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2005         IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
2006       LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
2007       return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2008                         Segment);
2009     }
2010   }
2011
2012   // Also handle the case where we explicitly require zeros in the top
2013   // elements.  This is a vector shuffle from the zero vector.
2014   if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
2015       // Check to see if the top elements are all zeros (or bitcast of zeros).
2016       N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
2017       N.getOperand(0).getNode()->hasOneUse()) {
2018     PatternNodeWithChain = N.getOperand(0).getOperand(0);
2019     if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
2020         IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2021         IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
2022       // Okay, this is a zero extending load.  Fold it.
2023       LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
2024       return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2025                         Segment);
2026     }
2027   }
2028
2029   return false;
2030 }
2031
2032
2033 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2034   if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
2035     uint64_t ImmVal = CN->getZExtValue();
2036     if (!isUInt<32>(ImmVal))
2037       return false;
2038
2039     Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
2040     return true;
2041   }
2042
2043   // In static codegen with small code model, we can get the address of a label
2044   // into a register with 'movl'
2045   if (N->getOpcode() != X86ISD::Wrapper)
2046     return false;
2047
2048   N = N.getOperand(0);
2049
2050   // At least GNU as does not accept 'movl' for TPOFF relocations.
2051   // FIXME: We could use 'movl' when we know we are targeting MC.
2052   if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2053     return false;
2054
2055   Imm = N;
2056   if (N->getOpcode() != ISD::TargetGlobalAddress)
2057     return TM.getCodeModel() == CodeModel::Small;
2058
2059   Optional<ConstantRange> CR =
2060       cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
2061   if (!CR)
2062     return TM.getCodeModel() == CodeModel::Small;
2063
2064   return CR->getUnsignedMax().ult(1ull << 32);
2065 }
2066
2067 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
2068                                          SDValue &Scale, SDValue &Index,
2069                                          SDValue &Disp, SDValue &Segment) {
2070   // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2071   SDLoc DL(N);
2072
2073   if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
2074     return false;
2075
2076   RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
2077   if (RN && RN->getReg() == 0)
2078     Base = CurDAG->getRegister(0, MVT::i64);
2079   else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(Base)) {
2080     // Base could already be %rip, particularly in the x32 ABI.
2081     Base = SDValue(CurDAG->getMachineNode(
2082                        TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
2083                        CurDAG->getTargetConstant(0, DL, MVT::i64),
2084                        Base,
2085                        CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)),
2086                    0);
2087   }
2088
2089   RN = dyn_cast<RegisterSDNode>(Index);
2090   if (RN && RN->getReg() == 0)
2091     Index = CurDAG->getRegister(0, MVT::i64);
2092   else {
2093     assert(Index.getValueType() == MVT::i32 &&
2094            "Expect to be extending 32-bit registers for use in LEA");
2095     Index = SDValue(CurDAG->getMachineNode(
2096                         TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
2097                         CurDAG->getTargetConstant(0, DL, MVT::i64),
2098                         Index,
2099                         CurDAG->getTargetConstant(X86::sub_32bit, DL,
2100                                                   MVT::i32)),
2101                     0);
2102   }
2103
2104   return true;
2105 }
2106
2107 /// Calls SelectAddr and determines if the maximal addressing
2108 /// mode it matches can be cost effectively emitted as an LEA instruction.
2109 bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
2110                                     SDValue &Base, SDValue &Scale,
2111                                     SDValue &Index, SDValue &Disp,
2112                                     SDValue &Segment) {
2113   X86ISelAddressMode AM;
2114
2115   // Save the DL and VT before calling matchAddress, it can invalidate N.
2116   SDLoc DL(N);
2117   MVT VT = N.getSimpleValueType();
2118
2119   // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
2120   // segments.
2121   SDValue Copy = AM.Segment;
2122   SDValue T = CurDAG->getRegister(0, MVT::i32);
2123   AM.Segment = T;
2124   if (matchAddress(N, AM))
2125     return false;
2126   assert (T == AM.Segment);
2127   AM.Segment = Copy;
2128
2129   unsigned Complexity = 0;
2130   if (AM.BaseType == X86ISelAddressMode::RegBase)
2131     if (AM.Base_Reg.getNode())
2132       Complexity = 1;
2133     else
2134       AM.Base_Reg = CurDAG->getRegister(0, VT);
2135   else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2136     Complexity = 4;
2137
2138   if (AM.IndexReg.getNode())
2139     Complexity++;
2140   else
2141     AM.IndexReg = CurDAG->getRegister(0, VT);
2142
2143   // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
2144   // a simple shift.
2145   if (AM.Scale > 1)
2146     Complexity++;
2147
2148   // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
2149   // to a LEA. This is determined with some experimentation but is by no means
2150   // optimal (especially for code size consideration). LEA is nice because of
2151   // its three-address nature. Tweak the cost function again when we can run
2152   // convertToThreeAddress() at register allocation time.
2153   if (AM.hasSymbolicDisplacement()) {
2154     // For X86-64, always use LEA to materialize RIP-relative addresses.
2155     if (Subtarget->is64Bit())
2156       Complexity = 4;
2157     else
2158       Complexity += 2;
2159   }
2160
2161   if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
2162     Complexity++;
2163
2164   // If it isn't worth using an LEA, reject it.
2165   if (Complexity <= 2)
2166     return false;
2167
2168   getAddressOperands(AM, DL, Base, Scale, Index, Disp, Segment);
2169   return true;
2170 }
2171
2172 /// This is only run on TargetGlobalTLSAddress nodes.
2173 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
2174                                         SDValue &Scale, SDValue &Index,
2175                                         SDValue &Disp, SDValue &Segment) {
2176   assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
2177   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
2178
2179   X86ISelAddressMode AM;
2180   AM.GV = GA->getGlobal();
2181   AM.Disp += GA->getOffset();
2182   AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
2183   AM.SymbolFlags = GA->getTargetFlags();
2184
2185   if (N.getValueType() == MVT::i32) {
2186     AM.Scale = 1;
2187     AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
2188   } else {
2189     AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
2190   }
2191
2192   getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
2193   return true;
2194 }
2195
2196 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
2197   if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
2198     Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
2199                                    N.getValueType());
2200     return true;
2201   }
2202
2203   // Keep track of the original value type and whether this value was
2204   // truncated. If we see a truncation from pointer type to VT that truncates
2205   // bits that are known to be zero, we can use a narrow reference.
2206   EVT VT = N.getValueType();
2207   bool WasTruncated = false;
2208   if (N.getOpcode() == ISD::TRUNCATE) {
2209     WasTruncated = true;
2210     N = N.getOperand(0);
2211   }
2212
2213   if (N.getOpcode() != X86ISD::Wrapper)
2214     return false;
2215
2216   // We can only use non-GlobalValues as immediates if they were not truncated,
2217   // as we do not have any range information. If we have a GlobalValue and the
2218   // address was not truncated, we can select it as an operand directly.
2219   unsigned Opc = N.getOperand(0)->getOpcode();
2220   if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
2221     Op = N.getOperand(0);
2222     // We can only select the operand directly if we didn't have to look past a
2223     // truncate.
2224     return !WasTruncated;
2225   }
2226
2227   // Check that the global's range fits into VT.
2228   auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
2229   Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2230   if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
2231     return false;
2232
2233   // Okay, we can use a narrow reference.
2234   Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
2235                                       GA->getOffset(), GA->getTargetFlags());
2236   return true;
2237 }
2238
2239 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
2240                                   SDValue &Base, SDValue &Scale,
2241                                   SDValue &Index, SDValue &Disp,
2242                                   SDValue &Segment) {
2243   if (!ISD::isNON_EXTLoad(N.getNode()) ||
2244       !IsProfitableToFold(N, P, Root) ||
2245       !IsLegalToFold(N, P, Root, OptLevel))
2246     return false;
2247
2248   return selectAddr(N.getNode(),
2249                     N.getOperand(1), Base, Scale, Index, Disp, Segment);
2250 }
2251
2252 /// Return an SDNode that returns the value of the global base register.
2253 /// Output instructions required to initialize the global base register,
2254 /// if necessary.
2255 SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
2256   unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
2257   auto &DL = MF->getDataLayout();
2258   return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
2259 }
2260
2261 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
2262   if (N->getOpcode() == ISD::TRUNCATE)
2263     N = N->getOperand(0).getNode();
2264   if (N->getOpcode() != X86ISD::Wrapper)
2265     return false;
2266
2267   auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
2268   if (!GA)
2269     return false;
2270
2271   Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2272   return CR && CR->getSignedMin().sge(-1ull << Width) &&
2273          CR->getSignedMax().slt(1ull << Width);
2274 }
2275
2276 static X86::CondCode getCondFromOpc(unsigned Opc) {
2277   X86::CondCode CC = X86::COND_INVALID;
2278   if (CC == X86::COND_INVALID)
2279     CC = X86::getCondFromBranchOpc(Opc);
2280   if (CC == X86::COND_INVALID)
2281     CC = X86::getCondFromSETOpc(Opc);
2282   if (CC == X86::COND_INVALID)
2283     CC = X86::getCondFromCMovOpc(Opc);
2284
2285   return CC;
2286 }
2287
2288 /// Test whether the given X86ISD::CMP node has any users that use a flag
2289 /// other than ZF.
2290 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
2291   // Examine each user of the node.
2292   for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2293          UI != UE; ++UI) {
2294     // Only check things that use the flags.
2295     if (UI.getUse().getResNo() != Flags.getResNo())
2296       continue;
2297     // Only examine CopyToReg uses that copy to EFLAGS.
2298     if (UI->getOpcode() != ISD::CopyToReg ||
2299         cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2300       return false;
2301     // Examine each user of the CopyToReg use.
2302     for (SDNode::use_iterator FlagUI = UI->use_begin(),
2303            FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2304       // Only examine the Flag result.
2305       if (FlagUI.getUse().getResNo() != 1) continue;
2306       // Anything unusual: assume conservatively.
2307       if (!FlagUI->isMachineOpcode()) return false;
2308       // Examine the condition code of the user.
2309       X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode());
2310
2311       switch (CC) {
2312       // Comparisons which only use the zero flag.
2313       case X86::COND_E: case X86::COND_NE:
2314         continue;
2315       // Anything else: assume conservatively.
2316       default:
2317         return false;
2318       }
2319     }
2320   }
2321   return true;
2322 }
2323
2324 /// Test whether the given X86ISD::CMP node has any uses which require the SF
2325 /// flag to be accurate.
2326 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
2327   // Examine each user of the node.
2328   for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2329          UI != UE; ++UI) {
2330     // Only check things that use the flags.
2331     if (UI.getUse().getResNo() != Flags.getResNo())
2332       continue;
2333     // Only examine CopyToReg uses that copy to EFLAGS.
2334     if (UI->getOpcode() != ISD::CopyToReg ||
2335         cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2336       return false;
2337     // Examine each user of the CopyToReg use.
2338     for (SDNode::use_iterator FlagUI = UI->use_begin(),
2339            FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2340       // Only examine the Flag result.
2341       if (FlagUI.getUse().getResNo() != 1) continue;
2342       // Anything unusual: assume conservatively.
2343       if (!FlagUI->isMachineOpcode()) return false;
2344       // Examine the condition code of the user.
2345       X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode());
2346
2347       switch (CC) {
2348       // Comparisons which don't examine the SF flag.
2349       case X86::COND_A: case X86::COND_AE:
2350       case X86::COND_B: case X86::COND_BE:
2351       case X86::COND_E: case X86::COND_NE:
2352       case X86::COND_O: case X86::COND_NO:
2353       case X86::COND_P: case X86::COND_NP:
2354         continue;
2355       // Anything else: assume conservatively.
2356       default:
2357         return false;
2358       }
2359     }
2360   }
2361   return true;
2362 }
2363
2364 static bool mayUseCarryFlag(X86::CondCode CC) {
2365   switch (CC) {
2366   // Comparisons which don't examine the CF flag.
2367   case X86::COND_O: case X86::COND_NO:
2368   case X86::COND_E: case X86::COND_NE:
2369   case X86::COND_S: case X86::COND_NS:
2370   case X86::COND_P: case X86::COND_NP:
2371   case X86::COND_L: case X86::COND_GE:
2372   case X86::COND_G: case X86::COND_LE:
2373     return false;
2374   // Anything else: assume conservatively.
2375   default:
2376     return true;
2377   }
2378 }
2379
2380 /// Test whether the given node which sets flags has any uses which require the
2381 /// CF flag to be accurate.
2382  bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
2383   // Examine each user of the node.
2384   for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2385          UI != UE; ++UI) {
2386     // Only check things that use the flags.
2387     if (UI.getUse().getResNo() != Flags.getResNo())
2388       continue;
2389
2390     unsigned UIOpc = UI->getOpcode();
2391
2392     if (UIOpc == ISD::CopyToReg) {
2393       // Only examine CopyToReg uses that copy to EFLAGS.
2394       if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2395         return false;
2396       // Examine each user of the CopyToReg use.
2397       for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
2398            FlagUI != FlagUE; ++FlagUI) {
2399         // Only examine the Flag result.
2400         if (FlagUI.getUse().getResNo() != 1)
2401           continue;
2402         // Anything unusual: assume conservatively.
2403         if (!FlagUI->isMachineOpcode())
2404           return false;
2405         // Examine the condition code of the user.
2406         X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode());
2407
2408         if (mayUseCarryFlag(CC))
2409           return false;
2410       }
2411
2412       // This CopyToReg is ok. Move on to the next user.
2413       continue;
2414     }
2415
2416     // This might be an unselected node. So look for the pre-isel opcodes that
2417     // use flags.
2418     unsigned CCOpNo;
2419     switch (UIOpc) {
2420     default:
2421       // Something unusual. Be conservative.
2422       return false;
2423     case X86ISD::SETCC:       CCOpNo = 0; break;
2424     case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
2425     case X86ISD::CMOV:        CCOpNo = 2; break;
2426     case X86ISD::BRCOND:      CCOpNo = 2; break;
2427     }
2428
2429     X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
2430     if (mayUseCarryFlag(CC))
2431       return false;
2432   }
2433   return true;
2434 }
2435
2436 /// Check whether or not the chain ending in StoreNode is suitable for doing
2437 /// the {load; op; store} to modify transformation.
2438 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
2439                                         SDValue StoredVal, SelectionDAG *CurDAG,
2440                                         unsigned LoadOpNo,
2441                                         LoadSDNode *&LoadNode,
2442                                         SDValue &InputChain) {
2443   // Is the stored value result 0 of the operation?
2444   if (StoredVal.getResNo() != 0) return false;
2445
2446   // Are there other uses of the operation other than the store?
2447   if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
2448
2449   // Is the store non-extending and non-indexed?
2450   if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
2451     return false;
2452
2453   SDValue Load = StoredVal->getOperand(LoadOpNo);
2454   // Is the stored value a non-extending and non-indexed load?
2455   if (!ISD::isNormalLoad(Load.getNode())) return false;
2456
2457   // Return LoadNode by reference.
2458   LoadNode = cast<LoadSDNode>(Load);
2459
2460   // Is store the only read of the loaded value?
2461   if (!Load.hasOneUse())
2462     return false;
2463
2464   // Is the address of the store the same as the load?
2465   if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
2466       LoadNode->getOffset() != StoreNode->getOffset())
2467     return false;
2468
2469   bool FoundLoad = false;
2470   SmallVector<SDValue, 4> ChainOps;
2471   SmallVector<const SDNode *, 4> LoopWorklist;
2472   SmallPtrSet<const SDNode *, 16> Visited;
2473   const unsigned int Max = 1024;
2474
2475   //  Visualization of Load-Op-Store fusion:
2476   // -------------------------
2477   // Legend:
2478   //    *-lines = Chain operand dependencies.
2479   //    |-lines = Normal operand dependencies.
2480   //    Dependencies flow down and right. n-suffix references multiple nodes.
2481   //
2482   //        C                        Xn  C
2483   //        *                         *  *
2484   //        *                          * *
2485   //  Xn  A-LD    Yn                    TF         Yn
2486   //   *    * \   |                       *        |
2487   //    *   *  \  |                        *       |
2488   //     *  *   \ |             =>       A--LD_OP_ST
2489   //      * *    \|                                 \
2490   //       TF    OP                                  \
2491   //         *   | \                                  Zn
2492   //          *  |  \
2493   //         A-ST    Zn
2494   //
2495
2496   // This merge induced dependences from: #1: Xn -> LD, OP, Zn
2497   //                                      #2: Yn -> LD
2498   //                                      #3: ST -> Zn
2499
2500   // Ensure the transform is safe by checking for the dual
2501   // dependencies to make sure we do not induce a loop.
2502
2503   // As LD is a predecessor to both OP and ST we can do this by checking:
2504   //  a). if LD is a predecessor to a member of Xn or Yn.
2505   //  b). if a Zn is a predecessor to ST.
2506
2507   // However, (b) can only occur through being a chain predecessor to
2508   // ST, which is the same as Zn being a member or predecessor of Xn,
2509   // which is a subset of LD being a predecessor of Xn. So it's
2510   // subsumed by check (a).
2511
2512   SDValue Chain = StoreNode->getChain();
2513
2514   // Gather X elements in ChainOps.
2515   if (Chain == Load.getValue(1)) {
2516     FoundLoad = true;
2517     ChainOps.push_back(Load.getOperand(0));
2518   } else if (Chain.getOpcode() == ISD::TokenFactor) {
2519     for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
2520       SDValue Op = Chain.getOperand(i);
2521       if (Op == Load.getValue(1)) {
2522         FoundLoad = true;
2523         // Drop Load, but keep its chain. No cycle check necessary.
2524         ChainOps.push_back(Load.getOperand(0));
2525         continue;
2526       }
2527       LoopWorklist.push_back(Op.getNode());
2528       ChainOps.push_back(Op);
2529     }
2530   }
2531
2532   if (!FoundLoad)
2533     return false;
2534
2535   // Worklist is currently Xn. Add Yn to worklist.
2536   for (SDValue Op : StoredVal->ops())
2537     if (Op.getNode() != LoadNode)
2538       LoopWorklist.push_back(Op.getNode());
2539
2540   // Check (a) if Load is a predecessor to Xn + Yn
2541   if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
2542                                    true))
2543     return false;
2544
2545   InputChain =
2546       CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
2547   return true;
2548 }
2549
2550 // Change a chain of {load; op; store} of the same value into a simple op
2551 // through memory of that value, if the uses of the modified value and its
2552 // address are suitable.
2553 //
2554 // The tablegen pattern memory operand pattern is currently not able to match
2555 // the case where the EFLAGS on the original operation are used.
2556 //
2557 // To move this to tablegen, we'll need to improve tablegen to allow flags to
2558 // be transferred from a node in the pattern to the result node, probably with
2559 // a new keyword. For example, we have this
2560 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2561 //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2562 //   (implicit EFLAGS)]>;
2563 // but maybe need something like this
2564 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2565 //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2566 //   (transferrable EFLAGS)]>;
2567 //
2568 // Until then, we manually fold these and instruction select the operation
2569 // here.
2570 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
2571   StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
2572   SDValue StoredVal = StoreNode->getOperand(1);
2573   unsigned Opc = StoredVal->getOpcode();
2574
2575   // Before we try to select anything, make sure this is memory operand size
2576   // and opcode we can handle. Note that this must match the code below that
2577   // actually lowers the opcodes.
2578   EVT MemVT = StoreNode->getMemoryVT();
2579   if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
2580       MemVT != MVT::i8)
2581     return false;
2582
2583   bool IsCommutable = false;
2584   switch (Opc) {
2585   default:
2586     return false;
2587   case X86ISD::SUB:
2588   case X86ISD::SBB:
2589     break;
2590   case X86ISD::ADD:
2591   case X86ISD::ADC:
2592   case X86ISD::AND:
2593   case X86ISD::OR:
2594   case X86ISD::XOR:
2595     IsCommutable = true;
2596     break;
2597   }
2598
2599   unsigned LoadOpNo = 0;
2600   LoadSDNode *LoadNode = nullptr;
2601   SDValue InputChain;
2602   if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2603                                    LoadNode, InputChain)) {
2604     if (!IsCommutable)
2605       return false;
2606
2607     // This operation is commutable, try the other operand.
2608     LoadOpNo = 1;
2609     if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2610                                      LoadNode, InputChain))
2611       return false;
2612   }
2613
2614   SDValue Base, Scale, Index, Disp, Segment;
2615   if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
2616                   Segment))
2617     return false;
2618
2619   auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
2620                           unsigned Opc8) {
2621     switch (MemVT.getSimpleVT().SimpleTy) {
2622     case MVT::i64:
2623       return Opc64;
2624     case MVT::i32:
2625       return Opc32;
2626     case MVT::i16:
2627       return Opc16;
2628     case MVT::i8:
2629       return Opc8;
2630     default:
2631       llvm_unreachable("Invalid size!");
2632     }
2633   };
2634
2635   MachineSDNode *Result;
2636   switch (Opc) {
2637   case X86ISD::ADD:
2638   case X86ISD::SUB:
2639     // Try to match inc/dec.
2640     if (!Subtarget->slowIncDec() ||
2641         CurDAG->getMachineFunction().getFunction().optForSize()) {
2642       bool IsOne = isOneConstant(StoredVal.getOperand(1));
2643       bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
2644       // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
2645       if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
2646         unsigned NewOpc =
2647           ((Opc == X86ISD::ADD) == IsOne)
2648               ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
2649               : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
2650         const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
2651         Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
2652                                         MVT::Other, Ops);
2653         break;
2654       }
2655     }
2656     LLVM_FALLTHROUGH;
2657   case X86ISD::ADC:
2658   case X86ISD::SBB:
2659   case X86ISD::AND:
2660   case X86ISD::OR:
2661   case X86ISD::XOR: {
2662     auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
2663       switch (Opc) {
2664       case X86ISD::ADD:
2665         return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
2666                             X86::ADD8mr);
2667       case X86ISD::ADC:
2668         return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
2669                             X86::ADC8mr);
2670       case X86ISD::SUB:
2671         return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
2672                             X86::SUB8mr);
2673       case X86ISD::SBB:
2674         return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
2675                             X86::SBB8mr);
2676       case X86ISD::AND:
2677         return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
2678                             X86::AND8mr);
2679       case X86ISD::OR:
2680         return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
2681       case X86ISD::XOR:
2682         return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
2683                             X86::XOR8mr);
2684       default:
2685         llvm_unreachable("Invalid opcode!");
2686       }
2687     };
2688     auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
2689       switch (Opc) {
2690       case X86ISD::ADD:
2691         return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
2692       case X86ISD::ADC:
2693         return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
2694       case X86ISD::SUB:
2695         return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
2696       case X86ISD::SBB:
2697         return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
2698       case X86ISD::AND:
2699         return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
2700       case X86ISD::OR:
2701         return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
2702       case X86ISD::XOR:
2703         return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
2704       default:
2705         llvm_unreachable("Invalid opcode!");
2706       }
2707     };
2708     auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
2709       switch (Opc) {
2710       case X86ISD::ADD:
2711         return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
2712                             X86::ADD8mi);
2713       case X86ISD::ADC:
2714         return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
2715                             X86::ADC8mi);
2716       case X86ISD::SUB:
2717         return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
2718                             X86::SUB8mi);
2719       case X86ISD::SBB:
2720         return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
2721                             X86::SBB8mi);
2722       case X86ISD::AND:
2723         return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
2724                             X86::AND8mi);
2725       case X86ISD::OR:
2726         return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
2727                             X86::OR8mi);
2728       case X86ISD::XOR:
2729         return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
2730                             X86::XOR8mi);
2731       default:
2732         llvm_unreachable("Invalid opcode!");
2733       }
2734     };
2735
2736     unsigned NewOpc = SelectRegOpcode(Opc);
2737     SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
2738
2739     // See if the operand is a constant that we can fold into an immediate
2740     // operand.
2741     if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
2742       auto OperandV = OperandC->getAPIntValue();
2743
2744       // Check if we can shrink the operand enough to fit in an immediate (or
2745       // fit into a smaller immediate) by negating it and switching the
2746       // operation.
2747       if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
2748           ((MemVT != MVT::i8 && OperandV.getMinSignedBits() > 8 &&
2749             (-OperandV).getMinSignedBits() <= 8) ||
2750            (MemVT == MVT::i64 && OperandV.getMinSignedBits() > 32 &&
2751             (-OperandV).getMinSignedBits() <= 32)) &&
2752           hasNoCarryFlagUses(StoredVal.getValue(1))) {
2753         OperandV = -OperandV;
2754         Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
2755       }
2756
2757       // First try to fit this into an Imm8 operand. If it doesn't fit, then try
2758       // the larger immediate operand.
2759       if (MemVT != MVT::i8 && OperandV.getMinSignedBits() <= 8) {
2760         Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
2761         NewOpc = SelectImm8Opcode(Opc);
2762       } else if (OperandV.getActiveBits() <= MemVT.getSizeInBits() &&
2763                  (MemVT != MVT::i64 || OperandV.getMinSignedBits() <= 32)) {
2764         Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
2765         NewOpc = SelectImmOpcode(Opc);
2766       }
2767     }
2768
2769     if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
2770       SDValue CopyTo =
2771           CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
2772                                StoredVal.getOperand(2), SDValue());
2773
2774       const SDValue Ops[] = {Base,    Scale,   Index,  Disp,
2775                              Segment, Operand, CopyTo, CopyTo.getValue(1)};
2776       Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
2777                                       Ops);
2778     } else {
2779       const SDValue Ops[] = {Base,    Scale,   Index,     Disp,
2780                              Segment, Operand, InputChain};
2781       Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
2782                                       Ops);
2783     }
2784     break;
2785   }
2786   default:
2787     llvm_unreachable("Invalid opcode!");
2788   }
2789
2790   MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
2791                                  LoadNode->getMemOperand()};
2792   CurDAG->setNodeMemRefs(Result, MemOps);
2793
2794   // Update Load Chain uses as well.
2795   ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
2796   ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
2797   ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
2798   CurDAG->RemoveDeadNode(Node);
2799   return true;
2800 }
2801
2802 // See if this is an  X & Mask  that we can match to BEXTR/BZHI.
2803 // Where Mask is one of the following patterns:
2804 //   a) x &  (1 << nbits) - 1
2805 //   b) x & ~(-1 << nbits)
2806 //   c) x &  (-1 >> (32 - y))
2807 //   d) x << (32 - y) >> (32 - y)
2808 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
2809   assert(
2810       (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
2811       "Should be either an and-mask, or right-shift after clearing high bits.");
2812
2813   // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
2814   if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
2815     return false;
2816
2817   MVT NVT = Node->getSimpleValueType(0);
2818
2819   // Only supported for 32 and 64 bits.
2820   if (NVT != MVT::i32 && NVT != MVT::i64)
2821     return false;
2822
2823   unsigned Size = NVT.getSizeInBits();
2824
2825   SDValue NBits;
2826
2827   // If we have BMI2's BZHI, we are ok with muti-use patterns.
2828   // Else, if we only have BMI1's BEXTR, we require one-use.
2829   const bool CanHaveExtraUses = Subtarget->hasBMI2();
2830   auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
2831     return CanHaveExtraUses ||
2832            Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
2833   };
2834   auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
2835   auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
2836
2837   // a) x & ((1 << nbits) + (-1))
2838   auto matchPatternA = [&checkOneUse, &NBits](SDValue Mask) -> bool {
2839     // Match `add`. Must only have one use!
2840     if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
2841       return false;
2842     // We should be adding all-ones constant (i.e. subtracting one.)
2843     if (!isAllOnesConstant(Mask->getOperand(1)))
2844       return false;
2845     // Match `1 << nbits`. Must only have one use!
2846     SDValue M0 = Mask->getOperand(0);
2847     if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
2848       return false;
2849     if (!isOneConstant(M0->getOperand(0)))
2850       return false;
2851     NBits = M0->getOperand(1);
2852     return true;
2853   };
2854
2855   // b) x & ~(-1 << nbits)
2856   auto matchPatternB = [&checkOneUse, &NBits](SDValue Mask) -> bool {
2857     // Match `~()`. Must only have one use!
2858     if (!isBitwiseNot(Mask) || !checkOneUse(Mask))
2859       return false;
2860     // Match `-1 << nbits`. Must only have one use!
2861     SDValue M0 = Mask->getOperand(0);
2862     if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
2863       return false;
2864     if (!isAllOnesConstant(M0->getOperand(0)))
2865       return false;
2866     NBits = M0->getOperand(1);
2867     return true;
2868   };
2869
2870   // Match potentially-truncated (bitwidth - y)
2871   auto matchShiftAmt = [checkOneUse, Size, &NBits](SDValue ShiftAmt) {
2872     // Skip over a truncate of the shift amount.
2873     if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
2874       ShiftAmt = ShiftAmt.getOperand(0);
2875       // The trunc should have been the only user of the real shift amount.
2876       if (!checkOneUse(ShiftAmt))
2877         return false;
2878     }
2879     // Match the shift amount as: (bitwidth - y). It should go away, too.
2880     if (ShiftAmt.getOpcode() != ISD::SUB)
2881       return false;
2882     auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
2883     if (!V0 || V0->getZExtValue() != Size)
2884       return false;
2885     NBits = ShiftAmt.getOperand(1);
2886     return true;
2887   };
2888
2889   // c) x &  (-1 >> (32 - y))
2890   auto matchPatternC = [&checkOneUse, matchShiftAmt](SDValue Mask) -> bool {
2891     // Match `l>>`. Must only have one use!
2892     if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
2893       return false;
2894     // We should be shifting all-ones constant.
2895     if (!isAllOnesConstant(Mask.getOperand(0)))
2896       return false;
2897     SDValue M1 = Mask.getOperand(1);
2898     // The shift amount should not be used externally.
2899     if (!checkOneUse(M1))
2900       return false;
2901     return matchShiftAmt(M1);
2902   };
2903
2904   SDValue X;
2905
2906   // d) x << (32 - y) >> (32 - y)
2907   auto matchPatternD = [&checkOneUse, &checkTwoUse, matchShiftAmt,
2908                         &X](SDNode *Node) -> bool {
2909     if (Node->getOpcode() != ISD::SRL)
2910       return false;
2911     SDValue N0 = Node->getOperand(0);
2912     if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
2913       return false;
2914     SDValue N1 = Node->getOperand(1);
2915     SDValue N01 = N0->getOperand(1);
2916     // Both of the shifts must be by the exact same value.
2917     // There should not be any uses of the shift amount outside of the pattern.
2918     if (N1 != N01 || !checkTwoUse(N1))
2919       return false;
2920     if (!matchShiftAmt(N1))
2921       return false;
2922     X = N0->getOperand(0);
2923     return true;
2924   };
2925
2926   auto matchLowBitMask = [&matchPatternA, &matchPatternB,
2927                           &matchPatternC](SDValue Mask) -> bool {
2928     // FIXME: pattern c.
2929     return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
2930   };
2931
2932   if (Node->getOpcode() == ISD::AND) {
2933     X = Node->getOperand(0);
2934     SDValue Mask = Node->getOperand(1);
2935
2936     if (matchLowBitMask(Mask)) {
2937       // Great.
2938     } else {
2939       std::swap(X, Mask);
2940       if (!matchLowBitMask(Mask))
2941         return false;
2942     }
2943   } else if (!matchPatternD(Node))
2944     return false;
2945
2946   SDLoc DL(Node);
2947
2948   // If we do *NOT* have BMI2, let's find out if the if the 'X' is *logically*
2949   // shifted (potentially with one-use trunc inbetween),
2950   // and if so look past one-use truncation.
2951   MVT XVT = NVT;
2952   if (!Subtarget->hasBMI2() && X.getOpcode() == ISD::TRUNCATE &&
2953       X.hasOneUse() && X.getOperand(0).getOpcode() == ISD::SRL) {
2954     assert(NVT == MVT::i32 && "Expected target valuetype to be i32");
2955     X = X.getOperand(0);
2956     XVT = X.getSimpleValueType();
2957     assert(XVT == MVT::i64 && "Expected truncation from i64");
2958   }
2959
2960   SDValue OrigNBits = NBits;
2961   // Truncate the shift amount.
2962   NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
2963   insertDAGNode(*CurDAG, OrigNBits, NBits);
2964
2965   // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
2966   // All the other bits are undefined, we do not care about them.
2967   SDValue ImplDef = SDValue(
2968       CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
2969   insertDAGNode(*CurDAG, OrigNBits, ImplDef);
2970   NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef,
2971                                         NBits);
2972   insertDAGNode(*CurDAG, OrigNBits, NBits);
2973
2974   if (Subtarget->hasBMI2()) {
2975     // Great, just emit the the BZHI..
2976     if (XVT != MVT::i32) {
2977       // But have to place the bit count into the wide-enough register first.
2978       SDValue ImplDef = SDValue(
2979           CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, XVT), 0);
2980       insertDAGNode(*CurDAG, OrigNBits, ImplDef);
2981       NBits = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, XVT, ImplDef,
2982                                             NBits);
2983       insertDAGNode(*CurDAG, OrigNBits, NBits);
2984     }
2985
2986     SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, XVT, X, NBits);
2987     ReplaceNode(Node, Extract.getNode());
2988     SelectCode(Extract.getNode());
2989     return true;
2990   }
2991
2992   // Else, emitting BEXTR requires one more step.
2993   // The 'control' of BEXTR has the pattern of:
2994   // [15...8 bit][ 7...0 bit] location
2995   // [ bit count][     shift] name
2996   // I.e. 0b000000011'00000001 means  (x >> 0b1) & 0b11
2997
2998   // Shift NBits left by 8 bits, thus producing 'control'.
2999   // This makes the low 8 bits to be zero.
3000   SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
3001   SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
3002   insertDAGNode(*CurDAG, OrigNBits, Control);
3003
3004   // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3005   if (X.getOpcode() == ISD::SRL) {
3006     SDValue ShiftAmt = X.getOperand(1);
3007     X = X.getOperand(0);
3008
3009     assert(ShiftAmt.getValueType() == MVT::i8 &&
3010            "Expected shift amount to be i8");
3011
3012     // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
3013     // We could zext to i16 in some form, but we intentionally don't do that.
3014     SDValue OrigShiftAmt = ShiftAmt;
3015     ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
3016     insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
3017
3018     // And now 'or' these low 8 bits of shift amount into the 'control'.
3019     Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
3020     insertDAGNode(*CurDAG, OrigNBits, Control);
3021   }
3022
3023   // But have to place the 'control' into the wide-enough register first.
3024   if (XVT != MVT::i32) {
3025     SDValue ImplDef =
3026         SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, XVT), 0);
3027     insertDAGNode(*CurDAG, OrigNBits, ImplDef);
3028     Control = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, XVT, ImplDef,
3029                                             Control);
3030     insertDAGNode(*CurDAG, OrigNBits, Control);
3031   }
3032
3033   // And finally, form the BEXTR itself.
3034   SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
3035
3036   // The 'X' was originally truncated. Do that now.
3037   if (XVT != NVT) {
3038     insertDAGNode(*CurDAG, OrigNBits, Extract);
3039     Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
3040   }
3041
3042   ReplaceNode(Node, Extract.getNode());
3043   SelectCode(Extract.getNode());
3044
3045   return true;
3046 }
3047
3048 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
3049 MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
3050   MVT NVT = Node->getSimpleValueType(0);
3051   SDLoc dl(Node);
3052
3053   SDValue N0 = Node->getOperand(0);
3054   SDValue N1 = Node->getOperand(1);
3055
3056   // If we have TBM we can use an immediate for the control. If we have BMI
3057   // we should only do this if the BEXTR instruction is implemented well.
3058   // Otherwise moving the control into a register makes this more costly.
3059   // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
3060   // hoisting the move immediate would make it worthwhile with a less optimal
3061   // BEXTR?
3062   if (!Subtarget->hasTBM() &&
3063       !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
3064     return nullptr;
3065
3066   // Must have a shift right.
3067   if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
3068     return nullptr;
3069
3070   // Shift can't have additional users.
3071   if (!N0->hasOneUse())
3072     return nullptr;
3073
3074   // Only supported for 32 and 64 bits.
3075   if (NVT != MVT::i32 && NVT != MVT::i64)
3076     return nullptr;
3077
3078   // Shift amount and RHS of and must be constant.
3079   ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
3080   ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
3081   if (!MaskCst || !ShiftCst)
3082     return nullptr;
3083
3084   // And RHS must be a mask.
3085   uint64_t Mask = MaskCst->getZExtValue();
3086   if (!isMask_64(Mask))
3087     return nullptr;
3088
3089   uint64_t Shift = ShiftCst->getZExtValue();
3090   uint64_t MaskSize = countPopulation(Mask);
3091
3092   // Don't interfere with something that can be handled by extracting AH.
3093   // TODO: If we are able to fold a load, BEXTR might still be better than AH.
3094   if (Shift == 8 && MaskSize == 8)
3095     return nullptr;
3096
3097   // Make sure we are only using bits that were in the original value, not
3098   // shifted in.
3099   if (Shift + MaskSize > NVT.getSizeInBits())
3100     return nullptr;
3101
3102   SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
3103   unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
3104   unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
3105
3106   // BMI requires the immediate to placed in a register.
3107   if (!Subtarget->hasTBM()) {
3108     ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
3109     MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
3110     unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3111     New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0);
3112   }
3113
3114   MachineSDNode *NewNode;
3115   SDValue Input = N0->getOperand(0);
3116   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3117   if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3118     SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
3119     SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
3120     NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3121     // Update the chain.
3122     ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
3123     // Record the mem-refs
3124     CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
3125   } else {
3126     NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
3127   }
3128
3129   return NewNode;
3130 }
3131
3132 // Emit a PCMISTR(I/M) instruction.
3133 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
3134                                              bool MayFoldLoad, const SDLoc &dl,
3135                                              MVT VT, SDNode *Node) {
3136   SDValue N0 = Node->getOperand(0);
3137   SDValue N1 = Node->getOperand(1);
3138   SDValue Imm = Node->getOperand(2);
3139   const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3140   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3141
3142   // Try to fold a load. No need to check alignment.
3143   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3144   if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3145     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3146                       N1.getOperand(0) };
3147     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
3148     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3149     // Update the chain.
3150     ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
3151     // Record the mem-refs
3152     CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
3153     return CNode;
3154   }
3155
3156   SDValue Ops[] = { N0, N1, Imm };
3157   SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
3158   MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3159   return CNode;
3160 }
3161
3162 // Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need
3163 // to emit a second instruction after this one. This is needed since we have two
3164 // copyToReg nodes glued before this and we need to continue that glue through.
3165 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
3166                                              bool MayFoldLoad, const SDLoc &dl,
3167                                              MVT VT, SDNode *Node,
3168                                              SDValue &InFlag) {
3169   SDValue N0 = Node->getOperand(0);
3170   SDValue N2 = Node->getOperand(2);
3171   SDValue Imm = Node->getOperand(4);
3172   const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3173   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3174
3175   // Try to fold a load. No need to check alignment.
3176   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3177   if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3178     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3179                       N2.getOperand(0), InFlag };
3180     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
3181     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3182     InFlag = SDValue(CNode, 3);
3183     // Update the chain.
3184     ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
3185     // Record the mem-refs
3186     CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
3187     return CNode;
3188   }
3189
3190   SDValue Ops[] = { N0, N2, Imm, InFlag };
3191   SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
3192   MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3193   InFlag = SDValue(CNode, 2);
3194   return CNode;
3195 }
3196
3197 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
3198   EVT VT = N->getValueType(0);
3199
3200   // Only handle scalar shifts.
3201   if (VT.isVector())
3202     return false;
3203
3204   // Narrower shifts only mask to 5 bits in hardware.
3205   unsigned Size = VT == MVT::i64 ? 64 : 32;
3206
3207   SDValue OrigShiftAmt = N->getOperand(1);
3208   SDValue ShiftAmt = OrigShiftAmt;
3209   SDLoc DL(N);
3210
3211   // Skip over a truncate of the shift amount.
3212   if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
3213     ShiftAmt = ShiftAmt->getOperand(0);
3214
3215   // This function is called after X86DAGToDAGISel::matchBitExtract(),
3216   // so we are not afraid that we might mess up BZHI/BEXTR pattern.
3217
3218   SDValue NewShiftAmt;
3219   if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
3220     SDValue Add0 = ShiftAmt->getOperand(0);
3221     SDValue Add1 = ShiftAmt->getOperand(1);
3222     // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
3223     // to avoid the ADD/SUB.
3224     if (isa<ConstantSDNode>(Add1) &&
3225         cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) {
3226       NewShiftAmt = Add0;
3227     // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
3228     // generate a NEG instead of a SUB of a constant.
3229     } else if (ShiftAmt->getOpcode() == ISD::SUB &&
3230                isa<ConstantSDNode>(Add0) &&
3231                cast<ConstantSDNode>(Add0)->getZExtValue() != 0 &&
3232                cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) {
3233       // Insert a negate op.
3234       // TODO: This isn't guaranteed to replace the sub if there is a logic cone
3235       // that uses it that's not a shift.
3236       EVT SubVT = ShiftAmt.getValueType();
3237       SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
3238       SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
3239       NewShiftAmt = Neg;
3240
3241       // Insert these operands into a valid topological order so they can
3242       // get selected independently.
3243       insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
3244       insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
3245     } else
3246       return false;
3247   } else
3248     return false;
3249
3250   if (NewShiftAmt.getValueType() != MVT::i8) {
3251     // Need to truncate the shift amount.
3252     NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
3253     // Add to a correct topological ordering.
3254     insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3255   }
3256
3257   // Insert a new mask to keep the shift amount legal. This should be removed
3258   // by isel patterns.
3259   NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
3260                                 CurDAG->getConstant(Size - 1, DL, MVT::i8));
3261   // Place in a correct topological ordering.
3262   insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3263
3264   SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
3265                                                    NewShiftAmt);
3266   if (UpdatedNode != N) {
3267     // If we found an existing node, we should replace ourselves with that node
3268     // and wait for it to be selected after its other users.
3269     ReplaceNode(N, UpdatedNode);
3270     return true;
3271   }
3272
3273   // If the original shift amount is now dead, delete it so that we don't run
3274   // it through isel.
3275   if (OrigShiftAmt.getNode()->use_empty())
3276     CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
3277
3278   // Now that we've optimized the shift amount, defer to normal isel to get
3279   // load folding and legacy vs BMI2 selection without repeating it here.
3280   SelectCode(N);
3281   return true;
3282 }
3283
3284 /// If the high bits of an 'and' operand are known zero, try setting the
3285 /// high bits of an 'and' constant operand to produce a smaller encoding by
3286 /// creating a small, sign-extended negative immediate rather than a large
3287 /// positive one. This reverses a transform in SimplifyDemandedBits that
3288 /// shrinks mask constants by clearing bits. There is also a possibility that
3289 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
3290 /// case, just replace the 'and'. Return 'true' if the node is replaced.
3291 bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
3292   // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
3293   // have immediate operands.
3294   MVT VT = And->getSimpleValueType(0);
3295   if (VT != MVT::i32 && VT != MVT::i64)
3296     return false;
3297
3298   auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
3299   if (!And1C)
3300     return false;
3301
3302   // Bail out if the mask constant is already negative. It's can't shrink more.
3303   // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
3304   // patterns to use a 32-bit and instead of a 64-bit and by relying on the
3305   // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
3306   // are negative too.
3307   APInt MaskVal = And1C->getAPIntValue();
3308   unsigned MaskLZ = MaskVal.countLeadingZeros();
3309   if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
3310     return false;
3311
3312   // Don't extend into the upper 32 bits of a 64 bit mask.
3313   if (VT == MVT::i64 && MaskLZ >= 32) {
3314     MaskLZ -= 32;
3315     MaskVal = MaskVal.trunc(32);
3316   }
3317
3318   SDValue And0 = And->getOperand(0);
3319   APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
3320   APInt NegMaskVal = MaskVal | HighZeros;
3321
3322   // If a negative constant would not allow a smaller encoding, there's no need
3323   // to continue. Only change the constant when we know it's a win.
3324   unsigned MinWidth = NegMaskVal.getMinSignedBits();
3325   if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
3326     return false;
3327
3328   // Extend masks if we truncated above.
3329   if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
3330     NegMaskVal = NegMaskVal.zext(64);
3331     HighZeros = HighZeros.zext(64);
3332   }
3333
3334   // The variable operand must be all zeros in the top bits to allow using the
3335   // new, negative constant as the mask.
3336   if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
3337     return false;
3338
3339   // Check if the mask is -1. In that case, this is an unnecessary instruction
3340   // that escaped earlier analysis.
3341   if (NegMaskVal.isAllOnesValue()) {
3342     ReplaceNode(And, And0.getNode());
3343     return true;
3344   }
3345
3346   // A negative mask allows a smaller encoding. Create a new 'and' node.
3347   SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
3348   SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
3349   ReplaceNode(And, NewAnd.getNode());
3350   SelectCode(NewAnd.getNode());
3351   return true;
3352 }
3353
3354 void X86DAGToDAGISel::Select(SDNode *Node) {
3355   MVT NVT = Node->getSimpleValueType(0);
3356   unsigned Opcode = Node->getOpcode();
3357   SDLoc dl(Node);
3358
3359   if (Node->isMachineOpcode()) {
3360     LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
3361     Node->setNodeId(-1);
3362     return;   // Already selected.
3363   }
3364
3365   switch (Opcode) {
3366   default: break;
3367   case ISD::BRIND: {
3368     if (Subtarget->isTargetNaCl())
3369       // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
3370       // leave the instruction alone.
3371       break;
3372     if (Subtarget->isTarget64BitILP32()) {
3373       // Converts a 32-bit register to a 64-bit, zero-extended version of
3374       // it. This is needed because x86-64 can do many things, but jmp %r32
3375       // ain't one of them.
3376       const SDValue &Target = Node->getOperand(1);
3377       assert(Target.getSimpleValueType() == llvm::MVT::i32);
3378       SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
3379       SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
3380                                       Node->getOperand(0), ZextTarget);
3381       ReplaceNode(Node, Brind.getNode());
3382       SelectCode(ZextTarget.getNode());
3383       SelectCode(Brind.getNode());
3384       return;
3385     }
3386     break;
3387   }
3388   case X86ISD::GlobalBaseReg:
3389     ReplaceNode(Node, getGlobalBaseReg());
3390     return;
3391
3392   case ISD::BITCAST:
3393     // Just drop all 128/256/512-bit bitcasts.
3394     if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
3395         NVT == MVT::f128) {
3396       ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
3397       CurDAG->RemoveDeadNode(Node);
3398       return;
3399     }
3400     break;
3401
3402   case ISD::VSELECT: {
3403     // Replace VSELECT with non-mask conditions with with BLENDV.
3404     if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
3405       break;
3406
3407     assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
3408     SDValue Blendv = CurDAG->getNode(
3409         X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
3410         Node->getOperand(1), Node->getOperand(2));
3411     ReplaceNode(Node, Blendv.getNode());
3412     SelectCode(Blendv.getNode());
3413     // We already called ReplaceUses.
3414     return;
3415   }
3416
3417   case ISD::SRL:
3418     if (matchBitExtract(Node))
3419       return;
3420     LLVM_FALLTHROUGH;
3421   case ISD::SRA:
3422   case ISD::SHL:
3423     if (tryShiftAmountMod(Node))
3424       return;
3425     break;
3426
3427   case ISD::AND:
3428     if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
3429       ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
3430       CurDAG->RemoveDeadNode(Node);
3431       return;
3432     }
3433     if (matchBitExtract(Node))
3434       return;
3435     if (AndImmShrink && shrinkAndImmediate(Node))
3436       return;
3437
3438     LLVM_FALLTHROUGH;
3439   case ISD::OR:
3440   case ISD::XOR: {
3441
3442     // For operations of the form (x << C1) op C2, check if we can use a smaller
3443     // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
3444     SDValue N0 = Node->getOperand(0);
3445     SDValue N1 = Node->getOperand(1);
3446
3447     if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
3448       break;
3449
3450     // i8 is unshrinkable, i16 should be promoted to i32.
3451     if (NVT != MVT::i32 && NVT != MVT::i64)
3452       break;
3453
3454     ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
3455     ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
3456     if (!Cst || !ShlCst)
3457       break;
3458
3459     int64_t Val = Cst->getSExtValue();
3460     uint64_t ShlVal = ShlCst->getZExtValue();
3461
3462     // Make sure that we don't change the operation by removing bits.
3463     // This only matters for OR and XOR, AND is unaffected.
3464     uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
3465     if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
3466       break;
3467
3468     unsigned ShlOp, AddOp, Op;
3469     MVT CstVT = NVT;
3470
3471     // Check the minimum bitwidth for the new constant.
3472     // TODO: AND32ri is the same as AND64ri32 with zext imm.
3473     // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
3474     // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
3475     if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
3476       CstVT = MVT::i8;
3477     else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
3478       CstVT = MVT::i32;
3479
3480     // Bail if there is no smaller encoding.
3481     if (NVT == CstVT)
3482       break;
3483
3484     switch (NVT.SimpleTy) {
3485     default: llvm_unreachable("Unsupported VT!");
3486     case MVT::i32:
3487       assert(CstVT == MVT::i8);
3488       ShlOp = X86::SHL32ri;
3489       AddOp = X86::ADD32rr;
3490
3491       switch (Opcode) {
3492       default: llvm_unreachable("Impossible opcode");
3493       case ISD::AND: Op = X86::AND32ri8; break;
3494       case ISD::OR:  Op =  X86::OR32ri8; break;
3495       case ISD::XOR: Op = X86::XOR32ri8; break;
3496       }
3497       break;
3498     case MVT::i64:
3499       assert(CstVT == MVT::i8 || CstVT == MVT::i32);
3500       ShlOp = X86::SHL64ri;
3501       AddOp = X86::ADD64rr;
3502
3503       switch (Opcode) {
3504       default: llvm_unreachable("Impossible opcode");
3505       case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
3506       case ISD::OR:  Op = CstVT==MVT::i8?  X86::OR64ri8 :  X86::OR64ri32; break;
3507       case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
3508       }
3509       break;
3510     }
3511
3512     // Emit the smaller op and the shift.
3513     SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT);
3514     SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
3515     if (ShlVal == 1)
3516       CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0),
3517                            SDValue(New, 0));
3518     else
3519       CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
3520                            getI8Imm(ShlVal, dl));
3521     return;
3522   }
3523   case X86ISD::SMUL:
3524     // i16/i32/i64 are handled with isel patterns.
3525     if (NVT != MVT::i8)
3526       break;
3527     LLVM_FALLTHROUGH;
3528   case X86ISD::UMUL: {
3529     SDValue N0 = Node->getOperand(0);
3530     SDValue N1 = Node->getOperand(1);
3531
3532     unsigned LoReg, ROpc, MOpc;
3533     switch (NVT.SimpleTy) {
3534     default: llvm_unreachable("Unsupported VT!");
3535     case MVT::i8:
3536       LoReg = X86::AL;
3537       ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
3538       MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
3539       break;
3540     case MVT::i16:
3541       LoReg = X86::AX;
3542       ROpc = X86::MUL16r;
3543       MOpc = X86::MUL16m;
3544       break;
3545     case MVT::i32:
3546       LoReg = X86::EAX;
3547       ROpc = X86::MUL32r;
3548       MOpc = X86::MUL32m;
3549       break;
3550     case MVT::i64:
3551       LoReg = X86::RAX;
3552       ROpc = X86::MUL64r;
3553       MOpc = X86::MUL64m;
3554       break;
3555     }
3556
3557     SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3558     bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
3559     // Multiply is commmutative.
3560     if (!FoldedLoad) {
3561       FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
3562       if (FoldedLoad)
3563         std::swap(N0, N1);
3564     }
3565
3566     SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
3567                                           N0, SDValue()).getValue(1);
3568
3569     MachineSDNode *CNode;
3570     if (FoldedLoad) {
3571       // i16/i32/i64 use an instruction that produces a low and high result even
3572       // though only the low result is used.
3573       SDVTList VTs;
3574       if (NVT == MVT::i8)
3575         VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
3576       else
3577         VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
3578
3579       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
3580                         InFlag };
3581       CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3582
3583       // Update the chain.
3584       ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
3585       // Record the mem-refs
3586       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
3587     } else {
3588       // i16/i32/i64 use an instruction that produces a low and high result even
3589       // though only the low result is used.
3590       SDVTList VTs;
3591       if (NVT == MVT::i8)
3592         VTs = CurDAG->getVTList(NVT, MVT::i32);
3593       else
3594         VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
3595
3596       CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
3597     }
3598
3599     ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
3600     ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
3601     CurDAG->RemoveDeadNode(Node);
3602     return;
3603   }
3604
3605   case ISD::SMUL_LOHI:
3606   case ISD::UMUL_LOHI: {
3607     SDValue N0 = Node->getOperand(0);
3608     SDValue N1 = Node->getOperand(1);
3609
3610     unsigned Opc, MOpc;
3611     bool isSigned = Opcode == ISD::SMUL_LOHI;
3612     if (!isSigned) {
3613       switch (NVT.SimpleTy) {
3614       default: llvm_unreachable("Unsupported VT!");
3615       case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
3616       case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
3617       }
3618     } else {
3619       switch (NVT.SimpleTy) {
3620       default: llvm_unreachable("Unsupported VT!");
3621       case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
3622       case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
3623       }
3624     }
3625
3626     unsigned SrcReg, LoReg, HiReg;
3627     switch (Opc) {
3628     default: llvm_unreachable("Unknown MUL opcode!");
3629     case X86::IMUL32r:
3630     case X86::MUL32r:
3631       SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
3632       break;
3633     case X86::IMUL64r:
3634     case X86::MUL64r:
3635       SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
3636       break;
3637     }
3638
3639     SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3640     bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
3641     // Multiply is commmutative.
3642     if (!foldedLoad) {
3643       foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
3644       if (foldedLoad)
3645         std::swap(N0, N1);
3646     }
3647
3648     SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
3649                                           N0, SDValue()).getValue(1);
3650     if (foldedLoad) {
3651       SDValue Chain;
3652       MachineSDNode *CNode = nullptr;
3653       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
3654                         InFlag };
3655       SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
3656       CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3657       Chain = SDValue(CNode, 0);
3658       InFlag = SDValue(CNode, 1);
3659
3660       // Update the chain.
3661       ReplaceUses(N1.getValue(1), Chain);
3662       // Record the mem-refs
3663       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
3664     } else {
3665       SDValue Ops[] = { N1, InFlag };
3666       SDVTList VTs = CurDAG->getVTList(MVT::Glue);
3667       SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
3668       InFlag = SDValue(CNode, 0);
3669     }
3670
3671     // Copy the low half of the result, if it is needed.
3672     if (!SDValue(Node, 0).use_empty()) {
3673       assert(LoReg && "Register for low half is not defined!");
3674       SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
3675                                              NVT, InFlag);
3676       InFlag = ResLo.getValue(2);
3677       ReplaceUses(SDValue(Node, 0), ResLo);
3678       LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
3679                  dbgs() << '\n');
3680     }
3681     // Copy the high half of the result, if it is needed.
3682     if (!SDValue(Node, 1).use_empty()) {
3683       assert(HiReg && "Register for high half is not defined!");
3684       SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
3685                                              NVT, InFlag);
3686       InFlag = ResHi.getValue(2);
3687       ReplaceUses(SDValue(Node, 1), ResHi);
3688       LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
3689                  dbgs() << '\n');
3690     }
3691
3692     CurDAG->RemoveDeadNode(Node);
3693     return;
3694   }
3695
3696   case ISD::SDIVREM:
3697   case ISD::UDIVREM: {
3698     SDValue N0 = Node->getOperand(0);
3699     SDValue N1 = Node->getOperand(1);
3700
3701     unsigned Opc, MOpc;
3702     bool isSigned = Opcode == ISD::SDIVREM;
3703     if (!isSigned) {
3704       switch (NVT.SimpleTy) {
3705       default: llvm_unreachable("Unsupported VT!");
3706       case MVT::i8:  Opc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
3707       case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
3708       case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
3709       case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
3710       }
3711     } else {
3712       switch (NVT.SimpleTy) {
3713       default: llvm_unreachable("Unsupported VT!");
3714       case MVT::i8:  Opc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
3715       case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
3716       case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
3717       case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
3718       }
3719     }
3720
3721     unsigned LoReg, HiReg, ClrReg;
3722     unsigned SExtOpcode;
3723     switch (NVT.SimpleTy) {
3724     default: llvm_unreachable("Unsupported VT!");
3725     case MVT::i8:
3726       LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
3727       SExtOpcode = X86::CBW;
3728       break;
3729     case MVT::i16:
3730       LoReg = X86::AX;  HiReg = X86::DX;
3731       ClrReg = X86::DX;
3732       SExtOpcode = X86::CWD;
3733       break;
3734     case MVT::i32:
3735       LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
3736       SExtOpcode = X86::CDQ;
3737       break;
3738     case MVT::i64:
3739       LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
3740       SExtOpcode = X86::CQO;
3741       break;
3742     }
3743
3744     SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3745     bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
3746     bool signBitIsZero = CurDAG->SignBitIsZero(N0);
3747
3748     SDValue InFlag;
3749     if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
3750       // Special case for div8, just use a move with zero extension to AX to
3751       // clear the upper 8 bits (AH).
3752       SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
3753       MachineSDNode *Move;
3754       if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3755         SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
3756         Move = CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
3757                                       MVT::Other, Ops);
3758         Chain = SDValue(Move, 1);
3759         ReplaceUses(N0.getValue(1), Chain);
3760         // Record the mem-refs
3761         CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
3762       } else {
3763         Move = CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0);
3764         Chain = CurDAG->getEntryNode();
3765       }
3766       Chain  = CurDAG->getCopyToReg(Chain, dl, X86::EAX, SDValue(Move, 0),
3767                                     SDValue());
3768       InFlag = Chain.getValue(1);
3769     } else {
3770       InFlag =
3771         CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
3772                              LoReg, N0, SDValue()).getValue(1);
3773       if (isSigned && !signBitIsZero) {
3774         // Sign extend the low part into the high part.
3775         InFlag =
3776           SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
3777       } else {
3778         // Zero out the high part, effectively zero extending the input.
3779         SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
3780         switch (NVT.SimpleTy) {
3781         case MVT::i16:
3782           ClrNode =
3783               SDValue(CurDAG->getMachineNode(
3784                           TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
3785                           CurDAG->getTargetConstant(X86::sub_16bit, dl,
3786                                                     MVT::i32)),
3787                       0);
3788           break;
3789         case MVT::i32:
3790           break;
3791         case MVT::i64:
3792           ClrNode =
3793               SDValue(CurDAG->getMachineNode(
3794                           TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
3795                           CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
3796                           CurDAG->getTargetConstant(X86::sub_32bit, dl,
3797                                                     MVT::i32)),
3798                       0);
3799           break;
3800         default:
3801           llvm_unreachable("Unexpected division source");
3802         }
3803
3804         InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
3805                                       ClrNode, InFlag).getValue(1);
3806       }
3807     }
3808
3809     if (foldedLoad) {
3810       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
3811                         InFlag };
3812       MachineSDNode *CNode =
3813         CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
3814       InFlag = SDValue(CNode, 1);
3815       // Update the chain.
3816       ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
3817       // Record the mem-refs
3818       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
3819     } else {
3820       InFlag =
3821         SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
3822     }
3823
3824     // Prevent use of AH in a REX instruction by explicitly copying it to
3825     // an ABCD_L register.
3826     //
3827     // The current assumption of the register allocator is that isel
3828     // won't generate explicit references to the GR8_ABCD_H registers. If
3829     // the allocator and/or the backend get enhanced to be more robust in
3830     // that regard, this can be, and should be, removed.
3831     if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
3832       SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
3833       unsigned AHExtOpcode =
3834           isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
3835
3836       SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
3837                                              MVT::Glue, AHCopy, InFlag);
3838       SDValue Result(RNode, 0);
3839       InFlag = SDValue(RNode, 1);
3840
3841       Result =
3842           CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
3843
3844       ReplaceUses(SDValue(Node, 1), Result);
3845       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
3846                  dbgs() << '\n');
3847     }
3848     // Copy the division (low) result, if it is needed.
3849     if (!SDValue(Node, 0).use_empty()) {
3850       SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
3851                                                 LoReg, NVT, InFlag);
3852       InFlag = Result.getValue(2);
3853       ReplaceUses(SDValue(Node, 0), Result);
3854       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
3855                  dbgs() << '\n');
3856     }
3857     // Copy the remainder (high) result, if it is needed.
3858     if (!SDValue(Node, 1).use_empty()) {
3859       SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
3860                                               HiReg, NVT, InFlag);
3861       InFlag = Result.getValue(2);
3862       ReplaceUses(SDValue(Node, 1), Result);
3863       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
3864                  dbgs() << '\n');
3865     }
3866     CurDAG->RemoveDeadNode(Node);
3867     return;
3868   }
3869
3870   case X86ISD::CMP: {
3871     SDValue N0 = Node->getOperand(0);
3872     SDValue N1 = Node->getOperand(1);
3873
3874     // Optimizations for TEST compares.
3875     if (!isNullConstant(N1))
3876       break;
3877
3878     // Save the original VT of the compare.
3879     MVT CmpVT = N0.getSimpleValueType();
3880
3881     // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed
3882     // by a test instruction. The test should be removed later by
3883     // analyzeCompare if we are using only the zero flag.
3884     // TODO: Should we check the users and use the BEXTR flags directly?
3885     if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
3886       if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
3887         unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
3888                                              : X86::TEST32rr;
3889         SDValue BEXTR = SDValue(NewNode, 0);
3890         NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
3891         ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
3892         CurDAG->RemoveDeadNode(Node);
3893         return;
3894       }
3895     }
3896
3897     // We can peek through truncates, but we need to be careful below.
3898     if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
3899       N0 = N0.getOperand(0);
3900
3901     // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
3902     // use a smaller encoding.
3903     // Look past the truncate if CMP is the only use of it.
3904     if (N0.getOpcode() == ISD::AND &&
3905         N0.getNode()->hasOneUse() &&
3906         N0.getValueType() != MVT::i8) {
3907       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
3908       if (!C) break;
3909       uint64_t Mask = C->getZExtValue();
3910
3911       // Check if we can replace AND+IMM64 with a shift. This is possible for
3912       // masks/ like 0xFF000000 or 0x00FFFFFF and if we care only about the zero
3913       // flag.
3914       if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
3915           onlyUsesZeroFlag(SDValue(Node, 0))) {
3916         if (isMask_64(~Mask)) {
3917           unsigned TrailingZeros = countTrailingZeros(Mask);
3918           SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
3919           SDValue Shift =
3920             SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64,
3921                                            N0.getOperand(0), Imm), 0);
3922           MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
3923                                                        MVT::i32, Shift, Shift);
3924           ReplaceNode(Node, Test);
3925           return;
3926         }
3927         if (isMask_64(Mask)) {
3928           unsigned LeadingZeros = countLeadingZeros(Mask);
3929           SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
3930           SDValue Shift =
3931             SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64,
3932                                            N0.getOperand(0), Imm), 0);
3933           MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
3934                                                        MVT::i32, Shift, Shift);
3935           ReplaceNode(Node, Test);
3936           return;
3937         }
3938       }
3939
3940       MVT VT;
3941       int SubRegOp;
3942       unsigned ROpc, MOpc;
3943
3944       // For each of these checks we need to be careful if the sign flag is
3945       // being used. It is only safe to use the sign flag in two conditions,
3946       // either the sign bit in the shrunken mask is zero or the final test
3947       // size is equal to the original compare size.
3948
3949       if (isUInt<8>(Mask) &&
3950           (!(Mask & 0x80) || CmpVT == MVT::i8 ||
3951            hasNoSignFlagUses(SDValue(Node, 0)))) {
3952         // For example, convert "testl %eax, $8" to "testb %al, $8"
3953         VT = MVT::i8;
3954         SubRegOp = X86::sub_8bit;
3955         ROpc = X86::TEST8ri;
3956         MOpc = X86::TEST8mi;
3957       } else if (OptForMinSize && isUInt<16>(Mask) &&
3958                  (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
3959                   hasNoSignFlagUses(SDValue(Node, 0)))) {
3960         // For example, "testl %eax, $32776" to "testw %ax, $32776".
3961         // NOTE: We only want to form TESTW instructions if optimizing for
3962         // min size. Otherwise we only save one byte and possibly get a length
3963         // changing prefix penalty in the decoders.
3964         VT = MVT::i16;
3965         SubRegOp = X86::sub_16bit;
3966         ROpc = X86::TEST16ri;
3967         MOpc = X86::TEST16mi;
3968       } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
3969                  ((!(Mask & 0x80000000) &&
3970                    // Without minsize 16-bit Cmps can get here so we need to
3971                    // be sure we calculate the correct sign flag if needed.
3972                    (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
3973                   CmpVT == MVT::i32 ||
3974                   hasNoSignFlagUses(SDValue(Node, 0)))) {
3975         // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
3976         // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
3977         // Otherwize, we find ourselves in a position where we have to do
3978         // promotion. If previous passes did not promote the and, we assume
3979         // they had a good reason not to and do not promote here.
3980         VT = MVT::i32;
3981         SubRegOp = X86::sub_32bit;
3982         ROpc = X86::TEST32ri;
3983         MOpc = X86::TEST32mi;
3984       } else {
3985         // No eligible transformation was found.
3986         break;
3987       }
3988
3989       // FIXME: We should be able to fold loads here.
3990
3991       SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
3992       SDValue Reg = N0.getOperand(0);
3993
3994       // Emit a testl or testw.
3995       MachineSDNode *NewNode;
3996       SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3997       if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3998         SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3999                           Reg.getOperand(0) };
4000         NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
4001         // Update the chain.
4002         ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
4003         // Record the mem-refs
4004         CurDAG->setNodeMemRefs(NewNode,
4005                                {cast<LoadSDNode>(Reg)->getMemOperand()});
4006       } else {
4007         // Extract the subregister if necessary.
4008         if (N0.getValueType() != VT)
4009           Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
4010
4011         NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
4012       }
4013       // Replace CMP with TEST.
4014       ReplaceNode(Node, NewNode);
4015       return;
4016     }
4017     break;
4018   }
4019   case X86ISD::PCMPISTR: {
4020     if (!Subtarget->hasSSE42())
4021       break;
4022
4023     bool NeedIndex = !SDValue(Node, 0).use_empty();
4024     bool NeedMask = !SDValue(Node, 1).use_empty();
4025     // We can't fold a load if we are going to make two instructions.
4026     bool MayFoldLoad = !NeedIndex || !NeedMask;
4027
4028     MachineSDNode *CNode;
4029     if (NeedMask) {
4030       unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
4031       unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
4032       CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
4033       ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
4034     }
4035     if (NeedIndex || !NeedMask) {
4036       unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
4037       unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
4038       CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
4039       ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4040     }
4041
4042     // Connect the flag usage to the last instruction created.
4043     ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
4044     CurDAG->RemoveDeadNode(Node);
4045     return;
4046   }
4047   case X86ISD::PCMPESTR: {
4048     if (!Subtarget->hasSSE42())
4049       break;
4050
4051     // Copy the two implicit register inputs.
4052     SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
4053                                           Node->getOperand(1),
4054                                           SDValue()).getValue(1);
4055     InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
4056                                   Node->getOperand(3), InFlag).getValue(1);
4057
4058     bool NeedIndex = !SDValue(Node, 0).use_empty();
4059     bool NeedMask = !SDValue(Node, 1).use_empty();
4060     // We can't fold a load if we are going to make two instructions.
4061     bool MayFoldLoad = !NeedIndex || !NeedMask;
4062
4063     MachineSDNode *CNode;
4064     if (NeedMask) {
4065       unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
4066       unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
4067       CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
4068                            InFlag);
4069       ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
4070     }
4071     if (NeedIndex || !NeedMask) {
4072       unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
4073       unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
4074       CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
4075       ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4076     }
4077     // Connect the flag usage to the last instruction created.
4078     ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
4079     CurDAG->RemoveDeadNode(Node);
4080     return;
4081   }
4082
4083   case ISD::STORE:
4084     if (foldLoadStoreIntoMemOperand(Node))
4085       return;
4086     break;
4087   }
4088
4089   SelectCode(Node);
4090 }
4091
4092 bool X86DAGToDAGISel::
4093 SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
4094                              std::vector<SDValue> &OutOps) {
4095   SDValue Op0, Op1, Op2, Op3, Op4;
4096   switch (ConstraintID) {
4097   default:
4098     llvm_unreachable("Unexpected asm memory constraint");
4099   case InlineAsm::Constraint_i:
4100     // FIXME: It seems strange that 'i' is needed here since it's supposed to
4101     //        be an immediate and not a memory constraint.
4102     LLVM_FALLTHROUGH;
4103   case InlineAsm::Constraint_o: // offsetable        ??
4104   case InlineAsm::Constraint_v: // not offsetable    ??
4105   case InlineAsm::Constraint_m: // memory
4106   case InlineAsm::Constraint_X:
4107     if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
4108       return true;
4109     break;
4110   }
4111
4112   OutOps.push_back(Op0);
4113   OutOps.push_back(Op1);
4114   OutOps.push_back(Op2);
4115   OutOps.push_back(Op3);
4116   OutOps.push_back(Op4);
4117   return false;
4118 }
4119
4120 /// This pass converts a legalized DAG into a X86-specific DAG,
4121 /// ready for instruction scheduling.
4122 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
4123                                      CodeGenOpt::Level OptLevel) {
4124   return new X86DAGToDAGISel(TM, OptLevel);
4125 }